-
Notifications
You must be signed in to change notification settings - Fork 148
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CB improvements #769
base: master
Are you sure you want to change the base?
CB improvements #769
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -38,6 +38,7 @@ enum class GenerationFinishReason { | |||||
LENGTH = 2 // Generation finished by reaching max_new_tokens limit | ||||||
}; | ||||||
|
||||||
// Output of generate() method, which represents full information about request with a given request_id | ||||||
struct GenerationResult { | ||||||
// request ID - obsolete when handle API is approved as handle will connect results with prompts. | ||||||
uint64_t m_request_id; | ||||||
|
@@ -52,37 +53,53 @@ struct GenerationResult { | |||||
GenerationStatus m_status = GenerationStatus::RUNNING; | ||||||
}; | ||||||
|
||||||
// Represents already generated tokens of running generate() method. | ||||||
// E.g. typically generate() method consists of multiple step() which generate | ||||||
// token by token. This structure represents a vector of already generated tokens so far | ||||||
// for a given prompt. | ||||||
struct GenerationOutput { | ||||||
// Currently generated list of tokens | ||||||
std::vector<int64_t> generated_token_ids; | ||||||
// Score | ||||||
// For beam search case: beam score | ||||||
// For other sampling types: cumulative log probability of output tokens | ||||||
float score; | ||||||
// Finish reason if generation has finished, NONE otherwise | ||||||
GenerationFinishReason finish_reason; | ||||||
}; | ||||||
|
||||||
// Current outputs of step() method for all scheduled requests | ||||||
using GenerationOutputs = std::unordered_map<uint64_t, GenerationOutput>; | ||||||
|
||||||
class GenerationStream; | ||||||
|
||||||
class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { | ||||||
class OPENVINO_GENAI_EXPORTS GenerationHandle { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need this change? |
||||||
std::shared_ptr<GenerationStream> m_generation_stream; | ||||||
ov::genai::GenerationConfig m_sampling_params; | ||||||
|
||||||
bool is_dropped(); | ||||||
// whether client has dropped session with pipeline | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
bool is_dropped() const; | ||||||
|
||||||
public: | ||||||
GenerationHandleImpl(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) : | ||||||
m_generation_stream(std::move(generation_stream)), | ||||||
m_sampling_params(sampling_params) {}; | ||||||
using Ptr = std::shared_ptr<GenerationHandle>; | ||||||
|
||||||
~GenerationHandleImpl(); | ||||||
GenerationHandle(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) : | ||||||
m_generation_stream(std::move(generation_stream)), | ||||||
m_sampling_params(sampling_params) { | ||||||
} | ||||||
|
||||||
~GenerationHandle(); | ||||||
|
||||||
// There can be only one handle for a request | ||||||
GenerationHandleImpl(const GenerationHandleImpl&) = delete; | ||||||
GenerationHandleImpl& operator=(const GenerationHandleImpl&) = delete; | ||||||
GenerationHandle(const GenerationHandle&) = delete; | ||||||
GenerationHandle& operator=(const GenerationHandle&) = delete; | ||||||
|
||||||
GenerationStatus get_status(); | ||||||
GenerationStatus get_status() const; | ||||||
|
||||||
bool can_read(); | ||||||
// whether new tokens are available | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
bool can_read() const; | ||||||
|
||||||
// client drops generation session on server | ||||||
void drop(); | ||||||
|
||||||
GenerationOutputs back(); | ||||||
|
@@ -92,5 +109,4 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { | |||||
std::vector<GenerationOutput> read_all(); | ||||||
}; | ||||||
|
||||||
using GenerationHandle = std::shared_ptr<GenerationHandleImpl>; | ||||||
} |
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -9,34 +9,40 @@ namespace ov::genai { | |||||||||||||||||||||||||
struct SchedulerConfig { | ||||||||||||||||||||||||||
// a maximum number of tokens to batch | ||||||||||||||||||||||||||
// (in contrast to max_batch_size which combines independent sequences, we consider total amount of tokens in a batch) | ||||||||||||||||||||||||||
// TODO: benchmark this value and understand a required value to ensure inference is not memory bound | ||||||||||||||||||||||||||
std::size_t max_num_batched_tokens = 256; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// total number of KV blocks available to scheduler logic | ||||||||||||||||||||||||||
// Note, if it's set to 0, then `cache_size` must be specified | ||||||||||||||||||||||||||
std::size_t num_kv_blocks = 0; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// total size of KV cache in GB | ||||||||||||||||||||||||||
// Note, if it's set to 0, then `num_kv_blocks` must be specified | ||||||||||||||||||||||||||
std::size_t cache_size = 1; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// block size for KV cache | ||||||||||||||||||||||||||
std::size_t block_size = 32; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// whether to split prompt / generate to different scheduling phases | ||||||||||||||||||||||||||
// - Dynamic split fuse schedules requests in generation phase first, then | ||||||||||||||||||||||||||
// schedules requests in prompt phase. If a request cannot be fully fit into | ||||||||||||||||||||||||||
// remaining space of 'max_num_batched_tokens' group, it's scheduled only partially | ||||||||||||||||||||||||||
// and remaining tokens can be scheduled only in subsequent iterations | ||||||||||||||||||||||||||
// - vLLM mode prioritizes requests in prompt phase over requests in generation phase | ||||||||||||||||||||||||||
Comment on lines
25
to
+30
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||||
bool dynamic_split_fuse = true; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// Enable caching of KV-blocks. | ||||||||||||||||||||||||||
// When turned on all previously calculated KV-caches are kept in memory for future usages. | ||||||||||||||||||||||||||
// KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. | ||||||||||||||||||||||||||
// This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. | ||||||||||||||||||||||||||
// When turned off, only KV-cache required for batch calculation is kept in memory and | ||||||||||||||||||||||||||
// when a sequence has finished generation, its KV cache blocks are released. | ||||||||||||||||||||||||||
bool enable_prefix_caching = false; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// | ||||||||||||||||||||||||||
// vLLM-like settings | ||||||||||||||||||||||||||
// | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// max number of scheduled sequences (you can think of it as "max batch size") | ||||||||||||||||||||||||||
std::size_t max_num_seqs = 256; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// Enable caching of KV-blocks. | ||||||||||||||||||||||||||
// When turned on all previously calculated KV-caches are kept in memory for future usages. | ||||||||||||||||||||||||||
// KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. | ||||||||||||||||||||||||||
// This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. | ||||||||||||||||||||||||||
// When turned off, only KV-cache required for batch calculation is kept in memory and | ||||||||||||||||||||||||||
// when a sequence has finished generation, its cache is released. | ||||||||||||||||||||||||||
bool enable_prefix_caching = false; | ||||||||||||||||||||||||||
}; | ||||||||||||||||||||||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's not exactly true. I would rephrase that to: "... a vector of tokens generated since the last read...".
To be more specific, it's always a vector of one element if N == 1 and we are not using beam search.
When N > 1 and/or beam search is used, the vector contains all generated tokens.