-
Notifications
You must be signed in to change notification settings - Fork 148
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CB improvements #769
base: master
Are you sure you want to change the base?
CB improvements #769
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -38,6 +38,7 @@ enum class GenerationFinishReason { | |||||
LENGTH = 2 // Generation finished by reaching max_new_tokens limit | ||||||
}; | ||||||
|
||||||
// Output of generate() method, which represents full information about request with a given request_id | ||||||
struct GenerationResult { | ||||||
// request ID - obsolete when handle API is approved as handle will connect results with prompts. | ||||||
uint64_t m_request_id; | ||||||
|
@@ -52,37 +53,53 @@ struct GenerationResult { | |||||
GenerationStatus m_status = GenerationStatus::RUNNING; | ||||||
}; | ||||||
|
||||||
// Represents already generated tokens of running generate() method. | ||||||
// E.g. typically generate() method consists of multiple step() which generate | ||||||
// token by token. This structure represents a vector of already generated tokens so far | ||||||
// for a given prompt. | ||||||
struct GenerationOutput { | ||||||
// Currently generated list of tokens | ||||||
std::vector<int64_t> generated_token_ids; | ||||||
// Score | ||||||
// For beam search case: beam score | ||||||
// For other sampling types: cumulative log probability of output tokens | ||||||
float score; | ||||||
// Finish reason if generation has finished, NONE otherwise | ||||||
GenerationFinishReason finish_reason; | ||||||
}; | ||||||
|
||||||
// Current outputs of step() method for all scheduled requests | ||||||
using GenerationOutputs = std::unordered_map<uint64_t, GenerationOutput>; | ||||||
|
||||||
class GenerationStream; | ||||||
|
||||||
class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { | ||||||
class OPENVINO_GENAI_EXPORTS GenerationHandle { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need this change? |
||||||
std::shared_ptr<GenerationStream> m_generation_stream; | ||||||
ov::genai::GenerationConfig m_sampling_params; | ||||||
|
||||||
bool is_dropped(); | ||||||
// whether client has dropped session with pipeline | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
bool is_dropped() const; | ||||||
|
||||||
public: | ||||||
GenerationHandleImpl(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) : | ||||||
m_generation_stream(std::move(generation_stream)), | ||||||
m_sampling_params(sampling_params) {}; | ||||||
using Ptr = std::shared_ptr<GenerationHandle>; | ||||||
|
||||||
~GenerationHandleImpl(); | ||||||
GenerationHandle(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) : | ||||||
m_generation_stream(std::move(generation_stream)), | ||||||
m_sampling_params(sampling_params) { | ||||||
} | ||||||
|
||||||
~GenerationHandle(); | ||||||
|
||||||
// There can be only one handle for a request | ||||||
GenerationHandleImpl(const GenerationHandleImpl&) = delete; | ||||||
GenerationHandleImpl& operator=(const GenerationHandleImpl&) = delete; | ||||||
GenerationHandle(const GenerationHandle&) = delete; | ||||||
GenerationHandle& operator=(const GenerationHandle&) = delete; | ||||||
|
||||||
GenerationStatus get_status(); | ||||||
GenerationStatus get_status() const; | ||||||
|
||||||
bool can_read(); | ||||||
// whether new tokens are available | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
bool can_read() const; | ||||||
|
||||||
// client drops generation session on server | ||||||
void drop(); | ||||||
|
||||||
GenerationOutputs back(); | ||||||
|
@@ -92,5 +109,4 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { | |||||
std::vector<GenerationOutput> read_all(); | ||||||
}; | ||||||
|
||||||
using GenerationHandle = std::shared_ptr<GenerationHandleImpl>; | ||||||
} |
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -9,34 +9,40 @@ namespace ov::genai { | |||||||||||||||||||||||||
struct SchedulerConfig { | ||||||||||||||||||||||||||
// a maximum number of tokens to batch | ||||||||||||||||||||||||||
// (in contrast to max_batch_size which combines independent sequences, we consider total amount of tokens in a batch) | ||||||||||||||||||||||||||
// TODO: benchmark this value and understand a required value to ensure inference is not memory bound | ||||||||||||||||||||||||||
std::size_t max_num_batched_tokens = 256; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// total number of KV blocks available to scheduler logic | ||||||||||||||||||||||||||
// Note, if it's set to 0, then `cache_size` must be specified | ||||||||||||||||||||||||||
std::size_t num_kv_blocks = 0; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// total size of KV cache in GB | ||||||||||||||||||||||||||
// Note, if it's set to 0, then `num_kv_blocks` must be specified | ||||||||||||||||||||||||||
std::size_t cache_size = 1; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// block size for KV cache | ||||||||||||||||||||||||||
std::size_t block_size = 32; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// whether to split prompt / generate to different scheduling phases | ||||||||||||||||||||||||||
// - Dynamic split fuse schedules requests in generation phase first, then | ||||||||||||||||||||||||||
// schedules requests in prompt phase. If a request cannot be fully fit into | ||||||||||||||||||||||||||
// remaining space of 'max_num_batched_tokens' group, it's scheduled only partially | ||||||||||||||||||||||||||
// and remaining tokens can be scheduled only in subsequent iterations | ||||||||||||||||||||||||||
// - vLLM mode prioritizes requests in prompt phase over requests in generation phase | ||||||||||||||||||||||||||
Comment on lines
25
to
+30
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||||
bool dynamic_split_fuse = true; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// Enable caching of KV-blocks. | ||||||||||||||||||||||||||
// When turned on all previously calculated KV-caches are kept in memory for future usages. | ||||||||||||||||||||||||||
// KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. | ||||||||||||||||||||||||||
// This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. | ||||||||||||||||||||||||||
// When turned off, only KV-cache required for batch calculation is kept in memory and | ||||||||||||||||||||||||||
// when a sequence has finished generation, its KV cache blocks are released. | ||||||||||||||||||||||||||
bool enable_prefix_caching = false; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// | ||||||||||||||||||||||||||
// vLLM-like settings | ||||||||||||||||||||||||||
// | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// max number of scheduled sequences (you can think of it as "max batch size") | ||||||||||||||||||||||||||
std::size_t max_num_seqs = 256; | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
// Enable caching of KV-blocks. | ||||||||||||||||||||||||||
// When turned on all previously calculated KV-caches are kept in memory for future usages. | ||||||||||||||||||||||||||
// KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. | ||||||||||||||||||||||||||
// This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. | ||||||||||||||||||||||||||
// When turned off, only KV-cache required for batch calculation is kept in memory and | ||||||||||||||||||||||||||
// when a sequence has finished generation, its cache is released. | ||||||||||||||||||||||||||
bool enable_prefix_caching = false; | ||||||||||||||||||||||||||
}; | ||||||||||||||||||||||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's not exactly true. I would rephrase that to: "... a vector of tokens generated since the last read...".
To be more specific, it's always a vector of one element if N == 1 and we are not using beam search.
When N > 1 and/or beam search is used, the vector contains all generated tokens.