CB improvements #769

Open: wants to merge 1 commit into master
@@ -181,14 +181,14 @@ class GenerationInfo {
size_t num_input_tokens;
};

ov::genai::GenerationHandle generation_handle;
ov::genai::GenerationHandle::Ptr generation_handle;
std::chrono::steady_clock::time_point start_time;
std::unordered_map<int64_t, SequenceInfo> sequences_info;
bool active = true;
size_t input_len;

public:
GenerationInfo(ov::genai::GenerationHandle generation_handle, size_t input_len) : input_len(input_len)
GenerationInfo(ov::genai::GenerationHandle::Ptr generation_handle, size_t input_len) : input_len(input_len)
{
this->generation_handle = std::move(generation_handle);
start_time = std::chrono::steady_clock::now();
@@ -253,7 +253,7 @@ class GenerationInfoCollector {
}

void add_generation(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, size_t request_id) {
ov::genai::GenerationHandle generation_handle = pipe->add_request(request_id, dataset->m_prompts[request_id], dataset->m_sampling_params[request_id]);
ov::genai::GenerationHandle::Ptr generation_handle = pipe->add_request(request_id, dataset->m_prompts[request_id], dataset->m_sampling_params[request_id]);
std::lock_guard<std::mutex> lock(mutex);
generations_info.emplace_back(std::move(generation_handle), dataset->m_input_lens[request_id]);
}
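
For orientation, a minimal caller-side sketch of submitting requests with the new GenerationHandle::Ptr return type. This sketch is not part of the diff: the model path, generation settings, pipeline constructor arguments, and has_non_finished_requests() are assumptions, while add_request() and step() are the signatures shown in this PR.

#include <openvino/genai/continuous_batching_pipeline.hpp>

#include <string>
#include <vector>

int main() {
    // Scheduler defaults come from scheduler_config.hpp (last file in this diff).
    ov::genai::SchedulerConfig scheduler_config;

    // Constructor arguments are an assumption: models path plus scheduler config.
    ov::genai::ContinuousBatchingPipeline pipe("./model_dir", scheduler_config);

    ov::genai::GenerationConfig sampling_params;
    sampling_params.max_new_tokens = 32;  // illustrative value

    // add_request() now returns a shared handle (GenerationHandle::Ptr)
    // instead of the old GenerationHandle alias.
    std::vector<ov::genai::GenerationHandle::Ptr> handles;
    handles.push_back(pipe.add_request(0, std::string("What is OpenVINO?"), sampling_params));
    handles.push_back(pipe.add_request(1, std::string("Explain continuous batching."), sampling_params));

    // Drive generation; has_non_finished_requests() is assumed to exist in the pipeline API.
    while (pipe.has_non_finished_requests()) {
        pipe.step();
    }
    return 0;
}
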
@@ -58,8 +58,8 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {

PipelineMetrics get_metrics() const;

GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params);
GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params);
GenerationHandle::Ptr add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params);
GenerationHandle::Ptr add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params);

void step();

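A hedged illustration of the tensor-based overload above, building an input_ids tensor and passing it directly to add_request(); the helper name and token id values are made up for this example.

#include <openvino/genai/continuous_batching_pipeline.hpp>

#include <cstdint>

// Hypothetical use of the ov::Tensor overload; token ids are placeholders.
ov::genai::GenerationHandle::Ptr add_pretokenized_request(
        ov::genai::ContinuousBatchingPipeline& pipe,
        uint64_t request_id,
        const ov::genai::GenerationConfig& sampling_params) {
    // 1 x 4 tensor of token ids (i64), e.g. produced by an external tokenizer.
    ov::Tensor input_ids(ov::element::i64, ov::Shape{1, 4});
    int64_t* data = input_ids.data<int64_t>();
    data[0] = 1; data[1] = 15043; data[2] = 29892; data[3] = 3186;  // made-up ids
    return pipe.add_request(request_id, input_ids, sampling_params);
}
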
38 changes: 27 additions & 11 deletions src/cpp/include/openvino/genai/generation_handle.hpp
@@ -38,6 +38,7 @@ enum class GenerationFinishReason {
LENGTH = 2 // Generation finished by reaching max_new_tokens limit
};

// Output of generate() method, which represents full information about request with a given request_id
struct GenerationResult {
// request ID - obsolete when handle API is approved as handle will connect results with prompts.
uint64_t m_request_id;
@@ -52,37 +52,53 @@ struct GenerationResult {
GenerationStatus m_status = GenerationStatus::RUNNING;
};

// Represents already generated tokens of running generate() method.
// E.g. typically generate() method consists of multiple step() which generate
// token by token. This structure represents a vector of already generated tokens so far
// for a given prompt.
Comment on lines +58 to +59 (Collaborator):
That's not exactly true. I would rephrase that to: "... a vector of tokens generated since the last read ...".
To be more specific, it's always a vector of one element if N == 1 and we are not using beam search.
For N > 1 and/or when beam search is used, the vector contains all generated tokens.

struct GenerationOutput {
// Currently generated list of tokens
std::vector<int64_t> generated_token_ids;
// Score
// For beam search case: beam score
// For other sampling types: cumulative log probability of output tokens
float score;
// Finish reason if generation has finished, NONE otherwise
GenerationFinishReason finish_reason;
};

// Current outputs of step() method for all scheduled requests
using GenerationOutputs = std::unordered_map<uint64_t, GenerationOutput>;

class GenerationStream;

class OPENVINO_GENAI_EXPORTS GenerationHandleImpl {
class OPENVINO_GENAI_EXPORTS GenerationHandle {
Collaborator comment:
Why do we need this change?

std::shared_ptr<GenerationStream> m_generation_stream;
ov::genai::GenerationConfig m_sampling_params;

bool is_dropped();
// whether client ha dropped session with pipeline
Collaborator suggested change:
- // whether client ha dropped session with pipeline
+ // whether client has dropped session with pipeline

bool is_dropped() const;

public:
GenerationHandleImpl(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) :
m_generation_stream(std::move(generation_stream)),
m_sampling_params(sampling_params) {};
using Ptr = std::shared_ptr<GenerationHandle>;

~GenerationHandleImpl();
GenerationHandle(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) :
m_generation_stream(std::move(generation_stream)),
m_sampling_params(sampling_params) {
}

~GenerationHandle();

// There can be only one handle for a request
GenerationHandleImpl(const GenerationHandleImpl&) = delete;
GenerationHandleImpl& operator=(const GenerationHandleImpl&) = delete;
GenerationHandle(const GenerationHandle&) = delete;
GenerationHandle& operator=(const GenerationHandle&) = delete;

GenerationStatus get_status();
GenerationStatus get_status() const;

bool can_read();
// whether new tokens are available
Collaborator suggested change:
- // whether new tokens are available
+ // whether read() is possible (new tokens are available and handle has not been dropped)

bool can_read() const;

// client drops generation session on server
void drop();

GenerationOutputs back();
@@ -92,5 +109,4 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl {
std::vector<GenerationOutput> read_all();
};

using GenerationHandle = std::shared_ptr<GenerationHandleImpl>;
}
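
To show how the renamed GenerationHandle above might be consumed, here is a hedged polling sketch. can_read(), read_all(), back(), get_status(), and drop() are declared in this header; read() and has_non_finished_requests() are assumptions based on the review comment and the surrounding pipeline API.

#include <openvino/genai/continuous_batching_pipeline.hpp>
#include <openvino/genai/generation_handle.hpp>

#include <cstdint>
#include <vector>

// Collect tokens for a single request; assumes the pipeline and handle were
// created as in the earlier submission sketch.
std::vector<int64_t> collect_tokens(ov::genai::ContinuousBatchingPipeline& pipe,
                                    const ov::genai::GenerationHandle::Ptr& handle) {
    std::vector<int64_t> token_ids;
    while (pipe.has_non_finished_requests()) {  // assumed helper, not shown in this diff
        pipe.step();
        if (handle->can_read()) {
            // read() is assumed to return the outputs produced since the previous
            // read, per the review comment above; read_all() would return everything.
            for (const auto& [seq_id, output] : handle->read()) {
                token_ids.insert(token_ids.end(),
                                 output.generated_token_ids.begin(),
                                 output.generated_token_ids.end());
            }
        }
    }
    // A client that stops caring mid-generation could instead call handle->drop(),
    // which releases the server-side session for this request.
    return token_ids;
}
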
24 changes: 15 additions & 9 deletions src/cpp/include/openvino/genai/scheduler_config.hpp
@@ -9,34 +9,40 @@ namespace ov::genai {
struct SchedulerConfig {
// a maximum number of tokens to batch
// (in constrast to max_batch_size which combines independent sequences, we consider total amount of tokens in a batch)
// TODO: benchmark this value and understand a required value to ensure inference is not memory bound
std::size_t max_num_batched_tokens = 256;

// total number of KV blocks available to scheduler logic
// Note, if it's set to 0, then `cache_size` must be specified
std::size_t num_kv_blocks = 0;

// total size of KV cache in GB
// Note, if it's set to 0, then `num_kv_blocks` must be specified
std::size_t cache_size = 1;

// block size for KV cache
std::size_t block_size = 32;

// whether to split prompt / generate to different scheduling phases
// - Dynamic split fuse schdules requests in generation phase first, then
// schdules requests in prompt phase. If request cannot be fully fit into
// remaining space of 'max_num_batched_tokens' group, it's scheduled only partially
// and other tokens can be scheduled only next iterations
// - vLLM mode priorities requests in prompt phase over requests on generation phase
Comment on lines 25 to +30 (Collaborator), suggested change:
- // whether to split prompt / generate to different scheduling phases
- // - Dynamic split fuse schdules requests in generation phase first, then
- // schdules requests in prompt phase. If request cannot be fully fit into
- // remaining space of 'max_num_batched_tokens' group, it's scheduled only partially
- // and other tokens can be scheduled only next iterations
- // - vLLM mode priorities requests in prompt phase over requests on generation phase
+ // whether to split prompt / generate to different scheduling phases
+ // - Dynamic split fuse schedules requests in generation phase first, then
+ // schedules requests in prompt phase. If request cannot be fully fit into
+ // remaining space of 'max_num_batched_tokens' group, it's scheduled only partially
+ // and other tokens can be scheduled in next iterations
+ // - vLLM mode prioritizes requests in prompt phase over requests in generation phase

bool dynamic_split_fuse = true;

// Enable caching of KV-blocks.
// When turned on all previously calculated KV-caches are kept in memory for future usages.
// KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released.
// This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters.
// When turned off, only KV-cache required for batch calculation is kept in memory and
// when a sequence has finished generation, its KV cache blocks are released.
bool enable_prefix_caching = false;

//
// vLLM-like settings
//

// max number of scheduled sequences (you can think of it as "max batch size")
std::size_t max_num_seqs = 256;

// Enable caching of KV-blocks.
// When turned on all previously calculated KV-caches are kept in memory for future usages.
// KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released.
// This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters.
// When turend off only KV-cache required for batch calculation is kept in memory and
// when a sequence has finished genegartion its cache is released.
bool enable_prefix_caching = false;
};
}
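
As a recap of the fields documented above, a small configuration sketch follows; the numeric values are illustrative only, not recommendations from this PR.

#include <openvino/genai/scheduler_config.hpp>

ov::genai::SchedulerConfig make_scheduler_config() {
    ov::genai::SchedulerConfig config;
    config.max_num_batched_tokens = 256;  // token budget per scheduling iteration
    config.num_kv_blocks = 0;             // 0 means: size the cache from cache_size instead
    config.cache_size = 8;                // total KV cache size in GB (illustrative)
    config.block_size = 32;               // KV cache block granularity
    config.dynamic_split_fuse = true;     // schedule generation first, split long prompts
    config.max_num_seqs = 256;            // vLLM-style cap on concurrently scheduled sequences
    config.enable_prefix_caching = false; // keep only the KV cache needed for the current batch
    return config;
}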