causal_lm: add TextStreamer #130

Merged
3 changes: 1 addition & 2 deletions text_generation/causal_lm/cpp/README.md
@@ -62,14 +62,13 @@ This pipeline can work with other similar topologies produced by `optimum-intel`
### Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
-`beam_search_causal_lm` requires omitting `--streaming-detokenizer` for `convert_tokenizers.py`.

```sh
source <INSTALL_DIR>/setupvars.sh
python -m pip install --upgrade-strategy eager "optimum[openvino]>=1.14" -r ../../../llm_bench/python/requirements.txt ../../../thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu
python -m pip uninstall openvino # Uninstall openvino from PyPI because there's one from the archive installed
python ../../../llm_bench/python/convert.py --model_id meta-llama/Llama-2-7b-hf --output_dir ./Llama-2-7b-hf/ --precision FP16 --stateful
-convert_tokenizer ./Llama-2-7b-hf/pytorch/dldt/FP16/ --output ./Llama-2-7b-hf/pytorch/dldt/FP16/ --with-detokenizer --streaming-detokenizer --trust-remote-code
+convert_tokenizer ./Llama-2-7b-hf/pytorch/dldt/FP16/ --output ./Llama-2-7b-hf/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
```

## Run
47 changes: 40 additions & 7 deletions text_generation/causal_lm/cpp/greedy_causal_lm.cpp
@@ -11,14 +11,46 @@ std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest& tokenizer, std::str
return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")};
}

-void print_token(ov::InferRequest& detokenizer, int64_t out_token) {
+std::string detokenize(ov::InferRequest& detokenizer, std::vector<int64_t>& tokens) {
    constexpr size_t BATCH_SIZE = 1;
-    ov::Tensor inp = detokenizer.get_input_tensor();
-    inp.set_shape({BATCH_SIZE, 1});
-    inp.data<int64_t>()[0] = out_token;
+    detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()});
    detokenizer.infer();
-    std::cout << detokenizer.get_output_tensor().data<std::string>()[0] << std::flush;
+    return detokenizer.get_output_tensor().data<std::string>()[0];
}

+// The following reasons require TextStreamer to keep cache of previous tokens:
+// Detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a",
+// but detokenize(tokenize("prefix a")) == "prefix a"
+// One printable token may consist of 2 token ids: detokenize(incomplete_token_id) == "�"
+struct TextStreamer {
+    ov::InferRequest detokenizer;
+    std::vector<int64_t> token_cache;
+    size_t print_len = 0;
+
+    void put(int64_t token) {
+        token_cache.push_back(token);
+        std::string text = detokenize(detokenizer, token_cache);
+        if (!text.empty() && '\n' == text.back()) {
+            // Flush the cache after the new line symbol
+            std::cout << std::string_view{text.data() + print_len, text.size() - print_len};
+            token_cache.clear();
+            print_len = 0;
+            return;
+        }
+        if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) {
+            // Don't print incomplete text
+            return;
+        }
+        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
+        print_len = text.size();
+    }
+
+    void end() {
+        std::string text = detokenize(detokenizer, token_cache);
+        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n';
+        token_cache.clear();
+        print_len = 0;
+    }
+};
}

int main(int argc, char* argv[]) try {
@@ -51,6 +83,7 @@ int main(int argc, char* argv[]) try {

lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1});
position_ids.set_shape({BATCH_SIZE, 1});
+    TextStreamer text_streamer{std::move(detokenizer)};
// There's no way to extract special token values from the detokenizer for now
constexpr int64_t SPECIAL_EOS_TOKEN = 2;
while (out_token != SPECIAL_EOS_TOKEN) {
@@ -59,12 +92,12 @@ int main(int argc, char* argv[]) try {
std::fill_n(lm.get_tensor("attention_mask").data<int64_t>(), lm.get_tensor("attention_mask").get_size(), 1);
position_ids.data<int64_t>()[0] = int64_t(lm.get_tensor("attention_mask").get_size() - 2);
lm.start_async();
-        print_token(detokenizer, out_token);
+        text_streamer.put(out_token);
lm.wait();
logits = lm.get_tensor("logits").data<float>();
out_token = std::max_element(logits, logits + vocab_size) - logits;
}
-    std::cout << '\n';
+    text_streamer.end();
} catch (const std::exception& error) {
std::cerr << error.what() << '\n';
return 1;
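The comment above `TextStreamer` explains why a cache of previous token ids is needed: the detokenizer drops a leading space, and one printable character can span two token ids. As a minimal illustration of that reasoning, here is a self-contained sketch of the same cache-and-hold-back idea driven by a mock detokenizer, so it builds without OpenVINO; `mock_detokenize`, `ToyStreamer`, and the toy vocabulary are illustrative assumptions, not code from this PR.

```cpp
// Standalone sketch (not part of this PR): the cache-and-hold-back idea behind
// TextStreamer, driven by a mock detokenizer instead of an OpenVINO InferRequest.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <string_view>
#include <vector>

// Toy vocabulary: ids 3 and 4 each carry half of the two UTF-8 bytes of "é" (0xC3 0xA9),
// mimicking one printable character split across two token ids.
static const std::map<int64_t, std::string> FRAGMENTS{
    {1, "Hello"}, {2, ","}, {3, "\xC3"}, {4, "\xA9"}, {5, "!"}};

std::string mock_detokenize(const std::vector<int64_t>& tokens) {
    std::string text;
    for (int64_t id : tokens) {
        text += FRAGMENTS.at(id);
    }
    // A real detokenizer renders a dangling UTF-8 lead byte as U+FFFD ("�"); emulate that
    // so the hold-back check below can match on the replacement character like the PR does.
    if (!text.empty() && static_cast<unsigned char>(text.back()) == 0xC3) {
        text.pop_back();
        text += "\xEF\xBF\xBD";  // "�"
    }
    return text;
}

struct ToyStreamer {
    std::vector<int64_t> token_cache;
    size_t print_len = 0;

    void put(int64_t token) {
        token_cache.push_back(token);
        std::string text = mock_detokenize(token_cache);
        if (text.size() >= 3 && text.compare(text.size() - 3, 3, "\xEF\xBF\xBD") == 0) {
            return;  // Incomplete character at the tail: keep caching, print nothing yet
        }
        // Print only the part that has not been printed before
        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
        print_len = text.size();
    }
};

int main() {
    ToyStreamer streamer;
    for (int64_t id : {1, 2, 3, 4, 5}) {
        streamer.put(id);
    }
    std::cout << '\n';
}
```

Feeding ids 1–5 prints `Hello,` immediately, holds output while only the first byte of `é` is cached (the mock, like a real detokenizer, renders the dangling byte as `�`), then prints `é!` once the character is complete. The real `TextStreamer` additionally flushes and clears its cache whenever the decoded text ends in a newline.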
2 changes: 1 addition & 1 deletion text_generation/causal_lm/cpp/set_up_and_run.sh
@@ -22,5 +22,5 @@ cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
wait

-convert_tokenizer ./open_llama_3b_v2/pytorch/dldt/FP16/ --output ./open_llama_3b_v2/pytorch/dldt/FP16/ --with-detokenizer --streaming-detokenizer
+convert_tokenizer ./open_llama_3b_v2/pytorch/dldt/FP16/ --output ./open_llama_3b_v2/pytorch/dldt/FP16/ --with-detokenizer
./build/greedy_causal_lm ./open_llama_3b_v2/pytorch/dldt/FP16/ "return 0"