causal_lm: add TextStreamer #130

Merged
3 changes: 1 addition & 2 deletions text_generation/causal_lm/cpp/README.md
@@ -62,14 +62,13 @@ This pipeline can work with other similar topologies produced by `optimum-intel`
### Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
-`beam_search_causal_lm` requires omitting `--streaming-detokenizer` for `convert_tokenizers.py`.

```sh
source <INSTALL_DIR>/setupvars.sh
python -m pip install --upgrade-strategy eager "optimum[openvino]>=1.14" -r ../../../llm_bench/python/requirements.txt ../../../thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu
python -m pip uninstall openvino # Uninstall openvino from PyPI because there's one from the archive installed
python ../../../llm_bench/python/convert.py --model_id meta-llama/Llama-2-7b-hf --output_dir ./Llama-2-7b-hf/ --precision FP16 --stateful
-convert_tokenizer ./Llama-2-7b-hf/pytorch/dldt/FP16/ --output ./Llama-2-7b-hf/pytorch/dldt/FP16/ --with-detokenizer --streaming-detokenizer --trust-remote-code
+convert_tokenizer ./Llama-2-7b-hf/pytorch/dldt/FP16/ --output ./Llama-2-7b-hf/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
```

## Run
47 changes: 40 additions & 7 deletions text_generation/causal_lm/cpp/greedy_causal_lm.cpp
@@ -11,14 +11,46 @@ std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest& tokenizer, std::str
return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")};
}

-void print_token(ov::InferRequest& detokenizer, int64_t out_token) {
+std::string detokenize(ov::InferRequest& detokenizer, std::vector<int64_t>& tokens) {
    constexpr size_t BATCH_SIZE = 1;
-    ov::Tensor inp = detokenizer.get_input_tensor();
-    inp.set_shape({BATCH_SIZE, 1});
-    inp.data<int64_t>()[0] = out_token;
+    detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()});
    detokenizer.infer();
-    std::cout << detokenizer.get_output_tensor().data<std::string>()[0] << std::flush;
+    return detokenizer.get_output_tensor().data<std::string>()[0];
}

+// The following reasons require TextStreamer to keep cache of previous tokens:
+// Detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a",
+// but detokenize(tokenize("prefix a")) == "prefix a"
+// One printable token may consist of 2 token ids: detokenize(incomplete_token_id) == "�"
+struct TextStreamer {
+    ov::InferRequest detokenizer;
+    std::vector<int64_t> token_cache;
+    size_t print_len = 0;
+
+    void put(int64_t token) {
+        token_cache.push_back(token);
+        std::string text = detokenize(detokenizer, token_cache);
+        if (!text.empty() && '\n' == text.back()) {
+            // Flush the cache after the new line symbol
+            std::cout << std::string_view{text.data() + print_len, text.size() - print_len};
+            token_cache.clear();
+            print_len = 0;
+            return;
+        }
+        if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) {
+            // Don't print incomplete text
+            return;
+        }
+        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
+        print_len = text.size();
+    }
+
+    void end() {
+        std::string text = detokenize(detokenizer, token_cache);
+        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n';
+        token_cache.clear();
+        print_len = 0;
+    }
+};
}

int main(int argc, char* argv[]) try {
@@ -51,6 +83,7 @@ int main(int argc, char* argv[]) try {

lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1});
position_ids.set_shape({BATCH_SIZE, 1});
+    TextStreamer text_streamer{std::move(detokenizer)};
// There's no way to extract special token values from the detokenizer for now
constexpr int64_t SPECIAL_EOS_TOKEN = 2;
while (out_token != SPECIAL_EOS_TOKEN) {
@@ -59,12 +92,12 @@ int main(int argc, char* argv[]) try {
std::fill_n(lm.get_tensor("attention_mask").data<int64_t>(), lm.get_tensor("attention_mask").get_size(), 1);
position_ids.data<int64_t>()[0] = int64_t(lm.get_tensor("attention_mask").get_size() - 2);
lm.start_async();
-        print_token(detokenizer, out_token);
+        text_streamer.put(out_token);
lm.wait();
logits = lm.get_tensor("logits").data<float>();
out_token = std::max_element(logits, logits + vocab_size) - logits;
}
-    std::cout << '\n';
+    text_streamer.end();
} catch (const std::exception& error) {
std::cerr << error.what() << '\n';
return 1;
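The comment above `TextStreamer` explains why a cache of previous token ids is needed: the detokenizer drops a leading space, and one printable character can span two token ids. As a minimal illustration of that reasoning, here is a self-contained sketch of the same cache-and-hold-back idea driven by a mock detokenizer, so it builds without OpenVINO; `mock_detokenize`, `ToyStreamer`, and the toy vocabulary are illustrative assumptions, not code from this PR.

```cpp
// Standalone sketch (not part of this PR): the cache-and-hold-back idea behind
// TextStreamer, driven by a mock detokenizer instead of an OpenVINO InferRequest.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <string_view>
#include <vector>

// Toy vocabulary: ids 3 and 4 each carry half of the two UTF-8 bytes of "é" (0xC3 0xA9),
// mimicking one printable character split across two token ids.
static const std::map<int64_t, std::string> FRAGMENTS{
    {1, "Hello"}, {2, ","}, {3, "\xC3"}, {4, "\xA9"}, {5, "!"}};

std::string mock_detokenize(const std::vector<int64_t>& tokens) {
    std::string text;
    for (int64_t id : tokens) {
        text += FRAGMENTS.at(id);
    }
    // A real detokenizer renders a dangling UTF-8 lead byte as U+FFFD ("�"); emulate that
    // so the hold-back check below can match on the replacement character like the PR does.
    if (!text.empty() && static_cast<unsigned char>(text.back()) == 0xC3) {
        text.pop_back();
        text += "\xEF\xBF\xBD";  // "�"
    }
    return text;
}

struct ToyStreamer {
    std::vector<int64_t> token_cache;
    size_t print_len = 0;

    void put(int64_t token) {
        token_cache.push_back(token);
        std::string text = mock_detokenize(token_cache);
        if (text.size() >= 3 && text.compare(text.size() - 3, 3, "\xEF\xBF\xBD") == 0) {
            return;  // Incomplete character at the tail: keep caching, print nothing yet
        }
        // Print only the part that has not been printed before
        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
        print_len = text.size();
    }
};

int main() {
    ToyStreamer streamer;
    for (int64_t id : {1, 2, 3, 4, 5}) {
        streamer.put(id);
    }
    std::cout << '\n';
}
```

Feeding ids 1–5 prints `Hello,` immediately, holds output while only the first byte of `é` is cached (the mock, like a real detokenizer, renders the dangling byte as `�`), then prints `é!` once the character is complete. The real `TextStreamer` additionally flushes and clears its cache whenever the decoded text ends in a newline.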
2 changes: 1 addition & 1 deletion text_generation/causal_lm/cpp/set_up_and_run.sh
@@ -22,5 +22,5 @@ cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
wait

-convert_tokenizer ./open_llama_3b_v2/pytorch/dldt/FP16/ --output ./open_llama_3b_v2/pytorch/dldt/FP16/ --with-detokenizer --streaming-detokenizer
+convert_tokenizer ./open_llama_3b_v2/pytorch/dldt/FP16/ --output ./open_llama_3b_v2/pytorch/dldt/FP16/ --with-detokenizer
./build/greedy_causal_lm ./open_llama_3b_v2/pytorch/dldt/FP16/ "return 0"