Fix pytorch bugs of llm bench (openvinotoolkit#192)
1. Fix TypeError: not all arguments converted during string formatting
2. Fix the bug that PyTorch does not output the time consumption of the 1st
   and 2nd tokens to CSV
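
The first fix is a stdlib logging pitfall: log.info('Real base model=', real_base_model_name) passes the name as a %-style formatting argument, but the message contains no %s placeholder, so the logging module reports the TypeError above. A minimal reproduction of the failure and the fix (the model-name value below is a made-up example):

import logging as log

log.basicConfig(level=log.INFO)
real_base_model_name = "<class 'gptneoxforcausallm'>"  # example value only

# Broken: the extra argument is treated as a %-format parameter; with no
# placeholder in the message, logging prints "--- Logging error ---" with
# "TypeError: not all arguments converted during string formatting".
log.info('Real base model=', real_base_model_name)

# Fixed (as in this commit): pre-format the message with an f-string.
log.info(f'Real base model={real_base_model_name}')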
wgzintel committed Jan 30, 2024
1 parent f1f6225 commit 9e3864e
Showing 3 changed files with 19 additions and 19 deletions.
17 changes: 1 addition & 16 deletions llm_bench/python/benchmark.py
@@ -13,22 +13,17 @@
 import torch
 import numpy as np
 from openvino.runtime import get_version
-from utils.config_class import DEFAULT_MODEL_CLASSES
 import PIL
 import hashlib
 import utils.metrics_print
 import utils.output_csv
-import utils.hook_greedy_search
-import utils.hook_beam_search
 import traceback
 from transformers import set_seed
 from PIL import Image
 from utils.memory_profile import MemConsumption
 from utils.hook_forward import StableDiffusionHook
 import utils.output_json

-HOOK_BEAM_SEARCH_UTILS = {'pt': utils.hook_beam_search, 'ov': utils.hook_beam_search}
-HOOK_GREEDY_SEARCH_UTILS = {'pt': utils.hook_greedy_search, 'ov': utils.hook_greedy_search}
 FW_UTILS = {'pt': utils.pt_utils, 'ov': utils.ov_utils}

 DEFAULT_INFERENCE_STEPS = 20
@@ -168,17 +163,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,


 def run_text_generation_benchmark(model_path, framework, device, args, num_iters):
-    model, tokenizer, pretrain_time = FW_UTILS[framework].create_text_gen_model(model_path, device, **args)
-    # Override forward for statistic each forward time.
-    default_model_type = DEFAULT_MODEL_CLASSES[args['use_case']]
-    model_type = args.get('model_type', default_model_type)
-
-    if args['num_beams'] > 1:
-        bench_hook = HOOK_BEAM_SEARCH_UTILS[framework].BeamSearchHook()
-    else:
-        bench_hook = HOOK_GREEDY_SEARCH_UTILS[framework].GreedySearchHook()
-    bench_hook.new_forward(model, model_type)
-
+    model, tokenizer, pretrain_time, bench_hook = FW_UTILS[framework].create_text_gen_model(model_path, device, **args)
     iter_data_list = []
     input_text_list = utils.model_utils.get_prompts(args)
     if len(input_text_list) == 0:
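
The change above moves bench-hook construction out of benchmark.py and into each backend's create_text_gen_model (see ov_utils.py and pt_utils.py below), which both now return the hook alongside the model. For orientation, a rough sketch of what such a forward hook does, as a hypothetical simplification rather than the actual utils.hook_greedy_search or utils.hook_beam_search code:

import time
import types

class MinimalForwardHook:
    # Hypothetical stand-in for GreedySearchHook/BeamSearchHook: wrap
    # model.forward so each decoding step's latency is recorded.
    def __init__(self):
        self.time_list = []

    def new_forward(self, model, model_type=None):
        original_forward = model.forward  # already bound to the model

        def timed_forward(model_self, *args, **kwargs):
            start = time.perf_counter()
            result = original_forward(*args, **kwargs)
            self.time_list.append(time.perf_counter() - start)
            return result

        model.forward = types.MethodType(timed_forward, model)

    def get_time_list(self):
        # time_list[0] is the 1st-token latency and time_list[1] the 2nd:
        # the values the PyTorch path previously failed to write to CSV.
        return self.time_list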
9 changes: 8 additions & 1 deletion llm_bench/python/utils/ov_utils.py
@@ -9,6 +9,8 @@
 import torch
 import time
 import types
+import utils.hook_greedy_search
+import utils.hook_beam_search

 from utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES
 from .ov_model_classes import register_normalized_configs
@@ -165,13 +167,18 @@ def create_text_gen_model(model_path, device, **kwargs):
     if not isinstance(ov_model, OV_MODEL_CLASSES_MAPPING['t5']):
         patch_inter_processing_and_compile(ov_model, **kwargs)
     end = time.perf_counter()
+    if kwargs['num_beams'] > 1:
+        bench_hook = utils.hook_beam_search.BeamSearchHook()
+    else:
+        bench_hook = utils.hook_greedy_search.GreedySearchHook()
+    bench_hook.new_forward(ov_model, model_type)
     from_pretrained_time = end - start
     log.info(f'From pretrained time: {from_pretrained_time:.2f}s')
     # load token
     tokenizer = token_class.from_pretrained(model_path, trust_remote_code=True)
     if kwargs.get("convert_tokenizer", False):
         tokenizer = build_ov_tokenizer(tokenizer)
-    return ov_model, tokenizer, from_pretrained_time
+    return ov_model, tokenizer, from_pretrained_time, bench_hook


 def create_image_gen_model(model_path, device, **kwargs):
12 changes: 10 additions & 2 deletions llm_bench/python/utils/pt_utils.py
@@ -8,6 +8,8 @@
 import time
 import logging as log
 import openvino.torch # noqa: F401
+import utils.hook_greedy_search
+import utils.hook_beam_search

 MAX_CONNECT_TIME = 50

@@ -74,7 +76,7 @@ def create_text_gen_model(model_path, device, **kwargs):
     gptneoxclm = 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM'
     chatglmfcg = 'transformers_modules.pytorch_original.modeling_chatglm.ChatGLMForConditionalGeneration'
     real_base_model_name = str(type(model)).lower()
-    log.info('Real base model=', real_base_model_name)
+    log.info(f'Real base model={real_base_model_name}')
     # bfclm will trigger generate crash.

     # If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch
@@ -93,11 +95,17 @@
     else:
         raise RuntimeError('==Failure ==: no device to load')

+    if kwargs['num_beams'] > 1:
+        bench_hook = utils.hook_beam_search.BeamSearchHook()
+    else:
+        bench_hook = utils.hook_greedy_search.GreedySearchHook()
+    bench_hook.new_forward(model, model_type)
+
     if kwargs['torch_compile_backend']:
         backend = kwargs['torch_compile_backend']
         compiled_model = run_torch_compile(model, backend)
         model = compiled_model
-    return model, tokenizer, from_pretrain_time
+    return model, tokenizer, from_pretrain_time, bench_hook


 def create_image_gen_model(model_path, device, **kwargs):
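
Two practical notes on the new signature (a reading of the diff, not stated in the commit): create_text_gen_model now returns four values in both backends, so any out-of-tree caller still unpacking three will fail, and the per-token times come from the returned hook. A sketch, with get_time_list assumed per the hook outline above:

# Old unpacking now raises "ValueError: too many values to unpack (expected 3)":
# model, tokenizer, pretrain_time = create_text_gen_model(model_path, device, **args)

# New contract, identical for pt_utils and ov_utils:
model, tokenizer, pretrain_time, bench_hook = create_text_gen_model(model_path, device, **args)

tok_times = bench_hook.get_time_list()  # assumed accessor name
first_token_latency = tok_times[0] if len(tok_times) > 0 else None
second_token_latency = tok_times[1] if len(tok_times) > 1 else None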
