diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index bdc307cf9..4d281d68b 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -13,13 +13,10 @@ import torch import numpy as np from openvino.runtime import get_version -from utils.config_class import DEFAULT_MODEL_CLASSES import PIL import hashlib import utils.metrics_print import utils.output_csv -import utils.hook_greedy_search -import utils.hook_beam_search import traceback from transformers import set_seed from PIL import Image @@ -27,8 +24,6 @@ from utils.hook_forward import StableDiffusionHook import utils.output_json -HOOK_BEAM_SEARCH_UTILS = {'pt': utils.hook_beam_search, 'ov': utils.hook_beam_search} -HOOK_GREEDY_SEARCH_UTILS = {'pt': utils.hook_greedy_search, 'ov': utils.hook_greedy_search} FW_UTILS = {'pt': utils.pt_utils, 'ov': utils.ov_utils} DEFAULT_INFERENCE_STEPS = 20 @@ -168,17 +163,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, def run_text_generation_benchmark(model_path, framework, device, args, num_iters): - model, tokenizer, pretrain_time = FW_UTILS[framework].create_text_gen_model(model_path, device, **args) - # Override forward for statistic each forward time. - default_model_type = DEFAULT_MODEL_CLASSES[args['use_case']] - model_type = args.get('model_type', default_model_type) - - if args['num_beams'] > 1: - bench_hook = HOOK_BEAM_SEARCH_UTILS[framework].BeamSearchHook() - else: - bench_hook = HOOK_GREEDY_SEARCH_UTILS[framework].GreedySearchHook() - bench_hook.new_forward(model, model_type) - + model, tokenizer, pretrain_time, bench_hook = FW_UTILS[framework].create_text_gen_model(model_path, device, **args) iter_data_list = [] input_text_list = utils.model_utils.get_prompts(args) if len(input_text_list) == 0: diff --git a/llm_bench/python/utils/ov_utils.py b/llm_bench/python/utils/ov_utils.py index 306c12d92..17416304a 100644 --- a/llm_bench/python/utils/ov_utils.py +++ b/llm_bench/python/utils/ov_utils.py @@ -9,6 +9,8 @@ import torch import time import types +import utils.hook_greedy_search +import utils.hook_beam_search from utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES from .ov_model_classes import register_normalized_configs @@ -165,13 +167,18 @@ def create_text_gen_model(model_path, device, **kwargs): if not isinstance(ov_model, OV_MODEL_CLASSES_MAPPING['t5']): patch_inter_processing_and_compile(ov_model, **kwargs) end = time.perf_counter() + if kwargs['num_beams'] > 1: + bench_hook = utils.hook_beam_search.BeamSearchHook() + else: + bench_hook = utils.hook_greedy_search.GreedySearchHook() + bench_hook.new_forward(ov_model, model_type) from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') # load token tokenizer = token_class.from_pretrained(model_path, trust_remote_code=True) if kwargs.get("convert_tokenizer", False): tokenizer = build_ov_tokenizer(tokenizer) - return ov_model, tokenizer, from_pretrained_time + return ov_model, tokenizer, from_pretrained_time, bench_hook def create_image_gen_model(model_path, device, **kwargs): diff --git a/llm_bench/python/utils/pt_utils.py b/llm_bench/python/utils/pt_utils.py index d39f984bd..d703f4bb1 100644 --- a/llm_bench/python/utils/pt_utils.py +++ b/llm_bench/python/utils/pt_utils.py @@ -8,6 +8,8 @@ import time import logging as log import openvino.torch # noqa: F401 +import utils.hook_greedy_search +import utils.hook_beam_search MAX_CONNECT_TIME = 50 @@ -74,7 +76,7 @@ def create_text_gen_model(model_path, device, **kwargs): gptneoxclm = 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM' chatglmfcg = 'transformers_modules.pytorch_original.modeling_chatglm.ChatGLMForConditionalGeneration' real_base_model_name = str(type(model)).lower() - log.info('Real base model=', real_base_model_name) + log.info(f'Real base model={real_base_model_name}') # bfclm will trigger generate crash. # If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch @@ -93,11 +95,17 @@ def create_text_gen_model(model_path, device, **kwargs): else: raise RuntimeError('==Failure ==: no device to load') + if kwargs['num_beams'] > 1: + bench_hook = utils.hook_beam_search.BeamSearchHook() + else: + bench_hook = utils.hook_greedy_search.GreedySearchHook() + bench_hook.new_forward(model, model_type) + if kwargs['torch_compile_backend']: backend = kwargs['torch_compile_backend'] compiled_model = run_torch_compile(model, backend) model = compiled_model - return model, tokenizer, from_pretrain_time + return model, tokenizer, from_pretrain_time, bench_hook def create_image_gen_model(model_path, device, **kwargs):