From f61cb8d4076c92be30f2244f9825cd8ff178f526 Mon Sep 17 00:00:00 2001
From: Ethan Yang
Date: Fri, 1 Mar 2024 10:04:24 -0800
Subject: [PATCH] community[minor]: Add openvino backend support (#11591)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- **Description:** add OpenVINO backend support via Hugging Face Optimum Intel
- **Dependencies:** "optimum[openvino]"

---------

Co-authored-by: Bagatur
---
 .../llms/huggingface_pipelines.ipynb          | 90 ++++++++++++++++++-
 .../integrations/platforms/huggingface.mdx    | 16 ++++
 .../llms/huggingface_pipeline.py              | 71 +++++++++++++--
 .../llms/test_huggingface_pipeline.py         | 42 +++++++++
 4 files changed, 212 insertions(+), 7 deletions(-)

diff --git a/docs/docs/integrations/llms/huggingface_pipelines.ipynb b/docs/docs/integrations/llms/huggingface_pipelines.ipynb
index b4e919a1d5993..6b2f48c9d736e 100644
--- a/docs/docs/integrations/llms/huggingface_pipelines.ipynb
+++ b/docs/docs/integrations/llms/huggingface_pipelines.ipynb
@@ -192,6 +192,94 @@
     "for answer in answers:\n",
     "    print(answer)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df1d41d9",
+   "metadata": {},
+   "source": [
+    "### Inference with OpenVINO backend\n",
+    "\n",
+    "To deploy a model with OpenVINO, you can specify the `backend=\"openvino\"` parameter to use OpenVINO as the backend inference framework.\n",
+    "\n",
+    "If you have an Intel GPU, you can specify `model_kwargs={\"device\": \"GPU\"}` to run inference on it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "efb73dd7-77bf-4436-92e5-51306af45bd7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install --upgrade-strategy eager \"optimum[openvino,nncf]\" --quiet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70f6826c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\", \"NUM_STREAMS\": \"1\", \"CACHE_DIR\": \"\"}\n",
+    "\n",
+    "ov_llm = HuggingFacePipeline.from_model_id(\n",
+    "    model_id=\"gpt2\",\n",
+    "    task=\"text-generation\",\n",
+    "    backend=\"openvino\",\n",
+    "    model_kwargs={\"device\": \"CPU\", \"ov_config\": ov_config},\n",
+    "    pipeline_kwargs={\"max_new_tokens\": 10},\n",
+    ")\n",
+    "\n",
+    "ov_chain = prompt | ov_llm\n",
+    "\n",
+    "question = \"What is electroencephalography?\"\n",
+    "\n",
+    "print(ov_chain.invoke({\"question\": question}))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12524837-e9ab-455a-86be-66b95f4f893a",
+   "metadata": {},
+   "source": [
+    "### Inference with local OpenVINO model\n",
+    "\n",
+    "It is possible to [export your model](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#export) to the OpenVINO IR format with the CLI and load the model from a local folder.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d1104a2-79c7-43a6-aa1c-8076a5ad7747",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!optimum-cli export openvino --model gpt2 ov_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac71e60d-5595-454e-8602-03ebb0248205",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ov_llm = HuggingFacePipeline.from_model_id(\n",
+    "    model_id=\"ov_model\",\n",
+    "    task=\"text-generation\",\n",
+    "    backend=\"openvino\",\n",
+    "    model_kwargs={\"device\": \"CPU\", \"ov_config\": ov_config},\n",
+    "    pipeline_kwargs={\"max_new_tokens\": 10},\n",
+    ")\n",
+    "\n",
+    "ov_chain = prompt | ov_llm\n",
+    "\n",
+    "question = \"What is electroencephalography?\"\n",
+    "\n",
"print(ov_chain.invoke({\"question\": question}))" + ] } ], "metadata": { @@ -210,7 +298,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/docs/docs/integrations/platforms/huggingface.mdx b/docs/docs/integrations/platforms/huggingface.mdx index 927464a5e9795..b985a3b9568c4 100644 --- a/docs/docs/integrations/platforms/huggingface.mdx +++ b/docs/docs/integrations/platforms/huggingface.mdx @@ -40,6 +40,22 @@ See a [usage example](/docs/integrations/llms/huggingface_pipelines). from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline ``` +To use the OpenVINO backend in local pipeline wrapper, please install the optimum library and set HuggingFacePipeline's backend as `openvino`: + +```bash +pip install --upgrade-strategy eager "optimum[openvino,nncf]" +``` + +See a [usage example](/docs/integrations/llms/huggingface_pipelines) + +To export your model to the OpenVINO IR format with the CLI: + +```bash +optimum-cli export openvino --model gpt2 ov_model +``` + +To apply [weight-only quantization](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#export) when exporting your model. + ### Hugging Face TextGen Inference >[Text Generation Inference](https://github.com/huggingface/text-generation-inference) is diff --git a/libs/community/langchain_community/llms/huggingface_pipeline.py b/libs/community/langchain_community/llms/huggingface_pipeline.py index 7a2b915054fb5..f95a994d68fa0 100644 --- a/libs/community/langchain_community/llms/huggingface_pipeline.py +++ b/libs/community/langchain_community/llms/huggingface_pipeline.py @@ -68,6 +68,7 @@ def from_model_id( cls, model_id: str, task: str, + backend: str = "default", device: Optional[int] = -1, device_map: Optional[str] = None, model_kwargs: Optional[dict] = None, @@ -95,9 +96,57 @@ def from_model_id( try: if task == "text-generation": - model = AutoModelForCausalLM.from_pretrained(model_id, **_model_kwargs) + if backend == "openvino": + try: + from optimum.intel.openvino import OVModelForCausalLM + + except ImportError: + raise ValueError( + "Could not import optimum-intel python package. " + "Please install it with: " + "pip install 'optimum[openvino,nncf]' " + ) + try: + # use local model + model = OVModelForCausalLM.from_pretrained( + model_id, **_model_kwargs + ) + + except Exception: + # use remote model + model = OVModelForCausalLM.from_pretrained( + model_id, export=True, **_model_kwargs + ) + else: + model = AutoModelForCausalLM.from_pretrained( + model_id, **_model_kwargs + ) elif task in ("text2text-generation", "summarization"): - model = AutoModelForSeq2SeqLM.from_pretrained(model_id, **_model_kwargs) + if backend == "openvino": + try: + from optimum.intel.openvino import OVModelForSeq2SeqLM + + except ImportError: + raise ValueError( + "Could not import optimum-intel python package. 
" + "Please install it with: " + "pip install 'optimum[openvino,nncf]' " + ) + try: + # use local model + model = OVModelForSeq2SeqLM.from_pretrained( + model_id, **_model_kwargs + ) + + except Exception: + # use remote model + model = OVModelForSeq2SeqLM.from_pretrained( + model_id, export=True, **_model_kwargs + ) + else: + model = AutoModelForSeq2SeqLM.from_pretrained( + model_id, **_model_kwargs + ) else: raise ValueError( f"Got invalid task {task}, " @@ -112,9 +161,13 @@ def from_model_id( tokenizer.pad_token_id = model.config.eos_token_id if ( - getattr(model, "is_loaded_in_4bit", False) - or getattr(model, "is_loaded_in_8bit", False) - ) and device is not None: + ( + getattr(model, "is_loaded_in_4bit", False) + or getattr(model, "is_loaded_in_8bit", False) + ) + and device is not None + and backend == "default" + ): logger.warning( f"Setting the `device` argument to None from {device} to avoid " "the error caused by attempting to move the model that was already " @@ -123,7 +176,11 @@ def from_model_id( ) device = None - if device is not None and importlib.util.find_spec("torch") is not None: + if ( + device is not None + and importlib.util.find_spec("torch") is not None + and backend == "default" + ): import torch cuda_device_count = torch.cuda.device_count() @@ -142,6 +199,8 @@ def from_model_id( "can be a positive integer associated with CUDA device id.", cuda_device_count, ) + if device is not None and device_map is not None and backend == "openvino": + logger.warning("Please set device for OpenVINO through: " "'model_kwargs'") if "trust_remote_code" in _model_kwargs: _model_kwargs = { k: v for k, v in _model_kwargs.items() if k != "trust_remote_code" diff --git a/libs/community/tests/integration_tests/llms/test_huggingface_pipeline.py b/libs/community/tests/integration_tests/llms/test_huggingface_pipeline.py index aa6a0e1defe8d..3928046913364 100755 --- a/libs/community/tests/integration_tests/llms/test_huggingface_pipeline.py +++ b/libs/community/tests/integration_tests/llms/test_huggingface_pipeline.py @@ -80,3 +80,45 @@ def test_huggingface_pipeline_runtime_kwargs() -> None: prompt = "Say foo:" output = llm(prompt, pipeline_kwargs={"max_new_tokens": 2}) assert len(output) < 10 + + +ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""} + + +def test_huggingface_pipeline_text_generation_ov() -> None: + """Test valid call to HuggingFace text generation model with openvino.""" + llm = HuggingFacePipeline.from_model_id( + model_id="gpt2", + task="text-generation", + backend="openvino", + model_kwargs={"device": "CPU", "ov_config": ov_config}, + pipeline_kwargs={"max_new_tokens": 64}, + ) + output = llm("Say foo:") + assert isinstance(output, str) + + +def test_huggingface_pipeline_text2text_generation_ov() -> None: + """Test valid call to HuggingFace text2text generation model with openvino.""" + llm = HuggingFacePipeline.from_model_id( + model_id="google/flan-t5-small", + task="text2text-generation", + backend="openvino", + model_kwargs={"device": "CPU", "ov_config": ov_config}, + pipeline_kwargs={"max_new_tokens": 64}, + ) + output = llm("Say foo:") + assert isinstance(output, str) + + +def text_huggingface_pipeline_summarization_ov() -> None: + """Test valid call to HuggingFace summarization model with openvino.""" + llm = HuggingFacePipeline.from_model_id( + model_id="facebook/bart-large-cnn", + task="summarization", + backend="openvino", + model_kwargs={"device": "CPU", "ov_config": ov_config}, + pipeline_kwargs={"max_new_tokens": 64}, + ) + output 
= llm("Say foo:") + assert isinstance(output, str)