support internvl2 (#1304)
hjh0119 committed Jul 6, 2024
1 parent 9796af9 commit 2443d0a
Showing 7 changed files with 85 additions and 17 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -47,6 +47,7 @@ SWIFT has rich documentation for users, please check [here](https://github.com/
SWIFT web-ui is available both on [Huggingface space](https://huggingface.co/spaces/tastelikefeet/swift) and [ModelScope studio](https://www.modelscope.cn/studios/iic/Scalable-lightWeight-Infrastructure-for-Fine-Tuning/summary), please feel free to try!

## 🎉 News
- 2024.07.06: Support the internvl2 series: internvl2-2b, internvl2-4b, internvl2-8b, internvl2-26b (see the sketch after this list).
- 2024.07.06: Support codegeex4-9b-chat.
- 2024.07.04: Support internlm2_5-7b series: internlm2_5-7b, internlm2_5-7b-chat, internlm2_5-7b-chat-1m.
- 2024.07.02: Support for using vLLM for accelerating inference and deployment of multimodal large models such as the llava series and phi3-vision models. You can refer to the [Multimodal & vLLM Inference Acceleration Documentation](docs/source_en/Multi-Modal/vllm-inference-acceleration.md) for more information.
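For readers who want to try the newly supported internvl2 models right away, a minimal inference sketch might look like the following (a sketch only, assuming the `infer_main`/`InferArguments` entry points exported by `swift.llm`; the model type name comes from this commit):

```python
# Minimal sketch: interactive inference with one of the newly supported
# internvl2 model types (assumes swift.llm exports infer_main/InferArguments).
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import InferArguments, infer_main

# 'internvl2-2b' is one of the model types registered by this commit.
infer_main(InferArguments(model_type='internvl2-2b'))
```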
@@ -559,7 +560,7 @@ The complete list of supported models and datasets can be found at [Supported Mo
| Llava1.5<br>Llava1.6 | [Llava series models](https://github.com/haotian-liu/LLaVA) | English | 7B-34B | chat model |
| Llava-Next | [Llava-Next series models](https://github.com/LLaVA-VL/LLaVA-NeXT) | Chinese<br>English | 8B-110B | chat model |
| mPLUG-Owl | [mPLUG-Owl series models](https://github.com/X-PLUG/mPLUG-Owl) | English | 11B | chat model |
| InternVL | [InternVL](https://github.com/OpenGVLab/InternVL) | Chinese<br>English | 2B-25.5B<br>including quantized version | chat model |
| InternVL<br>Mini-Internvl<br>Internvl2 | [InternVL](https://github.com/OpenGVLab/InternVL) | Chinese<br>English | 2B-25.5B<br>including quantized version | chat model |
| Llava-llama3 | [xtuner](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers) | English | 8B | chat model |
| Phi3-Vision | Microsoft | English | 4B | chat model |
| PaliGemma | Google | English | 3B | chat model |
3 changes: 2 additions & 1 deletion README_CN.md
@@ -48,6 +48,7 @@ SWIFT has rich documentation for users; if you run into problems, please check [here](https:
You can try the SWIFT web-ui on [Huggingface space](https://huggingface.co/spaces/tastelikefeet/swift) and [ModelScope Studio](https://www.modelscope.cn/studios/iic/Scalable-lightWeight-Infrastructure-for-Fine-Tuning/summary), feel free to give it a try!

## 🎉 News
- 2024.07.06: Support the internvl2 series: internvl2-2b, internvl2-4b, internvl2-8b, internvl2-26b.
- 2024.07.06: Support codegeex4-9b-chat.
- 2024.07.04: Support the internlm2_5-7b series: internlm2_5-7b, internlm2_5-7b-chat, internlm2_5-7b-chat-1m.
- 2024.07.02: Support using vLLM to accelerate inference and deployment of multimodal large models such as the llava series and phi3-vision. See the [Multimodal & vLLM Inference Acceleration documentation](docs/source/Multi-Modal/vLLM推理加速文档.md) for more information.
@@ -556,7 +557,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
| Llava1.5<br>Llava1.6 | [Llava series models](https://github.com/haotian-liu/LLaVA) | English | 7B-34B | chat model |
| Llava-Next | [Llava-Next series models](https://github.com/LLaVA-VL/LLaVA-NeXT) | Chinese<br>English | 8B-110B | chat model |
| mPLUG-Owl | [mPLUG-Owl series models](https://github.com/X-PLUG/mPLUG-Owl) | English | 11B | chat model |
| InternVL | [InternVL](https://github.com/OpenGVLab/InternVL) | Chinese<br>English | 2B-25.5B<br>including quantized version | chat model |
| InternVL<br>Mini-Internvl<br>Internvl2 | [InternVL](https://github.com/OpenGVLab/InternVL) | Chinese<br>English | 2B-25.5B<br>including quantized version | chat model |
| Llava-llama3 | [xtuner](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers) | English | 8B | chat model |
| Phi3-Vision | Microsoft | English | 4B | chat model |
| PaliGemma | Google | English | 3B | chat model |
4 changes: 4 additions & 0 deletions docs/source/LLM/支持的模型和数据集.md
@@ -348,6 +348,10 @@
|internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)|
|mini-internvl-chat-2b-v1_5|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)|
|mini-internvl-chat-4b-v1_5|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-4B-V1-5/summary)|qkv_proj|internvl-phi3|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)|
|internvl2-2b|[OpenGVLab/InternVL2-2B](https://modelscope.cn/models/OpenGVLab/InternVL2-2B/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/InternVL2-2B](https://huggingface.co/OpenGVLab/InternVL2-2B)|
|internvl2-4b|[OpenGVLab/InternVL2-4B](https://modelscope.cn/models/OpenGVLab/InternVL2-4B/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/InternVL2-4B](https://huggingface.co/OpenGVLab/InternVL2-4B)|
|internvl2-8b|[OpenGVLab/InternVL2-8B](https://modelscope.cn/models/OpenGVLab/InternVL2-8B/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B)|
|internvl2-26b|[OpenGVLab/InternVL2-26B](https://modelscope.cn/models/OpenGVLab/InternVL2-26B/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B)|
|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
|paligemma-3b-pt-224|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224/summary)|q_proj, k_proj, v_proj|paligemma|&#x2714;|&#x2718;|transformers>=4.41|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)|
21 changes: 15 additions & 6 deletions docs/source/Multi-Modal/internvl最佳实践.md
@@ -1,5 +1,18 @@

# InternVL Best Practice
The models covered in this document are as follows:

- [internvl-chat-v1_5](https://www.modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary)
- [internvl-chat-v1_5-int8](https://www.modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)
- [mini-internvl-chat-2b-v1_5](https://www.modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)
- [mini-internvl-chat-4b-v1_5](https://www.modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)
- [internvl2-2b](https://www.modelscope.cn/models/OpenGVLab/InternVL2-2B)
- [internvl2-4b](https://www.modelscope.cn/models/OpenGVLab/InternVL2-4B)
- [internvl2-8b](https://www.modelscope.cn/models/OpenGVLab/InternVL2-8B)
- [internvl2-26b](https://www.modelscope.cn/models/OpenGVLab/InternVL2-26B)


The practice below uses `internvl-chat-v1_5` as an example; you can switch to another model by specifying `--model_type`.

## Table of Contents
- [Environment Setup](#环境准备)
@@ -18,10 +31,6 @@ pip install Pillow

## Inference

Inference with [internvl-chat-v1.5](https://www.modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary) and [internvl-chat-v1.5-int8](https://www.modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)

The tutorial below uses `internvl-chat-v1.5` as an example; you can pass `--model_type internvl-chat-v1_5-int8` to select the int8 version of the model, or choose `mini-internvl-chat-2b-v1_5` or `mini-internvl-chat-4b-v1_5` to use Mini-Internvl.

**Note**
- To use a local model file, add the argument `--model_id_or_path /path/to/model` (see the sketch below).
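The same local-path override can presumably be done programmatically; a sketch, assuming the `model_id_or_path` keyword of `get_model_tokenizer` mirrors the CLI flag of the same name:

```python
# Sketch: load the model from a local checkout instead of downloading.
# The model_id_or_path keyword is assumed to mirror --model_id_or_path.
import torch
from swift.llm import get_model_tokenizer

model, tokenizer = get_model_tokenizer(
    'internvl-chat-v1_5', torch.bfloat16,
    model_kwargs={'device_map': 'auto'},
    model_id_or_path='/path/to/model')
```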
@@ -126,13 +135,13 @@ import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import (
get_model_tokenizer, get_template, inference, ModelType,
get_model_tokenizer, get_template, inference,
get_default_template_type, inference_stream
)
from swift.utils import seed_everything
import torch

model_type = ModelType.internvl_chat_v1_5
model_type = "internvl-chat-v1_5"
template_type = get_default_template_type(model_type)
print(f'template_type: {template_type}')
model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16,
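The diff collapses the rest of this snippet. A typical continuation under swift's documented inference flow would look roughly like this (a sketch; the `model_kwargs` argument and the `<img>...</img>` image-in-query convention are assumptions):

```python
# Sketch of the collapsed remainder of the snippet above (assumed).
model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16,
                                       model_kwargs={'device_map': 'auto'})
model.generation_config.max_new_tokens = 256
template = get_template(template_type, tokenizer)
seed_everything(42)

# Embedding the image URL in <img> tags is an assumed convention here.
query = '<img>http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png</img>Describe this image.'
response, history = inference(model, template, query)
print(f'query: {query}')
print(f'response: {response}')
```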
4 changes: 4 additions & 0 deletions docs/source_en/LLM/Supported-models-datasets.md
@@ -348,6 +348,10 @@ The table below introduces all models supported by SWIFT:
|internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)|
|mini-internvl-chat-2b-v1_5|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)|
|mini-internvl-chat-4b-v1_5|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-4B-V1-5/summary)|qkv_proj|internvl-phi3|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)|
|internvl2-2b|[OpenGVLab/InternVL2-2B](https://modelscope.cn/models/OpenGVLab/InternVL2-2B/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/InternVL2-2B](https://huggingface.co/OpenGVLab/InternVL2-2B)|
|internvl2-4b|[OpenGVLab/InternVL2-4B](https://modelscope.cn/models/OpenGVLab/InternVL2-4B/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/InternVL2-4B](https://huggingface.co/OpenGVLab/InternVL2-4B)|
|internvl2-8b|[OpenGVLab/InternVL2-8B](https://modelscope.cn/models/OpenGVLab/InternVL2-8B/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B)|
|internvl2-26b|[OpenGVLab/InternVL2-26B](https://modelscope.cn/models/OpenGVLab/InternVL2-26B/summary)|wqkv|internvl|&#x2714;|&#x2718;|transformers>=4.35, timm|vision|[OpenGVLab/InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B)|
|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
|paligemma-3b-pt-224|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224/summary)|q_proj, k_proj, v_proj|paligemma|&#x2714;|&#x2718;|transformers>=4.41|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)|
23 changes: 14 additions & 9 deletions docs/source_en/Multi-Modal/internvl-best-practice.md
@@ -1,4 +1,16 @@
# InternVL Best Practice
The document corresponds to the following models:

- [internvl-chat-v1_5](https://www.modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary)
- [internvl-chat-v1_5-int8](https://www.modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)
- [mini-internvl-chat-2b-v1_5](https://www.modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)
- [mini-internvl-chat-4b-v1_5](https://www.modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)
- [internvl2-2b](https://www.modelscope.cn/models/OpenGVLab/InternVL2-2B)
- [internvl2-4b](https://www.modelscope.cn/models/OpenGVLab/InternVL2-4B)
- [internvl2-8b](https://www.modelscope.cn/models/OpenGVLab/InternVL2-8B)
- [internvl2-26b](https://www.modelscope.cn/models/OpenGVLab/InternVL2-26B)

The following practice takes `internvl-chat-v1_5` as an example, and you can also switch to other models by specifying `--model_type`.

## Table of Contents
- [Environment Setup](#environment-setup)
@@ -16,13 +28,6 @@ pip install Pillow

## Inference

Inference for [internvl-chat-v1.5](https://www.modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary)
(To use a local model file, add the argument `--model_id_or_path /path/to/model`)

Inference with [internvl-chat-v1.5](https://www.modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary) and [internvl-chat-v1.5-int8](https://www.modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary).

The tutorial below takes `internvl-chat-v1.5` as an example, and you can change to `--model_type internvl-chat-v1_5-int8` to select the INT8 version of the model. Alternatively, select the Mini-Internvl model by choosing either `mini-internvl-chat-2b-v1_5` or `mini-internvl-chat-4b-v1_5`.

**Note**
- If you want to use a local model file, add the argument --model_id_or_path /path/to/model.
- If your GPU does not support flash attention, use the argument `--use_flash_attn false`. For int8 models, it is necessary to specify `--dtype bf16` during inference, otherwise the output may be garbled (see the sketch below).
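A programmatic equivalent of these two flags might look like this (a sketch; the `use_flash_attn` and `dtype` fields are assumed to mirror the CLI flags on `InferArguments`):

```python
# Sketch: disable flash attention and force bf16 for the int8 checkpoint.
# Field names are assumed to mirror the CLI flags.
from swift.llm import InferArguments, infer_main

infer_main(InferArguments(
    model_type='internvl-chat-v1_5-int8',
    dtype='bf16',            # int8 models may otherwise emit garbled output
    use_flash_attn=False))   # for GPUs without flash attention support
```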
@@ -106,13 +111,13 @@ import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import (
get_model_tokenizer, get_template, inference, ModelType,
get_model_tokenizer, get_template, inference,
get_default_template_type, inference_stream
)
from swift.utils import seed_everything
import torch

model_type = ModelType.internvl_chat_v1_5
model_type = "internvl-chat-v1_5"
template_type = get_default_template_type(model_type)
print(f'template_type: {template_type}')
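The remainder of this snippet is collapsed in the diff. A plausible continuation that also exercises the imported `inference_stream` looks roughly like this (a sketch; `model_kwargs` and the incremental-printing pattern are assumptions based on swift's other examples):

```python
# Sketch of the collapsed remainder, with streaming output (assumed).
model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16,
                                       model_kwargs={'device_map': 'auto'})
template = get_template(template_type, tokenizer)
seed_everything(42)

query = '<img>http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png</img>How many cats are in the picture?'
gen = inference_stream(model, template, query)
print(f'query: {query}\nresponse: ', end='')
print_idx = 0
for response, _ in gen:
    print(response[print_idx:], end='', flush=True)  # print only the new text
    print_idx = len(response)
print()
```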

44 changes: 44 additions & 0 deletions swift/llm/utils/model.py
@@ -266,6 +266,10 @@ class ModelType:
internvl_chat_v1_5_int8 = 'internvl-chat-v1_5-int8'
mini_internvl_chat_2b_v1_5 = 'mini-internvl-chat-2b-v1_5'
mini_internvl_chat_4b_v1_5 = 'mini-internvl-chat-4b-v1_5'
internvl2_2b = 'internvl2-2b'
internvl2_4b = 'internvl2-4b'
internvl2_8b = 'internvl2-8b'
internvl2_26b = 'internvl2-26b'
# deepseek
deepseek_7b = 'deepseek-7b'
deepseek_7b_chat = 'deepseek-7b-chat'
@@ -3635,6 +3639,46 @@ def _new_forward(*args, **kwargs):
placeholder_tokens=['<IMG_CONTEXT>'],
tags=['multi-modal', 'vision'],
hf_model_id='OpenGVLab/Mini-InternVL-Chat-4B-V1-5')
@register_model(
ModelType.internvl2_2b,
'OpenGVLab/InternVL2-2B',
LoRATM.internlm2,
TemplateType.internvl,
requires=['transformers>=4.35', 'timm'],
support_flash_attn=True,
placeholder_tokens=['<IMG_CONTEXT>'],
tags=['multi-modal', 'vision'],
hf_model_id='OpenGVLab/InternVL2-2B')
@register_model(
ModelType.internvl2_4b,
'OpenGVLab/InternVL2-4B',
LoRATM.internlm2,
TemplateType.internvl,
requires=['transformers>=4.35', 'timm'],
support_flash_attn=True,
placeholder_tokens=['<IMG_CONTEXT>'],
tags=['multi-modal', 'vision'],
hf_model_id='OpenGVLab/InternVL2-4B')
@register_model(
ModelType.internvl2_8b,
'OpenGVLab/InternVL2-8B',
LoRATM.internlm2,
TemplateType.internvl,
requires=['transformers>=4.35', 'timm'],
support_flash_attn=True,
placeholder_tokens=['<IMG_CONTEXT>'],
tags=['multi-modal', 'vision'],
hf_model_id='OpenGVLab/InternVL2-8B')
@register_model(
ModelType.internvl2_26b,
'OpenGVLab/InternVL2-26B',
LoRATM.internlm2,
TemplateType.internvl,
requires=['transformers>=4.35', 'timm'],
support_flash_attn=True,
placeholder_tokens=['<IMG_CONTEXT>'],
tags=['multi-modal', 'vision'],
hf_model_id='OpenGVLab/InternVL2-26B')
def get_model_tokenizer_internvl(model_dir: str,
torch_dtype: Dtype,
model_kwargs: Dict[str, Any],
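Each `@register_model` entry above binds a model_type to its ModelScope id, LoRA target modules, chat template, and requirements. Once registered, the new types resolve through the public helpers; a sketch of the downstream call (the `model_kwargs` argument is an assumption):

```python
# Sketch: consuming one of the registrations above.
import torch
from swift.llm import get_model_tokenizer, get_default_template_type

model_type = 'internvl2-8b'  # ModelType.internvl2_8b, registered above
template_type = get_default_template_type(model_type)  # 'internvl' per this diff
model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16,
                                       model_kwargs={'device_map': 'auto'})
```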
