From 9fa6fdb7aaf234a4f5859e0ab08d219e4fb775ea Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 14:58:18 +0800 Subject: [PATCH 01/12] add deepseek coder2 --- swift/llm/utils/model.py | 23 +++++++++++++++++++++++ swift/llm/utils/template.py | 8 ++++++++ 2 files changed, 31 insertions(+) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 735099f33..89109f959 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -271,6 +271,9 @@ class ModelType: deepseek_coder_6_7b_instruct = 'deepseek-coder-6_7b-instruct' deepseek_coder_33b = 'deepseek-coder-33b' deepseek_coder_33b_instruct = 'deepseek-coder-33b-instruct' + #deepseek2-coder + deepseek_coder_v2_instruct = 'deepseek-coder-v2-instruct' + deepseek_coder_v2_lite_instruct = 'deepseek-coder-v2-lite-instruct' # deepseek-math deepseek_math_7b = 'deepseek-math-7b' deepseek_math_7b_instruct = 'deepseek-math-7b-instruct' @@ -1860,6 +1863,26 @@ def _output_device_map_hook(module, input, output): support_vllm=True, tags=['coding'], hf_model_id='deepseek-ai/deepseek-coder-33b-instruct') +@register_model( + ModelType.deepseek_coder_v2_instruct, + 'deepseek-ai/DeepSeek-Coder-V2-Instruct', + LoRATM.llama, + TemplateType.deepseek_coder2, + eos_token='<|end▁of▁sentence|>', + support_flash_attn=True, + support_vllm=True, + tags=['coding'], + hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Instruct') +@register_model( + ModelType.deepseek_coder_v2_lite_instruct, + 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', + LoRATM.llama, + TemplateType.deepseek_coder2, + eos_token='<|end▁of▁sentence|>', + support_flash_attn=True, + support_vllm=True, + tags=['coding'], + hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct') @register_model( ModelType.openbuddy_deepseek_67b_chat, 'OpenBuddy/openbuddy-deepseek-67b-v15.2', diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 046753bc1..74d4500d3 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -64,6 
+64,7 @@ class TemplateType: sus = 'sus' deepseek = 'deepseek' deepseek_coder = 'deepseek-coder' + deepseek_coder2 = 'deepseek-coder2' deepseek_vl = 'deepseek-vl' deepseek2 = 'deepseek2' codefuse_codellama = 'codefuse-codellama' @@ -1311,6 +1312,13 @@ def __init__(self): 'and other non-computer science questions, you will refuse to answer\n'))) +register_template( + TemplateType.deepseek_coder2, + Template(['<|begin▁of▁sentence|>{{SYSTEM}}'], ['User: {{QUERY}}\n\n Assistant: \n'], + ['<|end▁of▁sentence|>'], ['<|end▁of▁sentence|>'], + None)) + + class LLavaTemplate(Template): def __init__(self): From 7fb92514eb21beb540b6e48aa7ba32075b52194b Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Tue, 18 Jun 2024 16:41:48 +0800 Subject: [PATCH 02/12] fix --- swift/llm/utils/model.py | 40 ++++++++++++++++++------------------- swift/llm/utils/template.py | 7 ------- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 89109f959..77896edc6 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -1863,26 +1863,6 @@ def _output_device_map_hook(module, input, output): support_vllm=True, tags=['coding'], hf_model_id='deepseek-ai/deepseek-coder-33b-instruct') -@register_model( - ModelType.deepseek_coder_v2_instruct, - 'deepseek-ai/DeepSeek-Coder-V2-Instruct', - LoRATM.llama, - TemplateType.deepseek_coder2, - eos_token='<|end▁of▁sentence|>', - support_flash_attn=True, - support_vllm=True, - tags=['coding'], - hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Instruct') -@register_model( - ModelType.deepseek_coder_v2_lite_instruct, - 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', - LoRATM.llama, - TemplateType.deepseek_coder2, - eos_token='<|end▁of▁sentence|>', - support_flash_attn=True, - support_vllm=True, - tags=['coding'], - hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct') @register_model( ModelType.openbuddy_deepseek_67b_chat, 'OpenBuddy/openbuddy-deepseek-67b-v15.2', @@ -3147,6 +3127,26 @@ 
def get_model_tokenizer_internlm2(model_dir: str, return model, tokenizer +@register_model( + ModelType.deepseek_coder_v2_instruct, + 'deepseek-ai/DeepSeek-Coder-V2-Instruct', + LoRATM.deepseek2, + TemplateType.deepseek2, + eos_token='<|end▁of▁sentence|>', + support_flash_attn=True, + support_vllm=True, + tags=['coding'], + hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Instruct') +@register_model( + ModelType.deepseek_coder_v2_lite_instruct, + 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', + LoRATM.deepseek2, + TemplateType.deepseek2, + eos_token='<|end▁of▁sentence|>', + support_flash_attn=True, + support_vllm=True, + tags=['coding'], + hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct') @register_model( ModelType.deepseek_v2_lite, 'deepseek-ai/DeepSeek-V2-Lite', diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 74d4500d3..3f37e2e8e 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -1312,13 +1312,6 @@ def __init__(self): 'and other non-computer science questions, you will refuse to answer\n'))) -register_template( - TemplateType.deepseek_coder2, - Template(['<|begin▁of▁sentence|>{{SYSTEM}}'], ['User: {{QUERY}}\n\n Assistant: \n'], - ['<|end▁of▁sentence|>'], ['<|end▁of▁sentence|>'], - None)) - - class LLavaTemplate(Template): def __init__(self): From a8ceff299f5ed3a215a79dd1adcde952be8f7c7a Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 16:44:01 +0800 Subject: [PATCH 03/12] fix --- swift/llm/utils/model.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 77896edc6..c64446c3c 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -3132,27 +3132,26 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-Coder-V2-Instruct', LoRATM.deepseek2, TemplateType.deepseek2, - eos_token='<|end▁of▁sentence|>', support_flash_attn=True, support_vllm=True, tags=['coding'], + 
requires=['transformers>=4.39.3'], hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Instruct') @register_model( ModelType.deepseek_coder_v2_lite_instruct, 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', LoRATM.deepseek2, TemplateType.deepseek2, - eos_token='<|end▁of▁sentence|>', support_flash_attn=True, support_vllm=True, tags=['coding'], + requires=['transformers>=4.39.3'], hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct') @register_model( ModelType.deepseek_v2_lite, 'deepseek-ai/DeepSeek-V2-Lite', LoRATM.deepseek2, TemplateType.default_generation, - support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], @@ -3162,7 +3161,6 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-V2-Lite-Chat', LoRATM.deepseek2, TemplateType.deepseek2, - support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], @@ -3172,7 +3170,6 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-V2-Chat', LoRATM.deepseek2, TemplateType.deepseek2, - support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], From 01717453aaf53a688feb8b98af8a69e991ce28b2 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 16:57:05 +0800 Subject: [PATCH 04/12] fix --- swift/llm/utils/template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 3f37e2e8e..57b75e0b9 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -18,7 +18,7 @@ DEFAULT_SYSTEM = 'You are a helpful assistant.' 
History = List[Union[Tuple[str, str], List[str]]] -Prompt = List[Union[str, List[str]]] +Prompt = List[Union[str, List[str], List[int]]] StopWords = Prompt Context = Union[str, List[int]] TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} From 51551ce46c64cecd7f96bb086958510ae2664f61 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 16:58:06 +0800 Subject: [PATCH 05/12] fix --- swift/llm/utils/model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index c64446c3c..590fdd5e7 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -3132,6 +3132,7 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-Coder-V2-Instruct', LoRATM.deepseek2, TemplateType.deepseek2, + support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, tags=['coding'], @@ -3142,6 +3143,7 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', LoRATM.deepseek2, TemplateType.deepseek2, + support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, tags=['coding'], @@ -3152,6 +3154,7 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-V2-Lite', LoRATM.deepseek2, TemplateType.default_generation, + support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], @@ -3161,6 +3164,7 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-V2-Lite-Chat', LoRATM.deepseek2, TemplateType.deepseek2, + support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], @@ -3170,6 +3174,7 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-V2-Chat', LoRATM.deepseek2, TemplateType.deepseek2, + support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], From 46fe8915791abb40ac8e5686d548c2617e28c88e Mon 
Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:00:26 +0800 Subject: [PATCH 06/12] add doc --- README.md | 1 + README_CN.md | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a3d17def..363a07979 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ SWIFT has rich documentations for users, please check [here](https://github.com/ SWIFT web-ui is available both on [Huggingface space](https://huggingface.co/spaces/tastelikefeet/swift) and [ModelScope studio](https://www.modelscope.cn/studios/iic/Scalable-lightWeight-Infrastructure-for-Fine-Tuning/summary), please feel free to try! ## 🎉 News +- 🔥2024.06.18: Supoprts **DeepSeek-Coder-v2** series model! Use model_type `deepseek-coder-v2-instruct` and `deepseek-coder-v2-lite-instruct` to begin. - 🔥2024.06.16: Supoprts **KTO** and **CPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Human-Preference-Alignment-Training-Documentation.md) to start training! - 2024.06.11: Support for tool-calling agent deployment that conform to the OpenAI interface.You can refer to [Agent deployment best practice](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Agent-deployment-best-practice.md) - 🔥2024.06.07: Support **Qwen2** series LLM, including Base and Instruct models of 0.5B, 1.5B, 7B, and 72B, as well as corresponding quantized versions gptq-int4, gptq-int8, and awq-int4. The best practice for self-cognition fine-tuning, inference and deployment of Qwen2-72B-Instruct using dual-card 80GiB A100 can be found [here](https://github.com/modelscope/swift/issues/1092). 
diff --git a/README_CN.md b/README_CN.md index daf21973f..58d122fbd 100644 --- a/README_CN.md +++ b/README_CN.md @@ -48,7 +48,8 @@ SWIFT具有丰富的文档体系,如有使用问题请请查看[这里](https: 可以在[Huggingface space](https://huggingface.co/spaces/tastelikefeet/swift) 和 [ModelScope创空间](https://www.modelscope.cn/studios/iic/Scalable-lightWeight-Infrastructure-for-Fine-Tuning/summary) 中体验SWIFT web-ui功能了。 ## 🎉 新闻 -- 🔥2024.06.16: 支持**KTO**和**CPO**训练,使用`swift rlhf --rlhf_type kto`和`swift rlhf --rlhf_type cpo`来开始训练,可以参考[文档](./docs/source/LLM/人类偏好对齐训练文档.md) +- 🔥2024.06.18: 支持**DeepSeek-Coder-v2**系列模型! 使用model_type`deepseek-coder-v2-instruct`和`deepseek-coder-v2-lite-instruct`来开启训练和推理. +- 🔥2024.06.16: 支持**KTO**和**CPO**训练,使用`swift rlhf --rlhf_type kto`和`swift rlhf --rlhf_type cpo`来开始训练,可以参考[文档](./docs/source/LLM/人类偏好对齐训练文档.md). - 2024.06.11: 支持符合OpenAI接口的工具调用Agent部署, 可以查看[Agent部署最佳实践](docs/source/LLM/Agent部署最佳实践.md). - 🔥2024.06.07: 支持**Qwen2**系列LLM, 包括0.5B、1.5B、7B、72B的Base和Instruct模型, 以及对应的gptq-int4、gptq-int8、awq-int4量化版本. 使用双卡80GiB A100对Qwen2-72B-Instruct进行自我认知微调并推理部署的最佳实践可以查看[这里](https://github.com/modelscope/swift/issues/1092). - 🔥2024.06.05: 支持glm4系列大模型和glm4v-9b-chat多模态大模型, 可以查看[glm4v最佳实践](docs/source/Multi-Modal/glm4v最佳实践.md). From 3dbc3eac14a7873cf9f2574f2407fcb6de175f37 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:01:20 +0800 Subject: [PATCH 07/12] fix typo --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 363a07979..389cc7157 100644 --- a/README.md +++ b/README.md @@ -47,12 +47,12 @@ SWIFT has rich documentations for users, please check [here](https://github.com/ SWIFT web-ui is available both on [Huggingface space](https://huggingface.co/spaces/tastelikefeet/swift) and [ModelScope studio](https://www.modelscope.cn/studios/iic/Scalable-lightWeight-Infrastructure-for-Fine-Tuning/summary), please feel free to try! ## 🎉 News -- 🔥2024.06.18: Supoprts **DeepSeek-Coder-v2** series model! 
Use model_type `deepseek-coder-v2-instruct` and `deepseek-coder-v2-lite-instruct` to begin. -- 🔥2024.06.16: Supoprts **KTO** and **CPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Human-Preference-Alignment-Training-Documentation.md) to start training! +- 🔥2024.06.18: Supports **DeepSeek-Coder-v2** series model! Use model_type `deepseek-coder-v2-instruct` and `deepseek-coder-v2-lite-instruct` to begin. +- 🔥2024.06.16: Supports **KTO** and **CPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Human-Preference-Alignment-Training-Documentation.md) to start training! - 2024.06.11: Support for tool-calling agent deployment that conform to the OpenAI interface.You can refer to [Agent deployment best practice](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Agent-deployment-best-practice.md) - 🔥2024.06.07: Support **Qwen2** series LLM, including Base and Instruct models of 0.5B, 1.5B, 7B, and 72B, as well as corresponding quantized versions gptq-int4, gptq-int8, and awq-int4. The best practice for self-cognition fine-tuning, inference and deployment of Qwen2-72B-Instruct using dual-card 80GiB A100 can be found [here](https://github.com/modelscope/swift/issues/1092). - 🔥2024.06.05: Support for **glm4** series LLM and glm4v-9b-chat MLLM. You can refer to [glm4v best practice](docs/source_en/Multi-Modal/glm4v-best-practice.md). -- 🔥2024.06.01: Supoprts **SimPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/SimPO.md) to start training! +- 🔥2024.06.01: Supports **SimPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/SimPO.md) to start training! - 🔥2024.06.01: Support for deploying large multimodal models, please refer to the [Multimodal Deployment Documentation](docs/source_en/Multi-Modal/mutlimodal-deployment.md) for more information. 
- 2024.05.31: Supports Mini-Internvl model, Use model_type `mini-internvl-chat-2b-v1_5` and `mini-internvl-chat-4b-v1_5`to train. - 2024.05.24: Supports Phi3-vision model, Use model_type `phi3-vision-128k-instruct` to train. From a170203c3bc55a03669a7b7b7f659d35475e6bab Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Tue, 18 Jun 2024 17:02:30 +0800 Subject: [PATCH 08/12] add docs --- ...\213\345\222\214\346\225\260\346\215\256\351\233\206.md" | 6 ++++-- docs/source_en/LLM/Supported-models-datasets.md | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 155e44100..6a87b5d37 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -204,6 +204,8 @@ |deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)| |deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)| |deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, 
v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)| +|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| +|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| |deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)| |deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)| |deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)| @@ -283,7 +285,7 @@ |phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|✔|✔||coding|[microsoft/phi-2](https://huggingface.co/microsoft/phi-2)| 
|phi3-4b-4k-instruct|[LLM-Research/Phi-3-mini-4k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-4k-instruct/summary)|qkv_proj|phi3|✔|✘|transformers>=4.36|general|[microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)| |phi3-4b-128k-instruct|[LLM-Research/Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)| -|phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| +|phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|query_key_value|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| |phi3-medium-128k-instruct|[LLM-Research/Phi-3-medium-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-medium-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)| |mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)| |mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)| @@ -308,7 +310,7 @@ | 
Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID | | --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- | |qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwenvl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| |qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md index 5cb17c8a0..2876d5dd5 100644 --- a/docs/source_en/LLM/Supported-models-datasets.md +++ b/docs/source_en/LLM/Supported-models-datasets.md @@ -204,6 +204,8 @@ The table below introcudes all models supported by SWIFT: |deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, 
v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)| |deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)| |deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)| +|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| +|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| |deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)| |deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)| 
|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)| @@ -283,7 +285,7 @@ The table below introcudes all models supported by SWIFT: |phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|✔|✔||coding|[microsoft/phi-2](https://huggingface.co/microsoft/phi-2)| |phi3-4b-4k-instruct|[LLM-Research/Phi-3-mini-4k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-4k-instruct/summary)|qkv_proj|phi3|✔|✘|transformers>=4.36|general|[microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)| |phi3-4b-128k-instruct|[LLM-Research/Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)| -|phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| +|phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|query_key_value|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| |phi3-medium-128k-instruct|[LLM-Research/Phi-3-medium-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-medium-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)| 
|mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)| |mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)| @@ -308,7 +310,7 @@ The table below introcudes all models supported by SWIFT: | Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID | | --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- | |qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwenvl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| |qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| 
|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| From 743ba27cd8d6266b56082743322736d9af6aa48d Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:03:59 +0800 Subject: [PATCH 09/12] fix doc --- README.md | 74 ++++++++++++++++++++++++++-------------------------- README_CN.md | 2 +- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 389cc7157..0f4efd02a 100644 --- a/README.md +++ b/README.md @@ -498,44 +498,44 @@ The complete list of supported models and datasets can be found at [Supported Mo #### LLMs -| Model Type | Model Introduction | Language | Model Size | Model Type | +| Model Type | Model Introduction | Language | Model Size | Model Type | |------------------------------------------------|------------------------------------------------------------------------|--------------------|----------------------------------------|------------------------------------------- | -| Qwen
Qwen1.5
Qwen2 | [Tongyi Qwen 1.0 and 1.5 series models](https://github.com/QwenLM) | Chinese
English | 0.5B-110B
including quantized versions | base model
chat model
MoE model
code model | -| ChatGLM2
ChatGLM3
Codegeex2
GLM4 | [Zhipu ChatGLM series models](https://github.com/THUDM) | Chinese
English | 6B-9B | base model
chat model
code model
long text model | -| Baichuan/Baichuan2 | [Baichuan 1 and Baichuan 2](https://github.com/baichuan-inc) | Chinese
English | 7B-13B
including quantized versions | base model
chat model | -| Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese
English | 2B-102B | instruct model | -| XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese
English | 7B-65B | base model
chat model
long text model
MoE model | -| LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B
including quantized versions | base model
chat model | -| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B
including quantized versions | base model
chat model | -| Mistral
Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model
instruct model
MoE model | -| Yi
Yi1.5 | [01AI's YI series models](https://github.com/01-ai) | Chinese
English | 6B-34B
including quantized | base model
chat model
long text model | -| InternLM
InternLM2
InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese
English | 1.8B-20B | base model
chat model
math model | -| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2 | [DeepSeek series models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-236B | base model
chat model
MoE model
code model
math model | -| MAMBA | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English | 130M-2.8B | base model | -| Gemma | [Google Gemma series models](https://github.com/google/gemma_pytorch) | English | 2B-7B | base model
instruct model | -| MiniCPM | [OpenBmB MiniCPM series models](https://github.com/OpenBMB/MiniCPM) | Chinese
English | 2B-3B | chat model
MoE model | -| OpenBuddy | [OpenBuddy series models](https://github.com/OpenBuddy/OpenBuddy) | Chinese
English | 7B-67B | base model
chat model | -| Orion | [OrionStar AI series models](https://github.com/OrionStarAI) | Chinese
English | 14B | base model
chat model | -| BlueLM | [VIVO BlueLM large model](https://github.com/vivo-ai-lab/BlueLM) | Chinese
English | 7B | base model
chat model | -| Ziya2 | [Fengshenbang series models](https://github.com/IDEA-CCNL/Fengshenbang-LM) | Chinese
English | 13B | base model
chat model | -| Skywork | [Skywork series models](https://github.com/SkyworkAI/Skywork) | Chinese
English | 13B | base model
chat model | -| Zephyr | Zephyr series models based on Mistral | English | 7B | chat model | -| PolyLM | [Tongyi Lab self-developed PolyLM series models](https://github.com/DAMO-NLP-MT/PolyLM) | Multilingual | 13B | base model | -| SeqGPT | [Tongyi Lab self-developed text understanding model for information extraction and text classification](https://github.com/Alibaba-NLP/SeqGPT) | Chinese | 560M | semantic understanding model | -| SUS | [Southern University of Science and Technology model fine-tuned on YI](https://github.com/SUSTech-IDEA/SUS-Chat) | Chinese
English | 34B | chat model | -| Tongyi-Finance | [Tongyi finance series models](https://github.com/QwenLM/Qwen) | Chinese
English | 14B | base model
chat model
financial model | -| CodeFuse-CodeLLaMA
CodeFuse-Codegeex2
CodeFuse-Qwen | [Ant CodeFuse series models](https://github.com/codefuse-ai) | Chinese
English | 6B-34B | chat model
code model | -| phi2/phi3 | Microsoft's PHI series models | English | 3B/4B | base model
instruct model
code model | -| Grok | [X-ai](https://github.com/xai-org/grok-1) | English | 300B | base model | -| TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | Chinese
English | 7B-12B | chat model | -| dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model
chat model | -| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese
English | 13B | base model | -| c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model | -| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B
including quantized versions | chat model
MoE model | -| Atom | [Atom](https://github.com/LlamaFamily/Llama-Chinese) | Chinese | 7B| base model
chat model| -| Chinese-LLaMA-Alpaca-2 | [Chinese-LLaMA-Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) | Chinese | 1.3B-13B| base model
chat model
long text model | -| Chinese-LLaMA-Alpaca-3 | [Chinese-LLaMA-Alpaca-3](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3) | Chinese | 8B| base model
chat model| -| ModelScope-Agent | [ModelScope Agent series models](https://github.com/modelscope/modelscope-agent) | Chinese | 7B-14B| agent model | +| Qwen
Qwen1.5
Qwen2 | [Tongyi Qwen 1.0 and 1.5 series models](https://github.com/QwenLM) | Chinese
English | 0.5B-110B
including quantized versions | base model
chat model
MoE model
code model | +| ChatGLM2
ChatGLM3
Codegeex2
GLM4 | [Zhipu ChatGLM series models](https://github.com/THUDM) | Chinese
English | 6B-9B | base model
chat model
code model
long text model | +| Baichuan/Baichuan2 | [Baichuan 1 and Baichuan 2](https://github.com/baichuan-inc) | Chinese
English | 7B-13B
including quantized versions | base model
chat model | +| Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese
English | 2B-102B | instruct model | +| XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese
English | 7B-65B | base model
chat model
long text model
MoE model | +| LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B
including quantized versions | base model
chat model | +| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B
including quantized versions | base model
chat model | +| Mistral
Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model
instruct model
MoE model | +| Yi
Yi1.5 | [01AI's YI series models](https://github.com/01-ai) | Chinese
English | 6B-34B
including quantized | base model
chat model
long text model | +| InternLM
InternLM2
InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese
English | 1.8B-20B | base model
chat model
math model | +| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2
DeepSeek-Coder-V2 | [DeepSeek series models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-236B | base model
chat model
MoE model
code model
math model | +| MAMBA | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English | 130M-2.8B | base model | +| Gemma | [Google Gemma series models](https://github.com/google/gemma_pytorch) | English | 2B-7B | base model
instruct model | +| MiniCPM | [OpenBmB MiniCPM series models](https://github.com/OpenBMB/MiniCPM) | Chinese
English | 2B-3B | chat model
MoE model | +| OpenBuddy | [OpenBuddy series models](https://github.com/OpenBuddy/OpenBuddy) | Chinese
English | 7B-67B | base model
chat model | +| Orion | [OrionStar AI series models](https://github.com/OrionStarAI) | Chinese
English | 14B | base model
chat model | +| BlueLM | [VIVO BlueLM large model](https://github.com/vivo-ai-lab/BlueLM) | Chinese
English | 7B | base model
chat model | +| Ziya2 | [Fengshenbang series models](https://github.com/IDEA-CCNL/Fengshenbang-LM) | Chinese
English | 13B | base model
chat model | +| Skywork | [Skywork series models](https://github.com/SkyworkAI/Skywork) | Chinese
English | 13B | base model
chat model | +| Zephyr | Zephyr series models based on Mistral | English | 7B | chat model | +| PolyLM | [Tongyi Lab self-developed PolyLM series models](https://github.com/DAMO-NLP-MT/PolyLM) | Multilingual | 13B | base model | +| SeqGPT | [Tongyi Lab self-developed text understanding model for information extraction and text classification](https://github.com/Alibaba-NLP/SeqGPT) | Chinese | 560M | semantic understanding model | +| SUS | [Southern University of Science and Technology model fine-tuned on YI](https://github.com/SUSTech-IDEA/SUS-Chat) | Chinese
English | 34B | chat model | +| Tongyi-Finance | [Tongyi finance series models](https://github.com/QwenLM/Qwen) | Chinese
English | 14B | base model
chat model
financial model | +| CodeFuse-CodeLLaMA
CodeFuse-Codegeex2
CodeFuse-Qwen | [Ant CodeFuse series models](https://github.com/codefuse-ai) | Chinese
English | 6B-34B | chat model
code model | +| phi2/phi3 | Microsoft's PHI series models | English | 3B/4B | base model
instruct model
code model | +| Grok | [X-ai](https://github.com/xai-org/grok-1) | English | 300B | base model | +| TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | Chinese
English | 7B-12B | chat model | +| dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model
chat model | +| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese
English | 13B | base model | +| c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model | +| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B
including quantized versions | chat model
MoE model | +| Atom | [Atom](https://github.com/LlamaFamily/Llama-Chinese) | Chinese | 7B| base model
chat model| +| Chinese-LLaMA-Alpaca-2 | [Chinese-LLaMA-Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) | Chinese | 1.3B-13B| base model
chat model
long text model | +| Chinese-LLaMA-Alpaca-3 | [Chinese-LLaMA-Alpaca-3](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3) | Chinese | 8B| base model
chat model| +| ModelScope-Agent | [ModelScope Agent series models](https://github.com/modelscope/modelscope-agent) | Chinese | 7B-14B| agent model | #### MLLMs diff --git a/README_CN.md b/README_CN.md index 58d122fbd..6c7265fc3 100644 --- a/README_CN.md +++ b/README_CN.md @@ -506,7 +506,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | Mistral
Mixtral | [Mistral系列模型](https://github.com/mistralai/mistral-src) | 英文 | 7B-8x22B | base模型
instruct模型
MoE模型 | | Yi
Yi1.5 | [01AI的YI系列模型](https://github.com/01-ai) | 中文
英文 | 6B-34B
包含量化版本 | base模型
chat模型
长文本模型 | | InternLM
InternLM2
InternLM2-Math | [浦江实验室书生浦语系列模型](https://github.com/InternLM/InternLM) | 中文
英文 | 1.8B-20B | base模型
chat模型
数学模型 | -| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2 | [幻方系列模型](https://github.com/deepseek-ai) | 中文
英文 | 1.3B-236B | base模型
chat模型
MoE模型
代码模型
数学模型 | +| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2
DeepSeek-Coder-V2 | [幻方系列模型](https://github.com/deepseek-ai) | 中文
英文 | 1.3B-236B | base模型
chat模型
MoE模型
代码模型
数学模型 | | MAMBA | [MAMBA时序卷积模型](https://github.com/state-spaces/mamba) | 英文 | 130M-2.8B | base模型 | | Gemma | [Google Gemma系列模型](https://github.com/google/gemma_pytorch) | 英文 | 2B-7B | base模型
instruct模型 | | MiniCPM | [OpenBmB MiniCPM系列模型](https://github.com/OpenBMB/MiniCPM) | 中文
英文 | 2B-3B | chat模型
MoE模型 | From f95ae9e7a20c759a4e1fbac08497bc548597914c Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:06:03 +0800 Subject: [PATCH 10/12] fix --- swift/llm/utils/template.py | 1 - 1 file changed, 1 deletion(-) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 57b75e0b9..f09d4c4fb 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -64,7 +64,6 @@ class TemplateType: sus = 'sus' deepseek = 'deepseek' deepseek_coder = 'deepseek-coder' - deepseek_coder2 = 'deepseek-coder2' deepseek_vl = 'deepseek-vl' deepseek2 = 'deepseek2' codefuse_codellama = 'codefuse-codellama' From 725aea671287c94344b98b66b3302a87d1fb5b33 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:06:55 +0800 Subject: [PATCH 11/12] fix doc --- README.md | 74 +++++++++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 0f4efd02a..93ff61ec1 100644 --- a/README.md +++ b/README.md @@ -498,44 +498,44 @@ The complete list of supported models and datasets can be found at [Supported Mo #### LLMs -| Model Type | Model Introduction | Language | Model Size | Model Type | +| Model Type | Model Introduction | Language | Model Size | Model Type | |------------------------------------------------|------------------------------------------------------------------------|--------------------|----------------------------------------|------------------------------------------- | -| Qwen
Qwen1.5
Qwen2 | [Tongyi Qwen 1.0 and 1.5 series models](https://github.com/QwenLM) | Chinese
English | 0.5B-110B
including quantized versions | base model
chat model
MoE model
code model | -| ChatGLM2
ChatGLM3
Codegeex2
GLM4 | [Zhipu ChatGLM series models](https://github.com/THUDM) | Chinese
English | 6B-9B | base model
chat model
code model
long text model | -| Baichuan/Baichuan2 | [Baichuan 1 and Baichuan 2](https://github.com/baichuan-inc) | Chinese
English | 7B-13B
including quantized versions | base model
chat model | -| Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese
English | 2B-102B | instruct model | -| XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese
English | 7B-65B | base model
chat model
long text model
MoE model | -| LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B
including quantized versions | base model
chat model | -| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B
including quantized versions | base model
chat model | -| Mistral
Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model
instruct model
MoE model | -| Yi
Yi1.5 | [01AI's YI series models](https://github.com/01-ai) | Chinese
English | 6B-34B
including quantized | base model
chat model
long text model | -| InternLM
InternLM2
InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese
English | 1.8B-20B | base model
chat model
math model | -| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2
DeepSeek-Coder-V2 | [DeepSeek series models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-236B | base model
chat model
MoE model
code model
math model | -| MAMBA | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English | 130M-2.8B | base model | -| Gemma | [Google Gemma series models](https://github.com/google/gemma_pytorch) | English | 2B-7B | base model
instruct model | -| MiniCPM | [OpenBmB MiniCPM series models](https://github.com/OpenBMB/MiniCPM) | Chinese
English | 2B-3B | chat model
MoE model | -| OpenBuddy | [OpenBuddy series models](https://github.com/OpenBuddy/OpenBuddy) | Chinese
English | 7B-67B | base model
chat model | -| Orion | [OrionStar AI series models](https://github.com/OrionStarAI) | Chinese
English | 14B | base model
chat model | -| BlueLM | [VIVO BlueLM large model](https://github.com/vivo-ai-lab/BlueLM) | Chinese
English | 7B | base model
chat model | -| Ziya2 | [Fengshenbang series models](https://github.com/IDEA-CCNL/Fengshenbang-LM) | Chinese
English | 13B | base model
chat model | -| Skywork | [Skywork series models](https://github.com/SkyworkAI/Skywork) | Chinese
English | 13B | base model
chat model | -| Zephyr | Zephyr series models based on Mistral | English | 7B | chat model | -| PolyLM | [Tongyi Lab self-developed PolyLM series models](https://github.com/DAMO-NLP-MT/PolyLM) | Multilingual | 13B | base model | -| SeqGPT | [Tongyi Lab self-developed text understanding model for information extraction and text classification](https://github.com/Alibaba-NLP/SeqGPT) | Chinese | 560M | semantic understanding model | -| SUS | [Southern University of Science and Technology model fine-tuned on YI](https://github.com/SUSTech-IDEA/SUS-Chat) | Chinese
English | 34B | chat model | -| Tongyi-Finance | [Tongyi finance series models](https://github.com/QwenLM/Qwen) | Chinese
English | 14B | base model
chat model
financial model | -| CodeFuse-CodeLLaMA
CodeFuse-Codegeex2
CodeFuse-Qwen | [Ant CodeFuse series models](https://github.com/codefuse-ai) | Chinese
English | 6B-34B | chat model
code model | -| phi2/phi3 | Microsoft's PHI series models | English | 3B/4B | base model
instruct model
code model | -| Grok | [X-ai](https://github.com/xai-org/grok-1) | English | 300B | base model | -| TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | Chinese
English | 7B-12B | chat model | -| dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model
chat model | -| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese
English | 13B | base model | -| c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model | -| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B
including quantized versions | chat model
MoE model | -| Atom | [Atom](https://github.com/LlamaFamily/Llama-Chinese) | Chinese | 7B| base model
chat model| -| Chinese-LLaMA-Alpaca-2 | [Chinese-LLaMA-Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) | Chinese | 1.3B-13B| base model
chat model
long text model | -| Chinese-LLaMA-Alpaca-3 | [Chinese-LLaMA-Alpaca-3](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3) | Chinese | 8B| base model
chat model| -| ModelScope-Agent | [ModelScope Agent series models](https://github.com/modelscope/modelscope-agent) | Chinese | 7B-14B| agent model | +| Qwen
Qwen1.5
Qwen2 | [Tongyi Qwen series models](https://github.com/QwenLM) | Chinese<br>
English | 0.5B-110B
including quantized versions | base model
chat model
MoE model
code model | +| ChatGLM2
ChatGLM3
Codegeex2
GLM4 | [Zhipu ChatGLM series models](https://github.com/THUDM) | Chinese
English | 6B-9B | base model
chat model
code model
long text model | +| Baichuan/Baichuan2 | [Baichuan 1 and Baichuan 2](https://github.com/baichuan-inc) | Chinese
English | 7B-13B
including quantized versions | base model
chat model | +| Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese
English | 2B-102B | instruct model | +| XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese
English | 7B-65B | base model
chat model
long text model
MoE model | +| LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B
including quantized versions | base model
chat model | +| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B
including quantized versions | base model
chat model | +| Mistral
Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-8x22B | base model<br>
instruct model
MoE model | +| Yi
Yi1.5 | [01AI's Yi series models](https://github.com/01-ai) | Chinese<br>
English | 6B-34B
including quantized versions | base model<br>
chat model
long text model | +| InternLM
InternLM2
InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese
English | 1.8B-20B | base model
chat model
math model | +| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2
DeepSeek-Coder-V2 | [DeepSeek series models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-236B | base model
chat model
MoE model
code model
math model | +| MAMBA | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English | 130M-2.8B | base model | +| Gemma | [Google Gemma series models](https://github.com/google/gemma_pytorch) | English | 2B-7B | base model
instruct model | +| MiniCPM | [OpenBMB MiniCPM series models](https://github.com/OpenBMB/MiniCPM) | Chinese<br>
English | 2B-3B | chat model
MoE model | +| OpenBuddy | [OpenBuddy series models](https://github.com/OpenBuddy/OpenBuddy) | Chinese
English | 7B-67B | base model
chat model | +| Orion | [OrionStar AI series models](https://github.com/OrionStarAI) | Chinese
English | 14B | base model
chat model | +| BlueLM | [VIVO BlueLM large model](https://github.com/vivo-ai-lab/BlueLM) | Chinese
English | 7B | base model
chat model | +| Ziya2 | [Fengshenbang series models](https://github.com/IDEA-CCNL/Fengshenbang-LM) | Chinese
English | 13B | base model
chat model | +| Skywork | [Skywork series models](https://github.com/SkyworkAI/Skywork) | Chinese
English | 13B | base model
chat model | +| Zephyr | Zephyr series models based on Mistral | English | 7B | chat model | +| PolyLM | [Tongyi Lab self-developed PolyLM series models](https://github.com/DAMO-NLP-MT/PolyLM) | Multilingual | 13B | base model | +| SeqGPT | [Tongyi Lab self-developed text understanding model for information extraction and text classification](https://github.com/Alibaba-NLP/SeqGPT) | Chinese | 560M | semantic understanding model | +| SUS | [Southern University of Science and Technology model fine-tuned on Yi](https://github.com/SUSTech-IDEA/SUS-Chat) | Chinese<br>
English | 34B | chat model | +| Tongyi-Finance | [Tongyi finance series models](https://github.com/QwenLM/Qwen) | Chinese
English | 14B | base model
chat model
financial model | +| CodeFuse-CodeLLaMA
CodeFuse-Codegeex2
CodeFuse-Qwen | [Ant CodeFuse series models](https://github.com/codefuse-ai) | Chinese
English | 6B-34B | chat model
code model | +| phi2/phi3 | Microsoft's PHI series models | English | 3B/4B | base model
instruct model
code model | +| Grok | [X-ai](https://github.com/xai-org/grok-1) | English | 300B | base model | +| TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | Chinese
English | 7B-12B | chat model | +| dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model
chat model | +| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese
English | 13B | base model | +| c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model | +| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B
including quantized versions | chat model
MoE model | +| Atom | [Atom](https://github.com/LlamaFamily/Llama-Chinese) | Chinese | 7B| base model
chat model| +| Chinese-LLaMA-Alpaca-2 | [Chinese-LLaMA-Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) | Chinese | 1.3B-13B| base model
chat model
long text model | +| Chinese-LLaMA-Alpaca-3 | [Chinese-LLaMA-Alpaca-3](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3) | Chinese | 8B| base model
chat model| +| ModelScope-Agent | [ModelScope Agent series models](https://github.com/modelscope/modelscope-agent) | Chinese | 7B-14B| agent model | #### MLLMs From 8426c8b9b3ff8e1a13afdc094f39ac301943ce24 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:12:19 +0800 Subject: [PATCH 12/12] fix lint --- swift/llm/utils/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 590fdd5e7..25c0db3b5 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -271,7 +271,7 @@ class ModelType: deepseek_coder_6_7b_instruct = 'deepseek-coder-6_7b-instruct' deepseek_coder_33b = 'deepseek-coder-33b' deepseek_coder_33b_instruct = 'deepseek-coder-33b-instruct' - #deepseek2-coder + # deepseek2-coder deepseek_coder_v2_instruct = 'deepseek-coder-v2-instruct' deepseek_coder_v2_lite_instruct = 'deepseek-coder-v2-lite-instruct' # deepseek-math