From 9fa6fdb7aaf234a4f5859e0ab08d219e4fb775ea Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 14:58:18 +0800 Subject: [PATCH 01/12] add deepseek coder2 --- swift/llm/utils/model.py | 23 +++++++++++++++++++++++ swift/llm/utils/template.py | 8 ++++++++ 2 files changed, 31 insertions(+) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 735099f33..89109f959 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -271,6 +271,9 @@ class ModelType: deepseek_coder_6_7b_instruct = 'deepseek-coder-6_7b-instruct' deepseek_coder_33b = 'deepseek-coder-33b' deepseek_coder_33b_instruct = 'deepseek-coder-33b-instruct' + #deepseek2-coder + deepseek_coder_v2_instruct = 'deepseek-coder-v2-instruct' + deepseek_coder_v2_lite_instruct = 'deepseek-coder-v2-lite-instruct' # deepseek-math deepseek_math_7b = 'deepseek-math-7b' deepseek_math_7b_instruct = 'deepseek-math-7b-instruct' @@ -1860,6 +1863,26 @@ def _output_device_map_hook(module, input, output): support_vllm=True, tags=['coding'], hf_model_id='deepseek-ai/deepseek-coder-33b-instruct') +@register_model( + ModelType.deepseek_coder_v2_instruct, + 'deepseek-ai/DeepSeek-Coder-V2-Instruct', + LoRATM.llama, + TemplateType.deepseek_coder2, + eos_token='<|end▁of▁sentence|>', + support_flash_attn=True, + support_vllm=True, + tags=['coding'], + hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Instruct') +@register_model( + ModelType.deepseek_coder_v2_lite_instruct, + 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', + LoRATM.llama, + TemplateType.deepseek_coder2, + eos_token='<|end▁of▁sentence|>', + support_flash_attn=True, + support_vllm=True, + tags=['coding'], + hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct') @register_model( ModelType.openbuddy_deepseek_67b_chat, 'OpenBuddy/openbuddy-deepseek-67b-v15.2', diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 046753bc1..74d4500d3 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -64,6 
+64,7 @@ class TemplateType: sus = 'sus' deepseek = 'deepseek' deepseek_coder = 'deepseek-coder' + deepseek_coder2 = 'deepseek-coder2' deepseek_vl = 'deepseek-vl' deepseek2 = 'deepseek2' codefuse_codellama = 'codefuse-codellama' @@ -1311,6 +1312,13 @@ def __init__(self): 'and other non-computer science questions, you will refuse to answer\n'))) +register_template( + TemplateType.deepseek_coder2, + Template(['<|begin▁of▁sentence|>{{SYSTEM}}'], ['User: {{QUERY}}\n\n Assistant: \n'], + ['<|end▁of▁sentence|>'], ['<|end▁of▁sentence|>'], + None)) + + class LLavaTemplate(Template): def __init__(self): From 7fb92514eb21beb540b6e48aa7ba32075b52194b Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Tue, 18 Jun 2024 16:41:48 +0800 Subject: [PATCH 02/12] fix --- swift/llm/utils/model.py | 40 ++++++++++++++++++------------------- swift/llm/utils/template.py | 7 ------- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 89109f959..77896edc6 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -1863,26 +1863,6 @@ def _output_device_map_hook(module, input, output): support_vllm=True, tags=['coding'], hf_model_id='deepseek-ai/deepseek-coder-33b-instruct') -@register_model( - ModelType.deepseek_coder_v2_instruct, - 'deepseek-ai/DeepSeek-Coder-V2-Instruct', - LoRATM.llama, - TemplateType.deepseek_coder2, - eos_token='<|end▁of▁sentence|>', - support_flash_attn=True, - support_vllm=True, - tags=['coding'], - hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Instruct') -@register_model( - ModelType.deepseek_coder_v2_lite_instruct, - 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', - LoRATM.llama, - TemplateType.deepseek_coder2, - eos_token='<|end▁of▁sentence|>', - support_flash_attn=True, - support_vllm=True, - tags=['coding'], - hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct') @register_model( ModelType.openbuddy_deepseek_67b_chat, 'OpenBuddy/openbuddy-deepseek-67b-v15.2', @@ -3147,6 +3127,26 @@ 
def get_model_tokenizer_internlm2(model_dir: str, return model, tokenizer +@register_model( + ModelType.deepseek_coder_v2_instruct, + 'deepseek-ai/DeepSeek-Coder-V2-Instruct', + LoRATM.deepseek2, + TemplateType.deepseek2, + eos_token='<|end▁of▁sentence|>', + support_flash_attn=True, + support_vllm=True, + tags=['coding'], + hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Instruct') +@register_model( + ModelType.deepseek_coder_v2_lite_instruct, + 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', + LoRATM.deepseek2, + TemplateType.deepseek2, + eos_token='<|end▁of▁sentence|>', + support_flash_attn=True, + support_vllm=True, + tags=['coding'], + hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct') @register_model( ModelType.deepseek_v2_lite, 'deepseek-ai/DeepSeek-V2-Lite', diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 74d4500d3..3f37e2e8e 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -1312,13 +1312,6 @@ def __init__(self): 'and other non-computer science questions, you will refuse to answer\n'))) -register_template( - TemplateType.deepseek_coder2, - Template(['<|begin▁of▁sentence|>{{SYSTEM}}'], ['User: {{QUERY}}\n\n Assistant: \n'], - ['<|end▁of▁sentence|>'], ['<|end▁of▁sentence|>'], - None)) - - class LLavaTemplate(Template): def __init__(self): From a8ceff299f5ed3a215a79dd1adcde952be8f7c7a Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 16:44:01 +0800 Subject: [PATCH 03/12] fix --- swift/llm/utils/model.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 77896edc6..c64446c3c 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -3132,27 +3132,26 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-Coder-V2-Instruct', LoRATM.deepseek2, TemplateType.deepseek2, - eos_token='<|end▁of▁sentence|>', support_flash_attn=True, support_vllm=True, tags=['coding'], + 
requires=['transformers>=4.39.3'], hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Instruct') @register_model( ModelType.deepseek_coder_v2_lite_instruct, 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', LoRATM.deepseek2, TemplateType.deepseek2, - eos_token='<|end▁of▁sentence|>', support_flash_attn=True, support_vllm=True, tags=['coding'], + requires=['transformers>=4.39.3'], hf_model_id='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct') @register_model( ModelType.deepseek_v2_lite, 'deepseek-ai/DeepSeek-V2-Lite', LoRATM.deepseek2, TemplateType.default_generation, - support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], @@ -3162,7 +3161,6 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-V2-Lite-Chat', LoRATM.deepseek2, TemplateType.deepseek2, - support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], @@ -3172,7 +3170,6 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-V2-Chat', LoRATM.deepseek2, TemplateType.deepseek2, - support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], From 01717453aaf53a688feb8b98af8a69e991ce28b2 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 16:57:05 +0800 Subject: [PATCH 04/12] fix --- swift/llm/utils/template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 3f37e2e8e..57b75e0b9 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -18,7 +18,7 @@ DEFAULT_SYSTEM = 'You are a helpful assistant.' 
History = List[Union[Tuple[str, str], List[str]]] -Prompt = List[Union[str, List[str]]] +Prompt = List[Union[str, List[str], List[int]]] StopWords = Prompt Context = Union[str, List[int]] TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} From 51551ce46c64cecd7f96bb086958510ae2664f61 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 16:58:06 +0800 Subject: [PATCH 05/12] fix --- swift/llm/utils/model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index c64446c3c..590fdd5e7 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -3132,6 +3132,7 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-Coder-V2-Instruct', LoRATM.deepseek2, TemplateType.deepseek2, + support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, tags=['coding'], @@ -3142,6 +3143,7 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', LoRATM.deepseek2, TemplateType.deepseek2, + support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, tags=['coding'], @@ -3152,6 +3154,7 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-V2-Lite', LoRATM.deepseek2, TemplateType.default_generation, + support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], @@ -3161,6 +3164,7 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-V2-Lite-Chat', LoRATM.deepseek2, TemplateType.deepseek2, + support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], @@ -3170,6 +3174,7 @@ def get_model_tokenizer_internlm2(model_dir: str, 'deepseek-ai/DeepSeek-V2-Chat', LoRATM.deepseek2, TemplateType.deepseek2, + support_gradient_checkpointing=False, support_flash_attn=True, support_vllm=True, requires=['transformers>=4.39.3'], From 46fe8915791abb40ac8e5686d548c2617e28c88e Mon 
Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:00:26 +0800 Subject: [PATCH 06/12] add doc --- README.md | 1 + README_CN.md | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a3d17def..363a07979 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ SWIFT has rich documentations for users, please check [here](https://github.com/ SWIFT web-ui is available both on [Huggingface space](https://huggingface.co/spaces/tastelikefeet/swift) and [ModelScope studio](https://www.modelscope.cn/studios/iic/Scalable-lightWeight-Infrastructure-for-Fine-Tuning/summary), please feel free to try! ## 🎉 News +- 🔥2024.06.18: Supoprts **DeepSeek-Coder-v2** series model! Use model_type `deepseek-coder-v2-instruct` and `deepseek-coder-v2-lite-instruct` to begin. - 🔥2024.06.16: Supoprts **KTO** and **CPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Human-Preference-Alignment-Training-Documentation.md) to start training! - 2024.06.11: Support for tool-calling agent deployment that conform to the OpenAI interface.You can refer to [Agent deployment best practice](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Agent-deployment-best-practice.md) - 🔥2024.06.07: Support **Qwen2** series LLM, including Base and Instruct models of 0.5B, 1.5B, 7B, and 72B, as well as corresponding quantized versions gptq-int4, gptq-int8, and awq-int4. The best practice for self-cognition fine-tuning, inference and deployment of Qwen2-72B-Instruct using dual-card 80GiB A100 can be found [here](https://github.com/modelscope/swift/issues/1092). 
diff --git a/README_CN.md b/README_CN.md index daf21973f..58d122fbd 100644 --- a/README_CN.md +++ b/README_CN.md @@ -48,7 +48,8 @@ SWIFT具有丰富的文档体系,如有使用问题请请查看[这里](https: 可以在[Huggingface space](https://huggingface.co/spaces/tastelikefeet/swift) 和 [ModelScope创空间](https://www.modelscope.cn/studios/iic/Scalable-lightWeight-Infrastructure-for-Fine-Tuning/summary) 中体验SWIFT web-ui功能了。 ## 🎉 新闻 -- 🔥2024.06.16: 支持**KTO**和**CPO**训练,使用`swift rlhf --rlhf_type kto`和`swift rlhf --rlhf_type cpo`来开始训练,可以参考[文档](./docs/source/LLM/人类偏好对齐训练文档.md) +- 🔥2024.06.18: 支持**DeepSeek-Coder-v2**系列模型! 使用model_type`deepseek-coder-v2-instruct`和`deepseek-coder-v2-lite-instruct`来开启训练和推理. +- 🔥2024.06.16: 支持**KTO**和**CPO**训练,使用`swift rlhf --rlhf_type kto`和`swift rlhf --rlhf_type cpo`来开始训练,可以参考[文档](./docs/source/LLM/人类偏好对齐训练文档.md). - 2024.06.11: 支持符合OpenAI接口的工具调用Agent部署, 可以查看[Agent部署最佳实践](docs/source/LLM/Agent部署最佳实践.md). - 🔥2024.06.07: 支持**Qwen2**系列LLM, 包括0.5B、1.5B、7B、72B的Base和Instruct模型, 以及对应的gptq-int4、gptq-int8、awq-int4量化版本. 使用双卡80GiB A100对Qwen2-72B-Instruct进行自我认知微调并推理部署的最佳实践可以查看[这里](https://github.com/modelscope/swift/issues/1092). - 🔥2024.06.05: 支持glm4系列大模型和glm4v-9b-chat多模态大模型, 可以查看[glm4v最佳实践](docs/source/Multi-Modal/glm4v最佳实践.md). From 3dbc3eac14a7873cf9f2574f2407fcb6de175f37 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:01:20 +0800 Subject: [PATCH 07/12] fix typo --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 363a07979..389cc7157 100644 --- a/README.md +++ b/README.md @@ -47,12 +47,12 @@ SWIFT has rich documentations for users, please check [here](https://github.com/ SWIFT web-ui is available both on [Huggingface space](https://huggingface.co/spaces/tastelikefeet/swift) and [ModelScope studio](https://www.modelscope.cn/studios/iic/Scalable-lightWeight-Infrastructure-for-Fine-Tuning/summary), please feel free to try! ## 🎉 News -- 🔥2024.06.18: Supoprts **DeepSeek-Coder-v2** series model! 
Use model_type `deepseek-coder-v2-instruct` and `deepseek-coder-v2-lite-instruct` to begin. -- 🔥2024.06.16: Supoprts **KTO** and **CPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Human-Preference-Alignment-Training-Documentation.md) to start training! +- 🔥2024.06.18: Supports **DeepSeek-Coder-v2** series model! Use model_type `deepseek-coder-v2-instruct` and `deepseek-coder-v2-lite-instruct` to begin. +- 🔥2024.06.16: Supports **KTO** and **CPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Human-Preference-Alignment-Training-Documentation.md) to start training! - 2024.06.11: Support for tool-calling agent deployment that conform to the OpenAI interface.You can refer to [Agent deployment best practice](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Agent-deployment-best-practice.md) - 🔥2024.06.07: Support **Qwen2** series LLM, including Base and Instruct models of 0.5B, 1.5B, 7B, and 72B, as well as corresponding quantized versions gptq-int4, gptq-int8, and awq-int4. The best practice for self-cognition fine-tuning, inference and deployment of Qwen2-72B-Instruct using dual-card 80GiB A100 can be found [here](https://github.com/modelscope/swift/issues/1092). - 🔥2024.06.05: Support for **glm4** series LLM and glm4v-9b-chat MLLM. You can refer to [glm4v best practice](docs/source_en/Multi-Modal/glm4v-best-practice.md). -- 🔥2024.06.01: Supoprts **SimPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/SimPO.md) to start training! +- 🔥2024.06.01: Supports **SimPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/SimPO.md) to start training! - 🔥2024.06.01: Support for deploying large multimodal models, please refer to the [Multimodal Deployment Documentation](docs/source_en/Multi-Modal/mutlimodal-deployment.md) for more information. 
- 2024.05.31: Supports Mini-Internvl model, Use model_type `mini-internvl-chat-2b-v1_5` and `mini-internvl-chat-4b-v1_5`to train. - 2024.05.24: Supports Phi3-vision model, Use model_type `phi3-vision-128k-instruct` to train. From a170203c3bc55a03669a7b7b7f659d35475e6bab Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Tue, 18 Jun 2024 17:02:30 +0800 Subject: [PATCH 08/12] add docs --- ...\213\345\222\214\346\225\260\346\215\256\351\233\206.md" | 6 ++++-- docs/source_en/LLM/Supported-models-datasets.md | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 155e44100..6a87b5d37 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -204,6 +204,8 @@ |deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)| |deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)| |deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, 
v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)| +|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| +|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| |deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)| |deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)| |deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)| @@ -283,7 +285,7 @@ |phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|✔|✔||coding|[microsoft/phi-2](https://huggingface.co/microsoft/phi-2)| 
|phi3-4b-4k-instruct|[LLM-Research/Phi-3-mini-4k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-4k-instruct/summary)|qkv_proj|phi3|✔|✘|transformers>=4.36|general|[microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)| |phi3-4b-128k-instruct|[LLM-Research/Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)| -|phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| +|phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|query_key_value|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| |phi3-medium-128k-instruct|[LLM-Research/Phi-3-medium-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-medium-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)| |mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)| |mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)| @@ -308,7 +310,7 @@ | 
Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID | | --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- | |qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwenvl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| |qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md index 5cb17c8a0..2876d5dd5 100644 --- a/docs/source_en/LLM/Supported-models-datasets.md +++ b/docs/source_en/LLM/Supported-models-datasets.md @@ -204,6 +204,8 @@ The table below introcudes all models supported by SWIFT: |deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, 
v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)| |deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)| |deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)| +|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| +|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| |deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)| |deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)| 
|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)| @@ -283,7 +285,7 @@ The table below introcudes all models supported by SWIFT: |phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|✔|✔||coding|[microsoft/phi-2](https://huggingface.co/microsoft/phi-2)| |phi3-4b-4k-instruct|[LLM-Research/Phi-3-mini-4k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-4k-instruct/summary)|qkv_proj|phi3|✔|✘|transformers>=4.36|general|[microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)| |phi3-4b-128k-instruct|[LLM-Research/Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)| -|phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| +|phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|query_key_value|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| |phi3-medium-128k-instruct|[LLM-Research/Phi-3-medium-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-medium-128k-instruct/summary)|qkv_proj|phi3|✔|✔|transformers>=4.36|general|[microsoft/Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)| 
|mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)| |mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)| @@ -308,7 +310,7 @@ The table below introcudes all models supported by SWIFT: | Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID | | --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- | |qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwenvl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| |qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| 
|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| From 743ba27cd8d6266b56082743322736d9af6aa48d Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:03:59 +0800 Subject: [PATCH 09/12] fix doc --- README.md | 74 ++++++++++++++++++++++++++-------------------------- README_CN.md | 2 +- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 389cc7157..0f4efd02a 100644 --- a/README.md +++ b/README.md @@ -498,44 +498,44 @@ The complete list of supported models and datasets can be found at [Supported Mo #### LLMs -| Model Type | Model Introduction | Language | Model Size | Model Type | +| Model Type | Model Introduction | Language | Model Size | Model Type | |------------------------------------------------|------------------------------------------------------------------------|--------------------|----------------------------------------|------------------------------------------- | -| Qwen
Qwen1.5
Qwen2 | [Tongyi Qwen 1.0 and 1.5 series models](https://github.com/QwenLM) | Chinese
English | 0.5B-110B
including quantized versions | base model
chat model
MoE model
code model | -| ChatGLM2
ChatGLM3
Codegeex2
GLM4 | [Zhipu ChatGLM series models](https://github.com/THUDM) | Chinese
English | 6B-9B | base model
chat model
code model
long text model | -| Baichuan/Baichuan2 | [Baichuan 1 and Baichuan 2](https://github.com/baichuan-inc) | Chinese
English | 7B-13B
including quantized versions | base model
chat model | -| Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese
English | 2B-102B | instruct model | -| XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese
English | 7B-65B | base model
chat model
long text model
MoE model | -| LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B
including quantized versions | base model
chat model | -| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B
including quantized versions | base model
chat model | -| Mistral
Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model
instruct model
MoE model | -| Yi
Yi1.5 | [01AI's YI series models](https://github.com/01-ai) | Chinese
English | 6B-34B
including quantized | base model
chat model
long text model | -| InternLM
InternLM2
InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese
English | 1.8B-20B | base model
chat model
math model | -| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2 | [DeepSeek series models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-236B | base model
chat model
MoE model
code model
math model | -| MAMBA | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English | 130M-2.8B | base model | -| Gemma | [Google Gemma series models](https://github.com/google/gemma_pytorch) | English | 2B-7B | base model
instruct model | -| MiniCPM | [OpenBmB MiniCPM series models](https://github.com/OpenBMB/MiniCPM) | Chinese
English | 2B-3B | chat model
MoE model | -| OpenBuddy | [OpenBuddy series models](https://github.com/OpenBuddy/OpenBuddy) | Chinese
English | 7B-67B | base model
chat model | -| Orion | [OrionStar AI series models](https://github.com/OrionStarAI) | Chinese
English | 14B | base model
chat model | -| BlueLM | [VIVO BlueLM large model](https://github.com/vivo-ai-lab/BlueLM) | Chinese
English | 7B | base model
chat model | -| Ziya2 | [Fengshenbang series models](https://github.com/IDEA-CCNL/Fengshenbang-LM) | Chinese
English | 13B | base model
chat model | -| Skywork | [Skywork series models](https://github.com/SkyworkAI/Skywork) | Chinese
English | 13B | base model
chat model | -| Zephyr | Zephyr series models based on Mistral | English | 7B | chat model | -| PolyLM | [Tongyi Lab self-developed PolyLM series models](https://github.com/DAMO-NLP-MT/PolyLM) | Multilingual | 13B | base model | -| SeqGPT | [Tongyi Lab self-developed text understanding model for information extraction and text classification](https://github.com/Alibaba-NLP/SeqGPT) | Chinese | 560M | semantic understanding model | -| SUS | [Southern University of Science and Technology model fine-tuned on YI](https://github.com/SUSTech-IDEA/SUS-Chat) | Chinese
English | 34B | chat model | -| Tongyi-Finance | [Tongyi finance series models](https://github.com/QwenLM/Qwen) | Chinese
English | 14B | base model
chat model
financial model | -| CodeFuse-CodeLLaMA
CodeFuse-Codegeex2
CodeFuse-Qwen | [Ant CodeFuse series models](https://github.com/codefuse-ai) | Chinese
English | 6B-34B | chat model
code model | -| phi2/phi3 | Microsoft's PHI series models | English | 3B/4B | base model
instruct model
code model | -| Grok | [X-ai](https://github.com/xai-org/grok-1) | English | 300B | base model | -| TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | Chinese
English | 7B-12B | chat model | -| dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model
chat model | -| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese
English | 13B | base model | -| c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model | -| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B
including quantized versions | chat model
MoE model | -| Atom | [Atom](https://github.com/LlamaFamily/Llama-Chinese) | Chinese | 7B| base model
chat model| -| Chinese-LLaMA-Alpaca-2 | [Chinese-LLaMA-Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) | Chinese | 1.3B-13B| base model
chat model
long text model | -| Chinese-LLaMA-Alpaca-3 | [Chinese-LLaMA-Alpaca-3](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3) | Chinese | 8B| base model
chat model| -| ModelScope-Agent | [ModelScope Agent series models](https://github.com/modelscope/modelscope-agent) | Chinese | 7B-14B| agent model | +| Qwen
Qwen1.5
Qwen2 | [Tongyi Qwen 1.0 and 1.5 series models](https://github.com/QwenLM) | Chinese
English | 0.5B-110B
including quantized versions | base model
chat model
MoE model
code model | +| ChatGLM2
ChatGLM3
Codegeex2
GLM4 | [Zhipu ChatGLM series models](https://github.com/THUDM) | Chinese
English | 6B-9B | base model
chat model
code model
long text model | +| Baichuan/Baichuan2 | [Baichuan 1 and Baichuan 2](https://github.com/baichuan-inc) | Chinese
English | 7B-13B
including quantized versions | base model
chat model | +| Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese
English | 2B-102B | instruct model | +| XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese
English | 7B-65B | base model
chat model
long text model
MoE model | +| LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B
including quantized versions | base model
chat model | +| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B
including quantized versions | base model
chat model | +| Mistral
Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model
instruct model
MoE model | +| Yi
Yi1.5 | [01AI's YI series models](https://github.com/01-ai) | Chinese
English | 6B-34B
including quantized | base model
chat model
long text model | +| InternLM
InternLM2
InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese
English | 1.8B-20B | base model
chat model
math model | +| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2
DeepSeek-Coder-V2 | [DeepSeek series models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-236B | base model
chat model
MoE model
code model
math model | +| MAMBA | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English | 130M-2.8B | base model | +| Gemma | [Google Gemma series models](https://github.com/google/gemma_pytorch) | English | 2B-7B | base model
instruct model | +| MiniCPM | [OpenBmB MiniCPM series models](https://github.com/OpenBMB/MiniCPM) | Chinese
English | 2B-3B | chat model
MoE model | +| OpenBuddy | [OpenBuddy series models](https://github.com/OpenBuddy/OpenBuddy) | Chinese
English | 7B-67B | base model
chat model | +| Orion | [OrionStar AI series models](https://github.com/OrionStarAI) | Chinese
English | 14B | base model
chat model | +| BlueLM | [VIVO BlueLM large model](https://github.com/vivo-ai-lab/BlueLM) | Chinese
English | 7B | base model
chat model | +| Ziya2 | [Fengshenbang series models](https://github.com/IDEA-CCNL/Fengshenbang-LM) | Chinese
English | 13B | base model
chat model | +| Skywork | [Skywork series models](https://github.com/SkyworkAI/Skywork) | Chinese
English | 13B | base model
chat model | +| Zephyr | Zephyr series models based on Mistral | English | 7B | chat model | +| PolyLM | [Tongyi Lab self-developed PolyLM series models](https://github.com/DAMO-NLP-MT/PolyLM) | Multilingual | 13B | base model | +| SeqGPT | [Tongyi Lab self-developed text understanding model for information extraction and text classification](https://github.com/Alibaba-NLP/SeqGPT) | Chinese | 560M | semantic understanding model | +| SUS | [Southern University of Science and Technology model fine-tuned on YI](https://github.com/SUSTech-IDEA/SUS-Chat) | Chinese
English | 34B | chat model | +| Tongyi-Finance | [Tongyi finance series models](https://github.com/QwenLM/Qwen) | Chinese
English | 14B | base model
chat model
financial model | +| CodeFuse-CodeLLaMA
CodeFuse-Codegeex2
CodeFuse-Qwen | [Ant CodeFuse series models](https://github.com/codefuse-ai) | Chinese
English | 6B-34B | chat model
code model | +| phi2/phi3 | Microsoft's PHI series models | English | 3B/4B | base model
instruct model
code model | +| Grok | [X-ai](https://github.com/xai-org/grok-1) | English | 300B | base model | +| TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | Chinese
English | 7B-12B | chat model | +| dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model
chat model | +| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese
English | 13B | base model | +| c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model | +| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B
including quantized versions | chat model
MoE model | +| Atom | [Atom](https://github.com/LlamaFamily/Llama-Chinese) | Chinese | 7B| base model
chat model| +| Chinese-LLaMA-Alpaca-2 | [Chinese-LLaMA-Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) | Chinese | 1.3B-13B| base model
chat model
long text model | +| Chinese-LLaMA-Alpaca-3 | [Chinese-LLaMA-Alpaca-3](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3) | Chinese | 8B| base model
chat model| +| ModelScope-Agent | [ModelScope Agent series models](https://github.com/modelscope/modelscope-agent) | Chinese | 7B-14B| agent model | #### MLLMs diff --git a/README_CN.md b/README_CN.md index 58d122fbd..6c7265fc3 100644 --- a/README_CN.md +++ b/README_CN.md @@ -506,7 +506,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | Mistral
Mixtral | [Mistral系列模型](https://github.com/mistralai/mistral-src) | 英文 | 7B-8x22B | base模型
instruct模型
MoE模型 | | Yi
Yi1.5 | [01AI的YI系列模型](https://github.com/01-ai) | 中文
英文 | 6B-34B
包含量化版本 | base模型
chat模型
长文本模型 | | InternLM
InternLM2
InternLM2-Math | [浦江实验室书生浦语系列模型](https://github.com/InternLM/InternLM) | 中文
英文 | 1.8B-20B | base模型
chat模型
数学模型 | -| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2 | [幻方系列模型](https://github.com/deepseek-ai) | 中文
英文 | 1.3B-236B | base模型
chat模型
MoE模型
代码模型
数学模型 | +| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2
DeepSeek-Coder-V2 | [幻方系列模型](https://github.com/deepseek-ai) | 中文
英文 | 1.3B-236B | base模型
chat模型
MoE模型
代码模型
数学模型 | | MAMBA | [MAMBA时序卷积模型](https://github.com/state-spaces/mamba) | 英文 | 130M-2.8B | base模型 | | Gemma | [Google Gemma系列模型](https://github.com/google/gemma_pytorch) | 英文 | 2B-7B | base模型
instruct模型 | | MiniCPM | [OpenBmB MiniCPM系列模型](https://github.com/OpenBMB/MiniCPM) | 中文
英文 | 2B-3B | chat模型
MoE模型 | From f95ae9e7a20c759a4e1fbac08497bc548597914c Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:06:03 +0800 Subject: [PATCH 10/12] fix --- swift/llm/utils/template.py | 1 - 1 file changed, 1 deletion(-) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 57b75e0b9..f09d4c4fb 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -64,7 +64,6 @@ class TemplateType: sus = 'sus' deepseek = 'deepseek' deepseek_coder = 'deepseek-coder' - deepseek_coder2 = 'deepseek-coder2' deepseek_vl = 'deepseek-vl' deepseek2 = 'deepseek2' codefuse_codellama = 'codefuse-codellama' From 725aea671287c94344b98b66b3302a87d1fb5b33 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:06:55 +0800 Subject: [PATCH 11/12] fix doc --- README.md | 74 +++++++++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 0f4efd02a..93ff61ec1 100644 --- a/README.md +++ b/README.md @@ -498,44 +498,44 @@ The complete list of supported models and datasets can be found at [Supported Mo #### LLMs -| Model Type | Model Introduction | Language | Model Size | Model Type | +| Model Type | Model Introduction | Language | Model Size | Model Type | |------------------------------------------------|------------------------------------------------------------------------|--------------------|----------------------------------------|------------------------------------------- | -| Qwen
Qwen1.5
Qwen2 | [Tongyi Qwen 1.0 and 1.5 series models](https://github.com/QwenLM) | Chinese
English | 0.5B-110B
including quantized versions | base model
chat model
MoE model
code model | -| ChatGLM2
ChatGLM3
Codegeex2
GLM4 | [Zhipu ChatGLM series models](https://github.com/THUDM) | Chinese
English | 6B-9B | base model
chat model
code model
long text model | -| Baichuan/Baichuan2 | [Baichuan 1 and Baichuan 2](https://github.com/baichuan-inc) | Chinese
English | 7B-13B
including quantized versions | base model
chat model | -| Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese
English | 2B-102B | instruct model | -| XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese
English | 7B-65B | base model
chat model
long text model
MoE model | -| LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B
including quantized versions | base model
chat model | -| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B
including quantized versions | base model
chat model | -| Mistral
Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model
instruct model
MoE model | -| Yi
Yi1.5 | [01AI's YI series models](https://github.com/01-ai) | Chinese
English | 6B-34B
including quantized | base model
chat model
long text model | -| InternLM
InternLM2
InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese
English | 1.8B-20B | base model
chat model
math model | -| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2
DeepSeek-Coder-V2 | [DeepSeek series models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-236B | base model
chat model
MoE model
code model
math model | -| MAMBA | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English | 130M-2.8B | base model | -| Gemma | [Google Gemma series models](https://github.com/google/gemma_pytorch) | English | 2B-7B | base model
instruct model | -| MiniCPM | [OpenBmB MiniCPM series models](https://github.com/OpenBMB/MiniCPM) | Chinese
English | 2B-3B | chat model
MoE model | -| OpenBuddy | [OpenBuddy series models](https://github.com/OpenBuddy/OpenBuddy) | Chinese
English | 7B-67B | base model
chat model | -| Orion | [OrionStar AI series models](https://github.com/OrionStarAI) | Chinese
English | 14B | base model
chat model | -| BlueLM | [VIVO BlueLM large model](https://github.com/vivo-ai-lab/BlueLM) | Chinese
English | 7B | base model
chat model | -| Ziya2 | [Fengshenbang series models](https://github.com/IDEA-CCNL/Fengshenbang-LM) | Chinese
English | 13B | base model
chat model | -| Skywork | [Skywork series models](https://github.com/SkyworkAI/Skywork) | Chinese
English | 13B | base model
chat model | -| Zephyr | Zephyr series models based on Mistral | English | 7B | chat model | -| PolyLM | [Tongyi Lab self-developed PolyLM series models](https://github.com/DAMO-NLP-MT/PolyLM) | Multilingual | 13B | base model | -| SeqGPT | [Tongyi Lab self-developed text understanding model for information extraction and text classification](https://github.com/Alibaba-NLP/SeqGPT) | Chinese | 560M | semantic understanding model | -| SUS | [Southern University of Science and Technology model fine-tuned on YI](https://github.com/SUSTech-IDEA/SUS-Chat) | Chinese
English | 34B | chat model | -| Tongyi-Finance | [Tongyi finance series models](https://github.com/QwenLM/Qwen) | Chinese
English | 14B | base model
chat model
financial model | -| CodeFuse-CodeLLaMA
CodeFuse-Codegeex2
CodeFuse-Qwen | [Ant CodeFuse series models](https://github.com/codefuse-ai) | Chinese
English | 6B-34B | chat model
code model | -| phi2/phi3 | Microsoft's PHI series models | English | 3B/4B | base model
instruct model
code model | -| Grok | [X-ai](https://github.com/xai-org/grok-1) | English | 300B | base model | -| TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | Chinese
English | 7B-12B | chat model | -| dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model
chat model | -| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese
English | 13B | base model | -| c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model | -| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B
including quantized versions | chat model
MoE model | -| Atom | [Atom](https://github.com/LlamaFamily/Llama-Chinese) | Chinese | 7B| base model
chat model| -| Chinese-LLaMA-Alpaca-2 | [Chinese-LLaMA-Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) | Chinese | 1.3B-13B| base model
chat model
long text model | -| Chinese-LLaMA-Alpaca-3 | [Chinese-LLaMA-Alpaca-3](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3) | Chinese | 8B| base model
chat model| -| ModelScope-Agent | [ModelScope Agent series models](https://github.com/modelscope/modelscope-agent) | Chinese | 7B-14B| agent model | +| Qwen
Qwen1.5
Qwen2 | [Tongyi Qwen series models](https://github.com/QwenLM) | Chinese<br>
English | 0.5B-110B
including quantized versions | base model
chat model
MoE model
code model | +| ChatGLM2
ChatGLM3
Codegeex2
GLM4 | [Zhipu ChatGLM series models](https://github.com/THUDM) | Chinese
English | 6B-9B | base model
chat model
code model
long text model | +| Baichuan/Baichuan2 | [Baichuan 1 and Baichuan 2](https://github.com/baichuan-inc) | Chinese
English | 7B-13B
including quantized versions | base model
chat model | +| Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese
English | 2B-102B | instruct model | +| XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese
English | 7B-65B | base model
chat model
long text model
MoE model | +| LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B
including quantized versions | base model
chat model | +| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B
including quantized versions | base model
chat model | +| Mistral
Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-8x22B | base model<br>
instruct model
MoE model | +| Yi
Yi1.5 | [01AI's Yi series models](https://github.com/01-ai) | Chinese<br>
English | 6B-34B
including quantized versions | base model<br>
chat model
long text model | +| InternLM
InternLM2
InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese
English | 1.8B-20B | base model
chat model
math model | +| DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2
DeepSeek-Coder-V2 | [DeepSeek series models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-236B | base model
chat model
MoE model
code model
math model | +| MAMBA | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English | 130M-2.8B | base model | +| Gemma | [Google Gemma series models](https://github.com/google/gemma_pytorch) | English | 2B-7B | base model
instruct model | +| MiniCPM | [OpenBMB MiniCPM series models](https://github.com/OpenBMB/MiniCPM) | Chinese<br>
English | 2B-3B | chat model
MoE model | +| OpenBuddy | [OpenBuddy series models](https://github.com/OpenBuddy/OpenBuddy) | Chinese
English | 7B-67B | base model
chat model | +| Orion | [OrionStar AI series models](https://github.com/OrionStarAI) | Chinese
English | 14B | base model
chat model | +| BlueLM | [VIVO BlueLM large model](https://github.com/vivo-ai-lab/BlueLM) | Chinese
English | 7B | base model
chat model | +| Ziya2 | [Fengshenbang series models](https://github.com/IDEA-CCNL/Fengshenbang-LM) | Chinese
English | 13B | base model
chat model | +| Skywork | [Skywork series models](https://github.com/SkyworkAI/Skywork) | Chinese
English | 13B | base model
chat model | +| Zephyr | Zephyr series models based on Mistral | English | 7B | chat model | +| PolyLM | [Tongyi Lab self-developed PolyLM series models](https://github.com/DAMO-NLP-MT/PolyLM) | Multilingual | 13B | base model | +| SeqGPT | [Tongyi Lab self-developed text understanding model for information extraction and text classification](https://github.com/Alibaba-NLP/SeqGPT) | Chinese | 560M | semantic understanding model | +| SUS | [Southern University of Science and Technology model fine-tuned on Yi](https://github.com/SUSTech-IDEA/SUS-Chat) | Chinese<br>
English | 34B | chat model | +| Tongyi-Finance | [Tongyi finance series models](https://github.com/QwenLM/Qwen) | Chinese
English | 14B | base model
chat model
financial model | +| CodeFuse-CodeLLaMA
CodeFuse-Codegeex2
CodeFuse-Qwen | [Ant CodeFuse series models](https://github.com/codefuse-ai) | Chinese
English | 6B-34B | chat model
code model | +| phi2/phi3 | Microsoft's PHI series models | English | 3B/4B | base model
instruct model
code model | +| Grok | [X-ai](https://github.com/xai-org/grok-1) | English | 300B | base model | +| TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | Chinese
English | 7B-12B | chat model | +| dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model
chat model | +| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese
English | 13B | base model | +| c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model | +| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B
including quantized versions | chat model
MoE model | +| Atom | [Atom](https://github.com/LlamaFamily/Llama-Chinese) | Chinese | 7B| base model
chat model| +| Chinese-LLaMA-Alpaca-2 | [Chinese-LLaMA-Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) | Chinese | 1.3B-13B| base model
chat model
long text model | +| Chinese-LLaMA-Alpaca-3 | [Chinese-LLaMA-Alpaca-3](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3) | Chinese | 8B| base model
chat model| +| ModelScope-Agent | [ModelScope Agent series models](https://github.com/modelscope/modelscope-agent) | Chinese | 7B-14B| agent model | #### MLLMs From 8426c8b9b3ff8e1a13afdc094f39ac301943ce24 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 18 Jun 2024 17:12:19 +0800 Subject: [PATCH 12/12] fix lint --- swift/llm/utils/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 590fdd5e7..25c0db3b5 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -271,7 +271,7 @@ class ModelType: deepseek_coder_6_7b_instruct = 'deepseek-coder-6_7b-instruct' deepseek_coder_33b = 'deepseek-coder-33b' deepseek_coder_33b_instruct = 'deepseek-coder-33b-instruct' - #deepseek2-coder + # deepseek2-coder deepseek_coder_v2_instruct = 'deepseek-coder-v2-instruct' deepseek_coder_v2_lite_instruct = 'deepseek-coder-v2-lite-instruct' # deepseek-math