From 285ac51633b6756e650b4bd75a0fb1b5e7338330 Mon Sep 17 00:00:00 2001 From: Jintao Date: Fri, 9 Aug 2024 16:11:34 +0800 Subject: [PATCH] support qwen2-audio (#1633) --- README.md | 3 +- README_CN.md | 3 +- ...37\344\270\216\351\203\250\347\275\262.md" | 2 + ...14\346\225\260\346\215\256\351\233\206.md" | 8 ++- docs/source/Multi-Modal/index.md | 4 +- ...00\344\275\263\345\256\236\350\267\265.md" | 2 + ...00\344\275\263\345\256\236\350\267\265.md" | 2 + .../LLM/Supported-models-datasets.md | 8 ++- ...M-inference-acceleration-and-deployment.md | 2 + docs/source_en/Multi-Modal/index.md | 4 +- .../Multi-Modal/minicpm-v-best-practice.md | 2 + .../Multi-Modal/qwen-audio-best-practice.md | 2 + swift/llm/utils/media.py | 2 +- swift/llm/utils/model.py | 35 ++++++++++ swift/llm/utils/template.py | 67 +++++++++++++++++-- swift/llm/utils/vision_utils.py | 4 +- tests/custom/test_megatron.py | 2 +- 17 files changed, 132 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 60c8afbf4..2764111b6 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ You can contact us and communicate with us by adding our group: | ## 🎉 News +- 🔥2024.08.09: Support for inference and fine-tuning of the qwen2-audio model. Best practice can be found [here](https://github.com/modelscope/ms-swift/issues/1653). - 🔥2024.08.08: Supports the qwen2-math series models: 1.5B, 7B, 72B. Use `swift infer --model_type qwen2-math-1_5b-instruct` for an experience. - 🔥2024.08.07: Support for using vLLM for accelerating inference and deployment of multimodal large models such as the llava series and phi3-vision models. You can refer to the [Multimodal & vLLM Inference Acceleration Documentation](docs/source_en/Multi-Modal/vllm-inference-acceleration.md) for more information. - 2024.08.06: Support for minicpm-v-v2_6-chat is available. You can use `swift infer --model_type minicpm-v-v2_6-chat` for inference experience. Best practices can be found [here](https://github.com/modelscope/swift/issues/1613). @@ -609,7 +610,7 @@ The complete list of supported models and datasets can be found at [Supported Mo | Model Type | Model Introduction | Language | Model Size | Model Type | |---------------------------------------------------------|----------------------------------------------------------------------------------------|--------------------|---------------------------------------|--------------------------| | Qwen-VL | [Tongyi Qwen vision model](https://github.com/QwenLM) | Chinese
English | 7B<br>including quantized versions | base model<br>chat model |
-| Qwen-Audio | [Tongyi Qwen speech model](https://github.com/QwenLM) | Chinese<br>English | 7B | base model<br>chat model |
+| Qwen-Audio<br>Qwen2-Audio | [Tongyi Qwen speech model](https://github.com/QwenLM) | Chinese<br>English | 7B | base model<br>chat model |
| YI-VL | [01AI's YI series vision models](https://github.com/01-ai) | Chinese<br>English | 6B-34B | chat model |
| XComposer2<br>XComposer2.5 | [Pujiang AI Lab InternLM vision model](https://github.com/InternLM/InternLM-XComposer) | Chinese<br>English | 7B | chat model |
| DeepSeek-VL | [DeepSeek series vision models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-7B | chat model | diff --git a/README_CN.md b/README_CN.md index 26b8240f2..6b44a0e38 100644 --- a/README_CN.md +++ b/README_CN.md @@ -56,6 +56,7 @@ SWIFT具有丰富全面的文档,请查看我们的文档网站: ## 🎉 新闻 +- 🔥2024.08.09: 支持qwen2-audio模型的推理与微调. 最佳实践可以查看[这里](https://github.com/modelscope/ms-swift/issues/1653). - 🔥2024.08.08: 支持qwen2-math系列模型, 1.5B, 7B, 72B. 使用`swift infer --model_type qwen2-math-1_5b-instruct`进行体验. - 🔥2024.08.07: 支持使用vllm对多模态大模型: llava系列, internvl2系列, phi3-vision, minicpm-v2.5进行推理加速和部署. 可以查看[多模态&vLLM推理加速文档](docs/source/Multi-Modal/vLLM推理加速文档.md)获取更多信息. - 2024.08.06: 支持minicpm-v-v2_6-chat, 使用`swift infer --model_type minicpm-v-v2_6-chat`进行推理体验, 最佳实践可以查看[这里](https://github.com/modelscope/swift/issues/1613). @@ -603,7 +604,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | 模型类型 | 模型介绍 | 语言 | 模型大小 | 模型类型 | |---------------------------------------------------------|----------------------------------------------------------------------------|----------|------------------|------------------| | Qwen-VL | [通义千问视觉模型](https://github.com/QwenLM) | 中文
英文 | 7B<br>包含量化版本 | base模型<br>chat模型 |
-| Qwen-Audio | [通义千问语音模型](https://github.com/QwenLM) | 中文<br>英文 | 7B | base模型<br>chat模型 |
+| Qwen-Audio<br>Qwen2-Audio | [通义千问语音模型](https://github.com/QwenLM) | 中文<br>英文 | 7B | base模型<br>chat模型 |
| YI-VL | [01AI的YI系列视觉模型](https://github.com/01-ai) | 中文<br>英文 | 6B-34B | chat模型 |
| XComposer2<br>XComposer2.5 | [浦江实验室书生浦语视觉模型](https://github.com/InternLM/InternLM-XComposer) | 中文<br>英文 | 7B | chat模型 |
| DeepSeek-VL | [幻方系列视觉模型](https://github.com/deepseek-ai) | 中文
英文 | 1.3B-7B | chat模型 | diff --git "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" index cfc9141c9..d9686e27b 100644 --- "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" +++ "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" @@ -2,6 +2,8 @@ # VLLM推理加速与部署 vllm支持的模型可以查看[支持的模型](支持的模型和数据集.md#模型). +llama3.1 405b推理加速与部署最佳实践可以查看[这里](https://github.com/modelscope/ms-swift/issues/1484) + ## 目录 - [环境准备](#环境准备) - [推理加速](#推理加速) diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 443815cf2..2af36f3b1 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -360,6 +360,8 @@ |qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio-generation|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| +|qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| +|qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |llava1_5-7b-instruct|[swift/llava-1.5-7b-hf](https://modelscope.cn/models/swift/llava-1.5-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✔|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| |llava1_5-13b-instruct|[swift/llava-1.5-13b-hf](https://modelscope.cn/models/swift/llava-1.5-13b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✔|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-13b-hf](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| @@ 
-377,9 +379,9 @@ |yi-vl-6b-chat|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary)|^(model.layers\|model.mm_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|yi-vl|✔|✘|✘|✘|transformers>=4.34|vision|[01-ai/Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B)| |yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|^(model.layers\|model.mm_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|yi-vl|✔|✘|✘|✘|transformers>=4.34|vision|[01-ai/Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)| |llava-llama-3-8b-v1_1|[AI-ModelScope/llava-llama-3-8b-v1_1-transformers](https://modelscope.cn/models/AI-ModelScope/llava-llama-3-8b-v1_1-transformers/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-llama-instruct|✔|✘|✔|✘|transformers>=4.36|vision|[xtuner/llava-llama-3-8b-v1_1-transformers](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers)| -|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|^(model\|vision_proj)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internlm-xcomposer2|✘|✘|✔|✘||vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)| -|internlm-xcomposer2-4khd-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b/summary)|^(model\|vision_proj)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internlm-xcomposer2-4khd|✘|✘|✔|✘||vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)| -|internlm-xcomposer2_5-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b/summary)|^(model\|vision_proj)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internlm-xcomposer2_5|✔|✘|✔|✘||vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)| +|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|attention.wqkv, attention.wo, feed_forward.w1, feed_forward.w2, feed_forward.w3|internlm-xcomposer2|✘|✘|✔|✘||vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)| +|internlm-xcomposer2-4khd-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b/summary)|attention.wqkv, attention.wo, feed_forward.w1, feed_forward.w2, feed_forward.w3|internlm-xcomposer2-4khd|✘|✘|✔|✘||vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)| +|internlm-xcomposer2_5-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b/summary)|attention.wqkv, attention.wo, feed_forward.w1, feed_forward.w2, feed_forward.w3|internlm-xcomposer2_5|✔|✘|✔|✘|decord|vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)| |internvl-chat-v1_5|[AI-ModelScope/InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl|✔|✘|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5)| 
|internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl|✔|✘|✘|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)| |mini-internvl-chat-2b-v1_5|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl|✔|✘|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)| diff --git a/docs/source/Multi-Modal/index.md b/docs/source/Multi-Modal/index.md index c323637df..071a89372 100644 --- a/docs/source/Multi-Modal/index.md +++ b/docs/source/Multi-Modal/index.md @@ -11,7 +11,7 @@ 一轮对话可以包含多张图片(或不含图片): 1. [Qwen-VL最佳实践](qwen-vl最佳实践.md) -2. [Qwen-Audio最佳实践](qwen-audio最佳实践.md) +2. [Qwen-Audio最佳实践](qwen-audio最佳实践.md), [Qwen2-Audio最佳实践](https://github.com/modelscope/ms-swift/issues/1653) 3. [Llava最佳实践](llava最佳实践.md), [LLava Video最佳实践](llava-video最佳实践.md) 4. [InternVL系列最佳实践](internvl最佳实践.md) 5. [Deepseek-VL最佳实践](deepseek-vl最佳实践.md) @@ -26,4 +26,4 @@ 整个对话围绕一张图片(可能可以不含图片): 1. [CogVLM最佳实践](cogvlm最佳实践.md), [CogVLM2最佳实践](cogvlm2最佳实践.md), [glm4v最佳实践](glm4v最佳实践.md), [CogVLM2-Video最佳实践](cogvlm2-video最佳实践.md) -2. [MiniCPM-V最佳实践](minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](minicpm-v-2最佳实践.md), [MiniCPM-V-2.5最佳实践](minicpm-v-2.5最佳实践.md) +2. [MiniCPM-V最佳实践](minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](minicpm-v-2最佳实践.md), [MiniCPM-V-2.5最佳实践](minicpm-v-2.5最佳实践.md), [MiniCPM-V-2.6最佳实践](https://github.com/modelscope/ms-swift/issues/1613) diff --git "a/docs/source/Multi-Modal/minicpm-v-2.5\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/minicpm-v-2.5\346\234\200\344\275\263\345\256\236\350\267\265.md" index 1a546a737..dccfaae35 100644 --- "a/docs/source/Multi-Modal/minicpm-v-2.5\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/minicpm-v-2.5\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -1,6 +1,8 @@ # MiniCPM-V-2.5 最佳实践 +MiniCPM-V-2.6 最佳实践: [https://github.com/modelscope/ms-swift/issues/1613](https://github.com/modelscope/ms-swift/issues/1613) + ## 目录 - [环境准备](#环境准备) - [推理](#推理) diff --git "a/docs/source/Multi-Modal/qwen-audio\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/qwen-audio\346\234\200\344\275\263\345\256\236\350\267\265.md" index 45e37be76..7765b2703 100644 --- "a/docs/source/Multi-Modal/qwen-audio\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/qwen-audio\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -1,5 +1,7 @@ # Qwen-Audio 最佳实践 +Qwen2-Audio的最佳实践可以查看: [https://github.com/modelscope/ms-swift/issues/1653](https://github.com/modelscope/ms-swift/issues/1653) + ## 目录 - [环境准备](#环境准备) - [推理](#推理) diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md index f02ac14b2..69b869809 100644 --- a/docs/source_en/LLM/Supported-models-datasets.md +++ b/docs/source_en/LLM/Supported-models-datasets.md @@ -360,6 +360,8 @@ The table below introcudes all models supported by SWIFT: |qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| 
|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio-generation|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| +|qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| +|qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |llava1_5-7b-instruct|[swift/llava-1.5-7b-hf](https://modelscope.cn/models/swift/llava-1.5-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✔|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| |llava1_5-13b-instruct|[swift/llava-1.5-13b-hf](https://modelscope.cn/models/swift/llava-1.5-13b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✔|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-13b-hf](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| @@ -377,9 +379,9 @@ The table below introcudes all models supported by SWIFT: |yi-vl-6b-chat|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary)|^(model.layers\|model.mm_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|yi-vl|✔|✘|✘|✘|transformers>=4.34|vision|[01-ai/Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B)| |yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|^(model.layers\|model.mm_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|yi-vl|✔|✘|✘|✘|transformers>=4.34|vision|[01-ai/Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)| |llava-llama-3-8b-v1_1|[AI-ModelScope/llava-llama-3-8b-v1_1-transformers](https://modelscope.cn/models/AI-ModelScope/llava-llama-3-8b-v1_1-transformers/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-llama-instruct|✔|✘|✔|✘|transformers>=4.36|vision|[xtuner/llava-llama-3-8b-v1_1-transformers](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers)| -|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|^(model\|vision_proj)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internlm-xcomposer2|✘|✘|✔|✘||vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)| 
-|internlm-xcomposer2-4khd-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b/summary)|^(model\|vision_proj)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internlm-xcomposer2-4khd|✘|✘|✔|✘||vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)| -|internlm-xcomposer2_5-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b/summary)|^(model\|vision_proj)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internlm-xcomposer2_5|✔|✘|✔|✘||vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)| +|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|attention.wqkv, attention.wo, feed_forward.w1, feed_forward.w2, feed_forward.w3|internlm-xcomposer2|✘|✘|✔|✘||vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)| +|internlm-xcomposer2-4khd-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b/summary)|attention.wqkv, attention.wo, feed_forward.w1, feed_forward.w2, feed_forward.w3|internlm-xcomposer2-4khd|✘|✘|✔|✘||vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)| +|internlm-xcomposer2_5-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b/summary)|attention.wqkv, attention.wo, feed_forward.w1, feed_forward.w2, feed_forward.w3|internlm-xcomposer2_5|✔|✘|✔|✘|decord|vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)| |internvl-chat-v1_5|[AI-ModelScope/InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl|✔|✘|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5)| |internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl|✔|✘|✘|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)| |mini-internvl-chat-2b-v1_5|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl|✔|✘|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)| diff --git a/docs/source_en/LLM/VLLM-inference-acceleration-and-deployment.md b/docs/source_en/LLM/VLLM-inference-acceleration-and-deployment.md index 037286d84..137968f3b 100644 --- a/docs/source_en/LLM/VLLM-inference-acceleration-and-deployment.md +++ b/docs/source_en/LLM/VLLM-inference-acceleration-and-deployment.md @@ -1,6 +1,8 @@ # VLLM Inference Acceleration and Deployment The models supported by vllm can be found in [Supported Models](Supported-models-datasets.md#Models). 
+You can check the best practices for inference acceleration and deployment of Llama 3.1 405b [here](https://github.com/modelscope/ms-swift/issues/1484). + ## Table of Contents - [Environment Preparation](#environment-preparation) - [Inference Acceleration](#inference-acceleration) diff --git a/docs/source_en/Multi-Modal/index.md b/docs/source_en/Multi-Modal/index.md index 4b0650717..874bcee91 100644 --- a/docs/source_en/Multi-Modal/index.md +++ b/docs/source_en/Multi-Modal/index.md @@ -11,7 +11,7 @@ A single round of dialogue can contain multiple images (or no images): 1. [Qwen-VL Best Practice](qwen-vl-best-practice.md) -2. [Qwen-Audio Best Practice](qwen-audio-best-practice.md) +2. [Qwen-Audio Best Practice](qwen-audio-best-practice.md), [Qwen2-Audio最佳实践](https://github.com/modelscope/ms-swift/issues/1653) 3. [Llava Best Practice](llava-best-practice.md), [LLava Video Best Practice](llava-video-best-practice.md) 4. [InternVL Series Best Practice](internvl-best-practice.md) 5. [Deepseek-VL Best Practice](deepseek-vl-best-practice.md) @@ -25,4 +25,4 @@ A single round of dialogue can only contain one image: The entire conversation revolves around one image. 1. [CogVLM Best Practice](cogvlm-best-practice.md), [CogVLM2 Best Practice](cogvlm2-best-practice.md), [GLM4V Best Practice](glm4v-best-practice.md), [CogVLM2-Video Best Practice](cogvlm2-video-best-practice.md) -2. [MiniCPM-V Best Practice](minicpm-v-best-practice.md) +2. [MiniCPM-V Best Practice](minicpm-v-best-practice.md), [MiniCPM-V-2.6 Best Practice](https://github.com/modelscope/ms-swift/issues/1613) diff --git a/docs/source_en/Multi-Modal/minicpm-v-best-practice.md b/docs/source_en/Multi-Modal/minicpm-v-best-practice.md index 71c22b49d..45911e8ca 100644 --- a/docs/source_en/Multi-Modal/minicpm-v-best-practice.md +++ b/docs/source_en/Multi-Modal/minicpm-v-best-practice.md @@ -1,6 +1,8 @@ # MiniCPM-V Best Practice Using minicpm-v-3b-chat as an example, if you want to use the updated version of the MiniCPM-V multimodal model (v2), you can switch `--model_type minicpm-v-3b-chat` to `--model_type minicpm-v-v2-chat`. +MiniCPM-V-2.6 Best Practice: [https://github.com/modelscope/ms-swift/issues/1613](https://github.com/modelscope/ms-swift/issues/1613) + ## Table of Contents - [Environment Setup](#environment-setup) - [Inference](#inference) diff --git a/docs/source_en/Multi-Modal/qwen-audio-best-practice.md b/docs/source_en/Multi-Modal/qwen-audio-best-practice.md index 549d44b27..d5e545fbc 100644 --- a/docs/source_en/Multi-Modal/qwen-audio-best-practice.md +++ b/docs/source_en/Multi-Modal/qwen-audio-best-practice.md @@ -1,5 +1,7 @@ # Qwen-Audio Best Practice +Best practice for Qwen2-Audio can be found at: [https://github.com/modelscope/ms-swift/issues/1653](https://github.com/modelscope/ms-swift/issues/1653). + ## Table of Contents - [Environment Setup](#environment-setup) - [Inference](#inference) diff --git a/swift/llm/utils/media.py b/swift/llm/utils/media.py index 485c50182..3ceeaa364 100644 --- a/swift/llm/utils/media.py +++ b/swift/llm/utils/media.py @@ -49,7 +49,7 @@ class MediaTag: standard_tags = { 'image': '', - 'audio': '', + 'audio': '