Merge pull request modelscope#3 from modelscope/feat/support_qwen_sft

Support Qwen SFT (supervised fine-tuning)
zhangxiuyao1 · Aug 3, 2023 · 90ecc71
2 parents 3af7a8c + 48eb386
commit 90ecc71
Show file tree

Hide file tree

Showing 7 changed files with 55 additions and 19 deletions.
diff --git a/examples/pytorch/llm/llm_infer.py b/examples/pytorch/llm/llm_infer.py
@@ -21,8 +21,7 @@
 @dataclass
 class InferArguments:
     model_type: str = field(
-        default='baichuan-7b',
-        metadata={'choices': list(MODEL_MAPPING.keys())})
+        default='qwen-7b', metadata={'choices': list(MODEL_MAPPING.keys())})
     sft_type: str = field(
         default='lora', metadata={'choices': ['lora', 'full']})
     ckpt_dir: str = '/path/to/your/vx_xxx/checkpoint-xxx'
@@ -37,7 +36,7 @@ class InferArguments:
         default='alpaca-en,alpaca-zh',
         metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'})
     dataset_seed: int = 42
-    dataset_sample: Optional[int] = None
+    dataset_sample: Optional[int] = 20000
     dataset_test_size: float = 0.01
     prompt: str = DEFAULT_PROMPT
     max_length: Optional[int] = 2048

diff --git a/examples/pytorch/llm/llm_sft.py b/examples/pytorch/llm/llm_sft.py
@@ -1,5 +1,12 @@
 # ### Setting up experimental environment.
 """
+conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y
+pip install sentencepiece charset_normalizer cpm_kernels tiktoken -U
+pip install matplotlib scikit-learn -U
+pip install transformers datasets -U
+pip install tqdm tensorboard torchmetrics -U
+pip install accelerate transformers_stream_generator -U
+
 # Install the latest version of swift from source
 git clone https://github.com/modelscope/swift.git
 cd swift
@@ -8,16 +15,10 @@
 # Install the latest version of modelscope from source
 git clone https://github.com/modelscope/modelscope.git
 cd modelscope
+pip install -r requirements.txt
 pip install .
-
-conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
-pip install numpy pandas -U  # Resolve torchmetrics dependencies and update numpy
-pip install matplotlib scikit-learn -U
-pip install transformers datasets -U
-pip install tqdm tensorboard torchmetrics -U
-pip install sentencepiece charset_normalizer cpm_kernels -U
-pip install accelerate transformers_stream_generator -U
 """
+
 import os
 # os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
 from dataclasses import dataclass, field
@@ -44,8 +45,7 @@
 @dataclass
 class SftArguments:
     model_type: str = field(
-        default='baichuan-7b',
-        metadata={'choices': list(MODEL_MAPPING.keys())})
+        default='qwen-7b', metadata={'choices': list(MODEL_MAPPING.keys())})
     # baichuan-7b: 'lora': 16G; 'full': 80G
     sft_type: str = field(
         default='lora', metadata={'choices': ['lora', 'full']})
@@ -61,7 +61,7 @@ class SftArguments:
         default='alpaca-en,alpaca-zh',
         metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'})
     dataset_seed: int = 42
-    dataset_sample: Optional[int] = None
+    dataset_sample: Optional[int] = 20000
     dataset_test_size: float = 0.01
     prompt: str = DEFAULT_PROMPT
     max_length: Optional[int] = 2048

diff --git a/examples/pytorch/llm/run_infer.sh b/examples/pytorch/llm/run_infer.sh
@@ -1,5 +1,6 @@
 CUDA_VISIBLE_DEVICES=0,1 \
 python llm_infer.py \
-    --model_type openbuddy-llama2-13b \
-    --ckpt_dir "runs/openbuddy-llama2-13b/vx_xxx/checkpoint-xxx" \
-    --eval_human true
+    --model_type qwen-7b \
+    --ckpt_dir "qwen-7b/vx_xxx/checkpoint-xxx" \
+    --eval_human true \
+    --dataset_sample 20000
diff --git a/examples/pytorch/llm/run_sft.sh b/examples/pytorch/llm/run_sft.sh
@@ -1,5 +1,5 @@
 CUDA_VISIBLE_DEVICES=0,1 \
 python llm_sft.py \
-    --model_type openbuddy-llama2-13b \
+    --model_type qwen-7b \
     --dataset alpaca-en,alpaca-zh \
     --dataset_sample 20000
diff --git a/examples/pytorch/llm/utils/__init__.py b/examples/pytorch/llm/utils/__init__.py
@@ -60,7 +60,7 @@ def plot_images(images_dir: str,
             values_s = tensorboard_smoothing(values, smooth_val)
             ax.plot(steps, values_s, color=TB_COLOR_SMOOTH)
         else:
-            ax.plot(steps, values, color=TB_COLOR)
+            ax.plot(steps, values, color=TB_COLOR_SMOOTH)
         fpath = os.path.join(images_dir, k.replace('/', '_'))
         plt.savefig(fpath, dpi=dpi, bbox_inches='tight')
 

diff --git a/examples/pytorch/llm/utils/dataset.py b/examples/pytorch/llm/utils/dataset.py
@@ -38,6 +38,12 @@ def get_alpaca_zh_dataset() -> HfDataset:
     return _processing_alpaca(dataset_zh)
 
 
+def get_finance_en_dataset() -> HfDataset:
+    finance_en: HfDataset = MsDataset.load(
+        'wyj123456/finance_en', split='train').to_hf_dataset()
+    return _processing_alpaca(finance_en)
+
+
 def process_dataset(dataset: HfDataset, dataset_test_size: float,
                     dataset_sample: Optional[int],
                     dataset_seed: int) -> Tuple[HfDataset, HfDataset]:
@@ -53,6 +59,7 @@ def process_dataset(dataset: HfDataset, dataset_test_size: float,
 DATASET_MAPPING = {
     'alpaca-en': get_alpaca_en_dataset,
     'alpaca-zh': get_alpaca_zh_dataset,
+    'finance-en': get_finance_en_dataset,
 }
 
 

diff --git a/examples/pytorch/llm/utils/models.py b/examples/pytorch/llm/utils/models.py
@@ -5,6 +5,7 @@
 from modelscope import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, Model,
                         get_logger, read_config, snapshot_download)
 from modelscope.models.nlp.chatglm2 import ChatGLM2Config, ChatGLM2Tokenizer
+from modelscope.models.nlp.qwen import QWenConfig, QWenTokenizer
 from torch import dtype as Dtype
 
 logger = get_logger()
@@ -61,11 +62,32 @@ def get_model_tokenizer_chatglm2(model_dir: str,
     return model, tokenizer
 
 
+def get_model_tokenizer_qwen(model_dir: str,
+                             torch_dtype: Dtype,
+                             load_model: bool = True):
+    config = read_config(model_dir)
+    logger.info(config)
+    model_config = QWenConfig.from_pretrained(model_dir)
+    model_config.torch_dtype = torch_dtype
+    logger.info(model_config)
+    tokenizer = QWenTokenizer.from_pretrained(model_dir)
+    model = None
+    if load_model:
+        model = Model.from_pretrained(
+            model_dir,
+            cfg_dict=config,
+            config=model_config,
+            device_map='auto',
+            torch_dtype=torch_dtype)
+    return model, tokenizer
+
+
 class LoRATM(NamedTuple):
     # default lora target modules
     baichuan = ['W_pack']
     chatglm2 = ['query_key_value']
     llama2 = ['q_proj', 'k_proj', 'v_proj']
+    qwen = ['c_attn']
 
 
 # Reference: 'https://modelscope.cn/models/{model_id}/summary'
@@ -105,6 +127,13 @@ class LoRATM(NamedTuple):
         'model_id': 'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16',
         'lora_TM': LoRATM.llama2,
         'revision': 'v1.0.0',
+    },
+    'qwen-7b': {
+        'model_id': 'qwen/Qwen-7B',
+        'revision': 'v1.0.0',
+        'get_function': get_model_tokenizer_qwen,
+        'torch_dtype': torch.bfloat16,
+        'lora_TM': LoRATM.qwen,
     }
 }