Skip to content

Commit

Permalink
Merge pull request modelscope#3 from modelscope/feat/support_qwen_sft
Browse files Browse the repository at this point in the history
support qwen sft
  • Loading branch information
wenmengzhou committed Aug 3, 2023
2 parents 3af7a8c + 48eb386 commit 90ecc71
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 19 deletions.
5 changes: 2 additions & 3 deletions examples/pytorch/llm/llm_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@
@dataclass
class InferArguments:
model_type: str = field(
default='baichuan-7b',
metadata={'choices': list(MODEL_MAPPING.keys())})
default='qwen-7b', metadata={'choices': list(MODEL_MAPPING.keys())})
sft_type: str = field(
default='lora', metadata={'choices': ['lora', 'full']})
ckpt_dir: str = '/path/to/your/vx_xxx/checkpoint-xxx'
Expand All @@ -37,7 +36,7 @@ class InferArguments:
default='alpaca-en,alpaca-zh',
metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'})
dataset_seed: int = 42
dataset_sample: Optional[int] = None
dataset_sample: Optional[int] = 20000
dataset_test_size: float = 0.01
prompt: str = DEFAULT_PROMPT
max_length: Optional[int] = 2048
Expand Down
22 changes: 11 additions & 11 deletions examples/pytorch/llm/llm_sft.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# ### Setting up experimental environment.
"""
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y
pip install sentencepiece charset_normalizer cpm_kernels tiktoken -U
pip install matplotlib scikit-learn -U
pip install transformers datasets -U
pip install tqdm tensorboard torchmetrics -U
pip install accelerate transformers_stream_generator -U
# Install the latest version of swift from source
git clone https://github.com/modelscope/swift.git
cd swift
Expand All @@ -8,16 +15,10 @@
# Install the latest version of modelscope from source
git clone https://github.com/modelscope/modelscope.git
cd modelscope
pip install -r requirements.txt
pip install .
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
pip install numpy pandas -U # Resolve torchmetrics dependencies and update numpy
pip install matplotlib scikit-learn -U
pip install transformers datasets -U
pip install tqdm tensorboard torchmetrics -U
pip install sentencepiece charset_normalizer cpm_kernels -U
pip install accelerate transformers_stream_generator -U
"""

import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
from dataclasses import dataclass, field
Expand All @@ -44,8 +45,7 @@
@dataclass
class SftArguments:
model_type: str = field(
default='baichuan-7b',
metadata={'choices': list(MODEL_MAPPING.keys())})
default='qwen-7b', metadata={'choices': list(MODEL_MAPPING.keys())})
# baichuan-7b: 'lora': 16G; 'full': 80G
sft_type: str = field(
default='lora', metadata={'choices': ['lora', 'full']})
Expand All @@ -61,7 +61,7 @@ class SftArguments:
default='alpaca-en,alpaca-zh',
metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'})
dataset_seed: int = 42
dataset_sample: Optional[int] = None
dataset_sample: Optional[int] = 20000
dataset_test_size: float = 0.01
prompt: str = DEFAULT_PROMPT
max_length: Optional[int] = 2048
Expand Down
7 changes: 4 additions & 3 deletions examples/pytorch/llm/run_infer.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
CUDA_VISIBLE_DEVICES=0,1 \
python llm_infer.py \
--model_type openbuddy-llama2-13b \
--ckpt_dir "runs/openbuddy-llama2-13b/vx_xxx/checkpoint-xxx" \
--eval_human true
--model_type qwen-7b \
--ckpt_dir "qwen-7b/vx_xxx/checkpoint-xxx" \
--eval_human true \
--dataset_sample 20000
2 changes: 1 addition & 1 deletion examples/pytorch/llm/run_sft.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
CUDA_VISIBLE_DEVICES=0,1 \
python llm_sft.py \
--model_type openbuddy-llama2-13b \
--model_type qwen-7b \
--dataset alpaca-en,alpaca-zh \
--dataset_sample 20000
2 changes: 1 addition & 1 deletion examples/pytorch/llm/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def plot_images(images_dir: str,
values_s = tensorboard_smoothing(values, smooth_val)
ax.plot(steps, values_s, color=TB_COLOR_SMOOTH)
else:
ax.plot(steps, values, color=TB_COLOR)
ax.plot(steps, values, color=TB_COLOR_SMOOTH)
fpath = os.path.join(images_dir, k.replace('/', '_'))
plt.savefig(fpath, dpi=dpi, bbox_inches='tight')

Expand Down
7 changes: 7 additions & 0 deletions examples/pytorch/llm/utils/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ def get_alpaca_zh_dataset() -> HfDataset:
return _processing_alpaca(dataset_zh)


def get_finance_en_dataset() -> HfDataset:
    """Return the 'wyj123456/finance_en' train split after alpaca processing.

    The split is loaded through ``MsDataset.load``, converted with
    ``to_hf_dataset()`` and then handed to ``_processing_alpaca``.
    """
    ms_train_split = MsDataset.load('wyj123456/finance_en', split='train')
    hf_train_split: HfDataset = ms_train_split.to_hf_dataset()
    return _processing_alpaca(hf_train_split)


def process_dataset(dataset: HfDataset, dataset_test_size: float,
dataset_sample: Optional[int],
dataset_seed: int) -> Tuple[HfDataset, HfDataset]:
Expand All @@ -53,6 +59,7 @@ def process_dataset(dataset: HfDataset, dataset_test_size: float,
# Registry of dataset names to zero-argument loader functions.
# The keys are the names listed in the `dataset` argument help text of the
# example scripts (via `list(DATASET_MAPPING.keys())`).
DATASET_MAPPING = {
    'alpaca-en': get_alpaca_en_dataset,
    'alpaca-zh': get_alpaca_zh_dataset,
    'finance-en': get_finance_en_dataset,
}


Expand Down
29 changes: 29 additions & 0 deletions examples/pytorch/llm/utils/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from modelscope import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, Model,
get_logger, read_config, snapshot_download)
from modelscope.models.nlp.chatglm2 import ChatGLM2Config, ChatGLM2Tokenizer
from modelscope.models.nlp.qwen import QWenConfig, QWenTokenizer
from torch import dtype as Dtype

logger = get_logger()
Expand Down Expand Up @@ -61,11 +62,32 @@ def get_model_tokenizer_chatglm2(model_dir: str,
return model, tokenizer


def get_model_tokenizer_qwen(model_dir: str,
                             torch_dtype: Dtype,
                             load_model: bool = True):
    """Build the Qwen tokenizer and, optionally, the Qwen model.

    Args:
        model_dir: Directory passed to ``read_config`` and to the
            ``from_pretrained`` loaders.
        torch_dtype: Dtype written onto the model config and forwarded to
            ``Model.from_pretrained``.
        load_model: When False, the model is not loaded and ``None`` is
            returned in its place.

    Returns:
        A ``(model, tokenizer)`` tuple; ``model`` is ``None`` if
        ``load_model`` is False.
    """
    # Both configs are logged as soon as they are available.
    cfg = read_config(model_dir)
    logger.info(cfg)

    qwen_config = QWenConfig.from_pretrained(model_dir)
    qwen_config.torch_dtype = torch_dtype
    logger.info(qwen_config)

    tokenizer = QWenTokenizer.from_pretrained(model_dir)
    if not load_model:
        # Tokenizer-only request: skip loading the model.
        return None, tokenizer

    model = Model.from_pretrained(
        model_dir,
        cfg_dict=cfg,
        config=qwen_config,
        device_map='auto',
        torch_dtype=torch_dtype)
    return model, tokenizer


class LoRATM(NamedTuple):
    """Default LoRA target-module name lists, one per model family.

    The lists are referenced as the 'lora_TM' entry of model registry
    items (e.g. ``LoRATM.llama2``, ``LoRATM.qwen``).

    NOTE: the attributes below carry no type annotations, so they are plain
    class-level constants rather than NamedTuple fields.
    """
    # default lora target modules
    baichuan = ['W_pack']
    chatglm2 = ['query_key_value']
    llama2 = ['q_proj', 'k_proj', 'v_proj']
    qwen = ['c_attn']


# Reference: 'https://modelscope.cn/models/{model_id}/summary'
Expand Down Expand Up @@ -105,6 +127,13 @@ class LoRATM(NamedTuple):
'model_id': 'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16',
'lora_TM': LoRATM.llama2,
'revision': 'v1.0.0',
},
'qwen-7b': {
'model_id': 'qwen/Qwen-7B',
'revision': 'v1.0.0',
'get_function': get_model_tokenizer_qwen,
'torch_dtype': torch.bfloat16,
'lora_TM': LoRATM.qwen,
}
}

Expand Down

0 comments on commit 90ecc71

Please sign in to comment.