fix bugs (modelscope#1464)

montanafang · Jul 22, 2024 · 4884edf · 4884edf
1 parent 6022195
commit 4884edf
Show file tree

Hide file tree

Showing 15 changed files with 122 additions and 140 deletions.
diff --git a/README.md b/README.md
@@ -405,7 +405,7 @@ swift sft \
 
 #### Multi-node Multi-GPU
 ```shell
-# If multiple machines share a disk, please additionally specify `--save_on_each_node false`.
+# If the disk is not shared, please additionally specify `--save_on_each_node true` in the shell scripts on each machine.
 # node0
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 NNODES=2 \

diff --git a/README_CN.md b/README_CN.md
@@ -402,7 +402,7 @@ swift sft \
 
 #### 多机多卡
 ```shell
-# 如果多机共用磁盘请在各机器sh中额外指定`--save_on_each_node false`.
+# 如果非共用磁盘请在各机器sh中额外指定`--save_on_each_node true`.
 # node0
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 NNODES=2 \

diff --git a/docs/source/LLM/LLM微调文档.md b/docs/source/LLM/LLM微调文档.md
@@ -100,7 +100,7 @@ swift sft \
     --output_dir output \
 
 # 多机多卡
-# 如果多机共用磁盘请在各机器sh中额外指定`--save_on_each_node false`.
+# 如果非共用磁盘请在各机器sh中额外指定`--save_on_each_node true`.
 # node0
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 NNODES=2 \

diff --git a/docs/source/LLM/命令行参数.md b/docs/source/LLM/命令行参数.md
@@ -111,7 +111,7 @@
 - `--logging_dir`: 默认为`None`. 即设置为`f'{self.output_dir}/runs'`, 表示tensorboard文件存储路径.
 - `--report_to`: 默认为`['tensorboard']`. 可以设置`--report_to all`来报告所有已安装的集成.
 - `--acc_strategy`: 默认为`'token'`, 可选择的值包括: 'token', 'sentence'.
-- `--save_on_each_node`: 该参数在多机训练时生效, 默认为`True`.
+- `--save_on_each_node`: 该参数在多机训练时生效, 默认为`False`.
 - `--save_strategy`: 保存checkpoint的策略, 默认为`'steps'`, 可选择的值包括: 'steps', 'epoch', 'no'.
 - `--evaluation_strategy`: 交叉验证策略, 默认为`'steps'`, 可选择的值包括: 'steps', 'epoch', 'no'.
 - `--save_safetensors`: 默认为`True`.

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -20,6 +20,7 @@ Swift DOCUMENTATION
    :maxdepth: 2
    :caption: LLM Training and Inference
 
+   LLM/index.md
    LLM/LLM推理文档.md
    LLM/LLM微调文档.md
    LLM/人类偏好对齐训练文档.md
@@ -47,6 +48,7 @@ Swift DOCUMENTATION
    :maxdepth: 2
    :caption: Multi-Modal LLM Training and Inference
 
+   Multi-Modal/index.md
    Multi-Modal/qwen-vl最佳实践.md
    Multi-Modal/qwen-audio最佳实践.md
    Multi-Modal/deepseek-vl最佳实践.md

diff --git a/docs/source_en/LLM/Command-line-parameters.md b/docs/source_en/LLM/Command-line-parameters.md
@@ -112,7 +112,7 @@
 - `--logging_dir`: Default is `None`. I.e. set to `f'{self.output_dir}/runs'`, representing path to store tensorboard files.
 - `--report_to`: Default is `['tensorboard']`. You can set `--report_to all` to report to all installed integrations.
 - `--acc_strategy`: Default is `'token'`, options include: 'token', 'sentence'.
-- `--save_on_each_node`: Takes effect during multi-machine training, default is `True`.
+- `--save_on_each_node`: Takes effect during multi-machine training, default is `False`.
 - `--save_strategy`: Strategy for saving checkpoint, default is `'steps'`, options include: 'steps', 'epoch', 'no'.
 - `--evaluation_strategy`: Strategy for evaluation, default is `'steps'`, options include: 'steps', 'epoch', 'no'.
 - `--save_safetensors`: Default is `True`.
@@ -128,7 +128,7 @@
 - `--train_dataset_mix_ratio`: Default is `0.`. This parameter defines how to mix datasets for training. When this parameter is specified, it will mix the training dataset with a multiple of `train_dataset_mix_ratio` of the general knowledge dataset specified by `train_dataset_mix_ds`. This parameter has been deprecated, please use `--dataset {dataset_name}#{dataset_sample}` to mix datasets.
 - `--train_dataset_mix_ds`: Default is `['ms-bench']`. Used for preventing knowledge forgetting, this is the general knowledge dataset. This parameter has been deprecated, please use `--dataset {dataset_name}#{dataset_sample}` to mix datasets.
 - `--use_loss_scale`: Default is `False`. When taking effect, strengthens loss weight of some Agent fields (Action/Action Input part) to enhance CoT, has no effect in regular SFT scenarios.
-- `loss_scale_config_path`: option specifies a custom loss_scale configuration, applicable when use_loss_scale is enabled, such as in Agent training to amplify the loss weights for Action and other crucial ReAct fields.
+- `--loss_scale_config_path`: Specifies a custom loss_scale configuration, applicable when use_loss_scale is enabled, such as in Agent training to amplify the loss weights for Action and other crucial ReAct fields.
   - In the configuration file, you can set the loss_scale using a dictionary format. Each key represents a specific field name, and its associated value specifies the loss scaling factor for that field and its subsequent content. For instance, setting `"Observation:": [2, 0]` means that when the response contains `xxxx Observation:error`, the loss for the `Observation:` field will be doubled, while the loss for the `error` portion will not be counted. Besides literal matching, the configuration also supports regular expression rules for more flexible matching; for example, the pattern `'<.*?>':[2.0]` doubles the loss for any content enclosed in angle brackets. The loss scaling factors for field matching and regex matching are respectively indicated by lists of length 2 and 1.
   - There is also support for setting loss_scale for the entire response based on matching queries, which is extremely useful in dealing with fixed multi-turn dialogue queries described in the [Agent-Flan paper](https://arxiv.org/abs/2403.12881). If the query includes any of the predefined keys, the corresponding response will use the associated loss_scale value. Refer to swift/llm/agent/agentflan.json for an example.
   - By default, we have preset loss scaling values for fields such as Action:, Action Input:, Thought:, Final Answer:, and Observation:. We also provide default configurations for [alpha-umi](https://arxiv.org/pdf/2401.07324) and [Agent-FLAN](https://arxiv.org/abs/2403.12881), which you can use by setting to alpha-umi and agent-flan respectively. The default configuration files are located under swift/llm/agent.

diff --git a/docs/source_en/LLM/LLM-fine-tuning.md b/docs/source_en/LLM/LLM-fine-tuning.md
@@ -96,7 +96,7 @@ swift sft \
     --output_dir output \
 
 # Multi-machine multi-card
-# If multiple machines share a disk, please additionally specify `--save_on_each_node false`.
+# If the disk is not shared, please additionally specify `--save_on_each_node true` in the shell scripts on each machine.
 # node0
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 NNODES=2 \

diff --git a/docs/source_en/index.rst b/docs/source_en/index.rst
@@ -20,6 +20,7 @@ Swift DOCUMENTATION
    :maxdepth: 2
    :caption: LLM Training and Inference
 
+   LLM/index.md
    LLM/LLM-inference.md
    LLM/LLM-fine-tuning.md
    LLM/Human-Preference-Alignment-Training-Documentation.md
@@ -48,6 +49,7 @@ Swift DOCUMENTATION
    :maxdepth: 2
    :caption: Multi-Modal LLM Training and Inference
 
+   Multi-Modal/index.md
    Multi-Modal/qwen-vl-best-practice.md
    Multi-Modal/qwen-audio-best-practice.md
    Multi-Modal/deepseek-vl-best-practice.md

diff --git a/requirements/framework.txt b/requirements/framework.txt
@@ -5,6 +5,7 @@ binpacking
 dacite
 datasets<2.19
 einops
+huggingface_hub<0.24
 importlib_metadata
 jieba
 matplotlib

diff --git a/swift/llm/eval.py b/swift/llm/eval.py
@@ -1,7 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import asyncio
 import datetime as dt
-import multiprocessing as mp
+import multiprocessing
 import os
 import time
 from typing import Any, Dict, List, Optional, Tuple
@@ -22,7 +22,6 @@
 from .utils import DeployArguments, EvalArguments, XRequestConfig, inference, inference_client_async
 
 logger = get_logger()
-mp.set_start_method('spawn', force=True)
 
 
 class EvalModel(CustomModel):
@@ -202,6 +201,7 @@ def eval_opencompass(args: EvalArguments) -> List[Dict[str, Any]]:
         seed_everything(args.seed)
         port = _find_free_port()
         args.port = port
+        mp = multiprocessing.get_context('spawn')
         process = mp.Process(target=run_custom_model, args=(args, ))
         process.start()
 

diff --git a/swift/llm/infer.py b/swift/llm/infer.py
@@ -326,7 +326,7 @@ def llm_infer(args: InferArguments) -> Dict[str, List[Dict[str, Any]]]:
         history = []
         infer_kwargs = {}
         if args.infer_media_type != 'none':
-            logger.info('Please enter the conversation content first, ' 'followed by the path to the multimedia file.')
+            logger.info('Please enter the conversation content first, followed by the path to the multimedia file.')
         system = None
         read_system = False
         while True:

diff --git a/swift/llm/rlhf.py b/swift/llm/rlhf.py
@@ -12,6 +12,7 @@
 from swift.trainers import RLHFTrainerFactory
 from swift.utils import (check_json_format, get_dist_setting, get_logger, get_main, get_model_info, is_ddp_plus_mp,
                          is_dist, is_master, plot_images, seed_everything, show_layers)
+from .sft import _get_train_val_dataset
 from .tuner import prepare_model
 from .utils import (TEMPLATE_MAPPING, RLHFArguments, Template, get_dataset, get_model_tokenizer, get_template,
                     get_time_info, set_generation_config)
@@ -166,31 +167,10 @@ def llm_rlhf(args: RLHFArguments) -> Dict[str, Any]:
     if hasattr(model, 'hf_device_map'):
         logger.info(f'model device_map {model.hf_device_map}')
 
-    # Loading Dataset
-    train_dataset, val_dataset = get_dataset(
-        args.dataset,
-        args.dataset_test_ratio,
-        args.dataset_seed,
-        check_dataset_strategy=args.check_dataset_strategy,
-        model_name=args.model_name,
-        model_author=args.model_author)
-
-    if len(args.val_dataset) > 0:
-        # Loading val dataset
-        _, val_dataset = get_dataset(
-            args.val_dataset,
-            1.0,
-            args.dataset_seed,
-            check_dataset_strategy=args.check_dataset_strategy,
-            model_name=args.model_name,
-            model_author=args.model_author)
-
-    train_dataset, val_dataset = args._handle_dataset_compat(train_dataset, val_dataset)
+    train_dataset, val_dataset = _get_train_val_dataset(args)
     if val_dataset is None:
         training_args.evaluation_strategy = IntervalStrategy.NO
         training_args.do_eval = False
-    logger.info(f'train_dataset: {train_dataset}')
-    logger.info(f'val_dataset: {val_dataset}')
 
     template_kwargs = {}
     template_info = TEMPLATE_MAPPING[args.template_type]

diff --git a/swift/llm/sft.py b/swift/llm/sft.py
@@ -1,10 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from functools import partial
-from typing import Any, Dict, Union
+from typing import Any, Dict, Optional, Tuple
 
 import json
 import torch
+from datasets import Dataset as HfDataset
 from modelscope import BitsAndBytesConfig, GenerationConfig
 from transformers import IntervalStrategy
 from transformers.integrations import is_deepspeed_zero3_enabled
@@ -13,9 +14,9 @@
 from swift.torchacc_utils import patch_acc_model
 from swift.trainers import Seq2SeqTrainer
 from swift.trainers.utils import can_return_loss, find_labels
-from swift.utils import (append_to_jsonl, check_json_format, compute_acc_metrics, compute_nlg_metrics, get_dist_setting,
-                         get_logger, get_main, get_model_info, is_ddp_plus_mp, is_dist, is_local_master, is_master,
-                         plot_images, preprocess_logits_for_metrics, seed_everything, show_layers, use_torchacc)
+from swift.utils import (append_to_jsonl, check_json_format, compute_acc_metrics, compute_nlg_metrics, get_logger,
+                         get_main, get_model_info, is_ddp_plus_mp, is_dist, is_local_master, is_master, plot_images,
+                         preprocess_logits_for_metrics, seed_everything, show_layers, use_torchacc)
 from .accelerator import ta_accelerate
 from .tuner import prepare_model
 from .utils import (LazyLLMDataset, SftArguments, Template, dataset_map, get_dataset, get_model_tokenizer, get_template,
@@ -24,17 +25,41 @@
 logger = get_logger()
 
 
-def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
+def _get_train_val_dataset(args: SftArguments) -> Tuple[HfDataset, Optional[HfDataset]]:
+    # Loading Dataset
+    train_dataset, val_dataset = get_dataset(
+        args.dataset,
+        args.dataset_test_ratio,
+        args.dataset_seed,
+        check_dataset_strategy=args.check_dataset_strategy,
+        model_name=args.model_name,
+        model_author=args.model_author)
+    if len(args.val_dataset) > 0:
+        # Loading val dataset
+        _, val_dataset = get_dataset(
+            args.val_dataset,
+            1.0,
+            args.dataset_seed,
+            check_dataset_strategy=args.check_dataset_strategy,
+            model_name=args.model_name,
+            model_author=args.model_author)
+
+    train_dataset, val_dataset = args._handle_dataset_compat(train_dataset, val_dataset)
+    logger.info(f'train_dataset: {train_dataset}')
+    logger.info(f'val_dataset: {val_dataset}')
+    return train_dataset, val_dataset
+
 
+def llm_sft(args: SftArguments) -> Dict[str, Any]:
     logger.info(f'args: {args}')
     seed_everything(args.seed)
     training_args = args.training_args
     if is_torch_npu_available():
         print(f'device_count: {torch.npu.device_count()}')
     else:
         print(f'device_count: {torch.cuda.device_count()}')
-    rank, local_rank, world_size, local_world_size = get_dist_setting()
-    print(f'rank: {rank}, local_rank: {local_rank}, ' f'world_size: {world_size}, local_world_size: {local_world_size}')
+    print(f'rank: {args.rank}, local_rank: {args.local_rank}, '
+          f'world_size: {args.world_size}, local_world_size: {args.local_world_size}')
 
     if args.gpu_memory_fraction is not None:
         for device_id in range(torch.cuda.device_count()):
@@ -44,7 +69,7 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
     if is_deepspeed_zero3_enabled() or os.environ.get('ACCELERATE_USE_FSDP', 'False') == 'true':
         model_kwargs = {'device_map': None}
     elif is_torch_npu_available():
-        model_kwargs = {'device_map': local_rank if local_rank >= 0 else 0}
+        model_kwargs = {'device_map': args.local_rank if args.local_rank >= 0 else 0}
     elif args.device_map_config_path is not None:
         cwd = os.getcwd()
         config_path = args.device_map_config_path if os.path.isabs(args.device_map_config_path) else os.path.join(
@@ -54,18 +79,18 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
     else:
         model_kwargs = {'low_cpu_mem_usage': True}
         if is_dist() and not is_ddp_plus_mp():
-            model_kwargs['device_map'] = {'': local_rank}
+            model_kwargs['device_map'] = {'': args.local_rank}
         elif torch.cuda.device_count() == 1:
             model_kwargs['device_map'] = 'cuda:0'
         elif not use_torchacc():
             model_kwargs['device_map'] = 'auto'
 
     if args.device_max_memory:
         n_gpu = torch.cuda.device_count()
-        assert len(args.device_max_memory) == n_gpu // local_world_size
+        assert len(args.device_max_memory) == n_gpu // args.local_world_size
         model_kwargs['max_memory'] = {
             i: mem
-            for i, mem in zip(list(range(max(local_rank, 0), n_gpu, local_world_size)), args.device_max_memory)
+            for i, mem in zip(range(max(args.local_rank, 0), n_gpu, args.local_world_size), args.device_max_memory)
         }
 
     if args.quant_method == 'hqq':
@@ -174,28 +199,8 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
             gradient_checkpointing=True,
             fsdp_flatten_parameters=False)
 
-    # Loading Dataset
-    train_dataset, val_dataset = get_dataset(
-        args.dataset,
-        args.dataset_test_ratio,
-        args.dataset_seed,
-        check_dataset_strategy=args.check_dataset_strategy,
-        model_name=args.model_name,
-        model_author=args.model_author)
-    if len(args.val_dataset) > 0:
-        # Loading val dataset
-        _, val_dataset = get_dataset(
-            args.val_dataset,
-            1.0,
-            args.dataset_seed,
-            check_dataset_strategy=args.check_dataset_strategy,
-            model_name=args.model_name,
-            model_author=args.model_author)
-
-    train_dataset, val_dataset = args._handle_dataset_compat(train_dataset, val_dataset)
-    training_args.train_dataset_sample = train_dataset.shape[0] if train_dataset is not None else 0
-    logger.info(f'train_dataset: {train_dataset}')
-    logger.info(f'val_dataset: {val_dataset}')
+    train_dataset, val_dataset = _get_train_val_dataset(args)
+    training_args.train_dataset_sample = train_dataset.shape[0] if train_dataset is not None else 0  # torchacc
     template_kwargs = {}
     template_kwargs['use_loss_scale'] = args.use_loss_scale
     if args.loss_scale_config_path is not None:
@@ -271,8 +276,8 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
     train_batch_size = args.batch_size
     eval_batch_size = args.eval_batch_size
     if use_torchacc():
-        train_batch_size *= world_size
-        eval_batch_size *= world_size
+        train_batch_size *= args.world_size
+        eval_batch_size *= args.world_size
         training_args.per_device_train_batch_size = train_batch_size
         training_args.per_device_eval_batch_size = eval_batch_size
         training_args.group_by_length = use_torchacc()
@@ -354,7 +359,7 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
 
 def get_sft_main(args, llm):
     if use_torchacc():
-        logger.warning('TorchAcc is currently only available internally ' 'within Alibaba Cloud.')
+        logger.warning('TorchAcc is currently only available internally within Alibaba Cloud.')
         import torchacc as ta
         # This patch should be called before `llm_sft`.
         ta.accelerate_hf_trainer()