Skip to content

Commit

Permalink
Support faster data map (modelscope#1871)
Browse files Browse the repository at this point in the history
  • Loading branch information
tastelikefeet committed Aug 30, 2024
1 parent 6aa4da1 commit 0c83298
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 3 deletions.
2 changes: 1 addition & 1 deletion swift/llm/agent/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

logger = get_logger()

REACT_PROMPT = """Answer the following questions as best you can. You have access to the following tools:
REACT_PROMPT = """Answer the following questions as best as you can. You have access to the following tools:
{tool_list}
Expand Down
2 changes: 2 additions & 0 deletions swift/llm/utils/argument.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,8 @@ def _prepare_modules_to_save(self, modules_to_save) -> List[str]:
def __post_init__(self) -> None:
super().__post_init__()
self.handle_compatibility()
if self.preprocess_num_proc and self.preprocess_num_proc > 1:
os.environ['DATASET_MAP_NPROC'] = str(self.preprocess_num_proc)
if len(self.val_dataset) > 0:
self.dataset_test_ratio = 0.0
logger.info('Using val_dataset, ignoring dataset_test_ratio')
Expand Down
11 changes: 11 additions & 0 deletions swift/llm/utils/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ def _update_fingerprint_mac(*args, **kwargs):
datasets.fingerprint.update_fingerprint = _update_fingerprint_mac
datasets.arrow_dataset.update_fingerprint = _update_fingerprint_mac


def partialed_map(self, *args, **kwargs):
    """Call the original ``map`` with a default ``num_proc`` taken from the environment.

    If the caller does not pass ``num_proc`` as a keyword argument, the value
    of the ``DATASET_MAP_NPROC`` environment variable is converted to ``int``
    and used instead. When the variable is unset or empty, ``num_proc=None``
    is passed.

    Args:
        *args: Positional arguments forwarded unchanged to ``self._origin_map``.
        **kwargs: Keyword arguments forwarded to ``self._origin_map``; an
            explicit ``num_proc`` always takes precedence over the environment.

    Returns:
        Whatever ``self._origin_map`` returns.

    Raises:
        ValueError: If ``DATASET_MAP_NPROC`` is non-empty but not an integer.
    """
    if 'num_proc' not in kwargs:
        env_num_proc = os.environ.get('DATASET_MAP_NPROC')
        # Normalize both "unset" (None) and "set but empty" ('') to None so a
        # string is never forwarded as the process count.
        kwargs['num_proc'] = int(env_num_proc) if env_num_proc else None
    return self._origin_map(*args, **kwargs)


# Install `partialed_map` in place of `datasets.Dataset.map`, keeping the
# original reachable as `_origin_map`. The original is captured only once:
# if this code runs again (e.g. on a module reload), `Dataset.map` is already
# the wrapper, and re-capturing it would make the wrapper call itself forever.
if not hasattr(datasets.Dataset, '_origin_map'):
    datasets.Dataset._origin_map = datasets.Dataset.map
datasets.Dataset.map = partialed_map

standard_keys = {
'query', 'query_role', 'response', 'rejected_response', 'system', 'history', 'history_roles', 'images', 'objects',
'videos', 'audios', 'tools', 'label'
Expand Down
2 changes: 0 additions & 2 deletions swift/llm/utils/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,6 @@ def __call__(self, dataset: DATASET_TYPE) -> DATASET_TYPE:
return dataset


@_reduce_columns
class AlpacaPreprocessor(MediaMixin, RowPreprocessMixin):

def __init__(self, concat_inst_inp: Optional[Callable[[str, str], str]] = None, **kwargs):
Expand Down Expand Up @@ -195,7 +194,6 @@ def _default_repair_conversations(s: Union[str, Any]) -> Any:
return s


@_reduce_columns
class ConversationsPreprocessor(MediaMixin, RowPreprocessMixin):

def __init__(self,
Expand Down

0 comments on commit 0c83298

Please sign in to comment.