From 3f00547fb9bbe9779e268e52e65f85b0374b5e97 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 23 Feb 2024 14:29:40 +0800
Subject: [PATCH] Fix some typos (dst_strategys, etc.) (#62003)

---
 .../auto_parallel/static/auto_align_tool.py    | 12 ++++++------
 .../auto_parallel/static/completion.py         |  2 +-
 .../auto_parallel/static/converter.py          |  6 +++---
 .../auto_parallel/static/cost/base_cost.py     |  2 +-
 .../auto_parallel/static/cost_model.py         | 10 +++++-----
 .../distributed/auto_parallel/static/helper.py |  4 ++--
 .../auto_parallel/static/parallelizer_v2.py    |  2 +-
 .../fleet/base/distributed_strategy.py         |  2 +-
 python/paddle/distributed/fleet/fleet.py       |  4 ++--
 python/paddle/distributed/fleet/launch.py      |  2 +-
 .../paddle/distributed/fleet/launch_utils.py   | 14 +++++++-------
 .../hybrid_parallel_gradscaler.py              |  2 +-
 .../fleet/meta_optimizers/sharding/utils.py    |  2 +-
 .../sharding/weight_decay_helper.py            |  2 +-
 .../meta_optimizers/sharding_optimizer.py      | 18 +++++++++---------
 .../fleet/meta_parallel/pipeline_parallel.py   |  4 ++--
 .../sharding/group_sharded_utils.py            |  2 +-
 .../distributed/fleet/recompute/recompute.py   | 16 ++++++++--------
 .../fleet/recompute/recompute_hybrid.py        |  4 ++--
 .../fleet/runtime/parameter_server_runtime.py  | 14 +++++++-------
 .../distributed/fleet/runtime/the_one_ps.py    |  8 ++++----
 python/paddle/distributed/fleet/scaler.py      |  2 +-
 python/paddle/distributed/fleet/utils/fs.py    |  8 ++++----
 .../fleet/utils/hybrid_parallel_inference.py   |  4 ++--
 .../fleet/utils/hybrid_parallel_util.py        |  4 ++--
 .../fleet/utils/mix_precision_utils.py         |  2 +-
 .../fleet/utils/tensor_fusion_helper.py        |  2 +-
 .../fleet/utils/tensor_parallel_utils.py       |  8 ++++----
 .../distributed/launch/context/__init__.py     |  4 ++--
 .../launch/controllers/controller.py           |  2 +-
 python/paddle/distributed/models/moe/utils.py  |  2 +-
 ...auto_parallel_data_parallel_optimization.py |  2 +-
 .../passes/auto_parallel_recompute.py          | 16 ++++++++--------
 .../passes/auto_parallel_sharding.py           |  4 ++--
 .../distributed/passes/ps_trainer_pass.py      |  4 ++--
 .../ps/utils/collective_transpiler.py          |  4 ++--
 36 files changed, 100 insertions(+), 100 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/static/auto_align_tool.py b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py
index d7d98f75d80f1..b1ced07b8b24e 100644
--- a/python/paddle/distributed/auto_parallel/static/auto_align_tool.py
+++ b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py
@@ -352,13 +352,13 @@ def convert_src_tensor_2_dst_tensor(vars_list, src_attr_map, dst_attr_map):
     if src_attr_map is None or len(src_attr_map) == 0:
         return vars_list[0]

-    dst_strategys = {}
-    src_strategys = {}
+    dst_strategies = {}
+    src_strategies = {}
     tensors_dict = {}

     convert_tensor_dict = None
     for var_name in src_attr_map.keys():
-        assert var_name not in dst_strategys
+        assert var_name not in dst_strategies
         dist_vars = []
         for vars in vars_list:
             if var_name in vars.keys():
@@ -367,13 +367,13 @@ def convert_src_tensor_2_dst_tensor(vars_list, src_attr_map, dst_attr_map):
                 continue

         if var_name in dst_attr_map and var_name in src_attr_map:
-            dst_strategys[var_name] = copy.deepcopy(dst_attr_map[var_name])
-            src_strategys[var_name] = copy.deepcopy(src_attr_map[var_name])
+            dst_strategies[var_name] = copy.deepcopy(dst_attr_map[var_name])
+            src_strategies[var_name] = copy.deepcopy(src_attr_map[var_name])
             tensors_dict[var_name] = dist_vars

     if src_attr_map == dst_attr_map:
         return tensors_dict

-    converter = Converter(tensors_dict, src_strategys, dst_strategys)
+    converter = Converter(tensors_dict, src_strategies, dst_strategies)
     convert_tensor_dict = converter.convert()

     return convert_tensor_dict
diff --git a/python/paddle/distributed/auto_parallel/static/completion.py b/python/paddle/distributed/auto_parallel/static/completion.py
index a671582a3293f..900b90a0f6496 100644
--- a/python/paddle/distributed/auto_parallel/static/completion.py
+++ b/python/paddle/distributed/auto_parallel/static/completion.py
@@ -202,7 +202,7 @@ def _update_op_dims_mapping_and_distoperatorimpl(
         updated = dist_op_container.update_dims_mapping(dist_op)
         changed = updated or changed

-        # TODO(ljz) remove the below code once we introduce general reshard to replace specifc distopimpls
+        # TODO(ljz) remove the below code once we introduce general reshard to replace specific distopimpls
         reverted = dist_op_container.mapping_to_dist_operator_impl(
             dist_op, original_op_dist_attr
         )
diff --git a/python/paddle/distributed/auto_parallel/static/converter.py b/python/paddle/distributed/auto_parallel/static/converter.py
index c7cd4e32d6e42..241a83aaf4f5d 100644
--- a/python/paddle/distributed/auto_parallel/static/converter.py
+++ b/python/paddle/distributed/auto_parallel/static/converter.py
@@ -105,9 +105,9 @@ def convert(self, strict=True):
                 >>> import numpy as np
                 >>> from paddle.distributed.auto_parallel.static.converter import Converter
                 >>> complete_tensors = np.arange(4).reshape([2, 2])
-                >>> partitial_tensors = np.split(complete_tensors, 2, axis=0)
+                >>> partial_tensors = np.split(complete_tensors, 2, axis=0)
                 >>> name = "tmp_0"
-                >>> tensors_dict = {name: partitial_tensors}
+                >>> tensors_dict = {name: partial_tensors}
                 >>> strategy_1 = {
                 ...     name: {
                 ...         "process_shape": [2],
@@ -345,7 +345,7 @@ def slice_with_dist_attr(tensor, dist_attr):
     @staticmethod
     def merge(partition_tensor_list, tensor, partition_index, complete_shape):
         """
-        Merge partitial tensors to a complete.
+        Merge partial tensors to a complete.

         Returns:
             None
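For readers skimming the Converter changes above, the following standalone sketch (not part of the patch) shows what merging "partial" tensors back into a complete tensor means. It only uses numpy and mirrors the doctest kept in converter.py; every name in it is illustrative.

    import numpy as np

    # Split a complete tensor into two "partial" tensors along axis 0,
    # as a 2-way sharding over processes [0, 1] would produce.
    complete = np.arange(4).reshape([2, 2])
    partial_tensors = np.split(complete, 2, axis=0)

    # Merging is the inverse: concatenate the shards along the dimension
    # that was sharded (a dims_mapping of [0, -1] means axis 0).
    merged = np.concatenate(partial_tensors, axis=0)
    assert np.array_equal(merged, complete)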
diff --git a/python/paddle/distributed/auto_parallel/static/cost/base_cost.py b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py
index 957e5dba46bf0..495cff26844d7 100644
--- a/python/paddle/distributed/auto_parallel/static/cost/base_cost.py
+++ b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py
@@ -784,7 +784,7 @@ def comm_count(self):
         shape = None
         if self.op is not None:
             vars = self.op.block.vars
-            # NOTE: The tensor communicated input_name is "X" in default. Otherwise, this function should be overrided
+            # NOTE: The tensor communicated input_name is "X" in default. Otherwise, this function should be overridden
            try:
                var_name = self.op.input("X")[0]
            except:
diff --git a/python/paddle/distributed/auto_parallel/static/cost_model.py b/python/paddle/distributed/auto_parallel/static/cost_model.py
index 55690e4f3de8f..ad0f353815772 100644
--- a/python/paddle/distributed/auto_parallel/static/cost_model.py
+++ b/python/paddle/distributed/auto_parallel/static/cost_model.py
@@ -98,18 +98,18 @@ def init_comm_cost(self, cluster=None):
         # should get from `cluster`
         BANDWIDTH = 32 * 1024 / 1000  # MB/ms, V100 PCIe
         num_ranks = len(self.ranks)
-        comm_volumn = np.prod(self.input_shape) * 4
+        comm_volume = np.prod(self.input_shape) * 4

         if 'allreduce' in self.comm_type:
-            self._cost = comm_volumn / (
+            self._cost = comm_volume / (
                 BANDWIDTH * num_ranks / (2 * (num_ranks - 1))
             )
         elif 'gather' in self.comm_type:
-            self._cost = comm_volumn / (BANDWIDTH * num_ranks / (num_ranks - 1))
+            self._cost = comm_volume / (BANDWIDTH * num_ranks / (num_ranks - 1))
         elif 'broadcast' in self.comm_type:
-            self._cost = comm_volumn / BANDWIDTH
+            self._cost = comm_volume / BANDWIDTH
         elif 'send' in self.comm_type or 'recv' in self.comm_type:
-            self._cost = comm_volumn / BANDWIDTH
+            self._cost = comm_volume / BANDWIDTH
         else:
             self._cost = 0

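The cost_model.py hunk above renames comm_volumn to comm_volume inside the analytic communication-cost model. As a quick sanity check of those formulas, here is a hedged, self-contained restatement; the constants and unit conventions are copied from the hunk as written, and the function name is invented for illustration, so treat the outputs as relative costs rather than calibrated milliseconds.

    import numpy as np

    BANDWIDTH = 32 * 1024 / 1000  # MB/ms, V100 PCIe (value taken from the hunk)

    def comm_cost(comm_type, input_shape, num_ranks):
        comm_volume = np.prod(input_shape) * 4  # fp32 elements, as in the hunk
        if 'allreduce' in comm_type:
            # ring allreduce: each rank moves roughly 2*(n-1)/n of the data
            return comm_volume / (BANDWIDTH * num_ranks / (2 * (num_ranks - 1)))
        if 'gather' in comm_type:
            return comm_volume / (BANDWIDTH * num_ranks / (num_ranks - 1))
        # broadcast / send / recv move the full volume once
        return comm_volume / BANDWIDTH

    # allreduce across 8 ranks costs ~1.75x a plain broadcast of the same tensor
    ratio = comm_cost('allreduce', [1024, 1024], 8) / comm_cost('broadcast', [1024, 1024], 8)
    print(round(ratio, 2))  # 1.75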
diff --git a/python/paddle/distributed/auto_parallel/static/helper.py b/python/paddle/distributed/auto_parallel/static/helper.py
index c730a68e6ae49..e7bd7553d5094 100644
--- a/python/paddle/distributed/auto_parallel/static/helper.py
+++ b/python/paddle/distributed/auto_parallel/static/helper.py
@@ -211,8 +211,8 @@ class ProgramHelper:

     def __init__(self, layer, loss_func, metrics, inputs_spec, labels_spec):
         # original model config information
-        # TODO(Aurelius84): Implenet append_backward and optimizer in ProxyLayer
-        # after distribute engine satisify basic condition.
+        # TODO(Aurelius84): Implement append_backward and optimizer in ProxyLayer
+        # after distribute engine satisfy basic condition.
         self.proxy_layer = ProxyLayer(layer, loss_func, metrics)
         self.inputs_spec = inputs_spec
         self.labels_spec = labels_spec
diff --git a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
index fb924288988d1..27a13fd1d9107 100644
--- a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
+++ b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
@@ -231,7 +231,7 @@ def _generate_backward(
         # NOTE(zhaoyinglia):
         # Guarantee the order of params_grads is same between dynamic mode and static mode
         # by making parameter_list equal to model.parameters(),
-        # because the order affact the result of ClipGradByGLobalNorm.
+        # because the order affect the result of ClipGradByGLobalNorm.
         # If parameter_list is not None, the order of params_grads is same with parameter_list.
         # If parameter_list is None, params_grads will be as prog.global_block().all_parameters().
         with program_guard(main_program, startup_program):
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index 2c3c4728d4f2e..62b79302f32dd 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -1498,7 +1498,7 @@ def sharding_configs(self):
                       This configuration will affect the communication speed in sharding training, and should be an empirical value decided by your model size and network topology.
                       Only enable when sharding_segment_strategy = segment_broadcast_MB. Default is 32.0 .

-        segment_anchors(list): list of anchors used to segment the program, which allows a finner control of program segmentation.
+        segment_anchors(list): list of anchors used to segment the program, which allows a finer control of program segmentation.
                       this strategy is experimental by now. Only enable when sharding_segment_strategy = segment_anchors.

         sharding_degree(int, optional): specific the number of gpus within each sharding parallelism group; and sharding will be turn off if sharding_degree=1. Default is 8.
diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py
index 81547d24878d5..c9ea552815a83 100755
--- a/python/paddle/distributed/fleet/fleet.py
+++ b/python/paddle/distributed/fleet/fleet.py
@@ -1194,7 +1194,7 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0):
             dirname(str, optional): The saving directory path.
                                 When you need to save the parameter to the memory, set it to None.
-            main_program(Program, optional): The program whose persistbale tensors will
+            main_program(Program, optional): The program whose persistable tensors will
                                              be saved. Default: None.

@@ -1419,7 +1419,7 @@ def amp_init(
             ...     init_loss_scaling=128.0,
             ...     use_dynamic_loss_scaling=True,
             ...     use_pure_fp16=True)
-            ...     # If you don't use the default_startup_program(), you sholud pass
+            ...     # If you don't use the default_startup_program(), you should pass
             ...     # your defined `startup_program` into `minimize`.
             ...     optimizer.minimize(loss)
             ...     exe.run(paddle.static.default_startup_program())
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index dcb5e55f0c25a..146d8a627e5c5 100755
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -115,7 +115,7 @@ def _parse_args():
         "--backend",
         type=str,
         default=os.environ.get('PADDLE_DISTRI_BACKEND', 'auto'),
-        help="Specifize the backend, can be gloo|nccl|bkcl|auto|heter. "
+        help="Specify the backend, can be gloo|nccl|bkcl|auto|heter. "
         "Default value is auto which prefers nccl or bkcl.",
     )
     base_group.add_argument(
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 0b87df4a9c3af..c0a01d43fd688 100755
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -339,12 +339,12 @@ def terminate_local_procs(procs):
                 p.log_fn.close()
             logger.debug(f"terminate process id:{p.proc.pid}")

-    # wait all process terminiated
+    # wait all process terminated
    time.sleep(3)

    for step in range(0, 50):
        alive = False
        for p in procs:
-            if p.proc.poll() is None:  # not termniate
+            if p.proc.poll() is None:  # not terminate
                os.kill(p.proc.pid, signal.SIGKILL)
                alive = True
@@ -414,7 +414,7 @@ def __free_port():
            step += 1
            if step > 400:
                print(
-                    "can't find avilable port and use the specified static port now!"
+                    "can't find available port and use the specified static port now!"
                )
                return None

@@ -705,7 +705,7 @@ def get_gpus(gpus):
                for x in gpus.split(',')
            ]
            logger.info(
-                f"Change selected_gpus into reletive values. --ips:{gpus} "
+                f"Change selected_gpus into relative values. --ips:{gpus} "
                f"will change into relative_ips:{res_gpus} according to your "
                f"CUDA_VISIBLE_DEVICES:{cuda_visible_devices_list}"
            )
@@ -736,7 +736,7 @@ def get_xpus(xpus):
                for x in xpus.split(',')
            ]
            logger.info(
-                f"Change selected_xpus into reletive values. --ips:{xpus} "
+                f"Change selected_xpus into relative values. --ips:{xpus} "
                f"will change into relative_ips:{res_xpus} according to your "
                f"XPU_VISIBLE_DEVICES:{xpu_visible_devices_list}"
            )
@@ -859,9 +859,9 @@ def get_custom_endpoints(origin_endpoints, offset=0):
 #     assert paddle_pserver_endpoints != None
 #
 #     # hard code for paddlecloud custom-framework
-#     avilable_ports = os.getenv("TRAINER_PORTS", "").split(",")
+#     available_ports = os.getenv("TRAINER_PORTS", "").split(",")
 #     assert len(
-#         avilable_ports
+#         available_ports
 #     ) >= 2, "set paddle_ports_num >= 2 in config.ini for paddlecloud job submit"
 #
 #     # hard code for paddlecloud custom-framework
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
index 4924d523ded05..36833fd7b5a97 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
@@ -73,7 +73,7 @@ def _unscale(self, optimizer):
         if not self._use_dp_mode:
             self._found_inf = paddle.cast(self._found_inf, dtype="int32")
             # TODO(shenliang03) Since the minimize call in the optimizer is
-            # after the gradscaler, check_finite needs to synchronize global
+            # after the grad scaler, check_finite needs to synchronize global
             # information. In the future, we should use check_group
             paddle.distributed.all_reduce(
                 self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
index 05f2a4f2a28d6..852e7ced16e4a 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
@@ -103,7 +103,7 @@ def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1):
         - 1: sync_calc
         - 2: reduce_sum_sharding (allreduce --> reduce)
         - 3: sync_comm
-        - 4: allreuce_sum_dp (dp_grads)
+        - 4: allreduce_sum_dp (dp_grads)
         - 5: sync_comm (dp_grads)
         - 6: op that use Var (dp_grads & sum)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
index 2ff259be18b79..1c10efb340618 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
@@ -32,7 +32,7 @@ def prune_weight_decay(self, block, shard):
                 continue
             if OP_ROLE_VAR_KEY not in op.attr_names:
                 raise ValueError(
-                    "The Weight Dacay op should hold op_role_var attribute"
+                    "The Weight Decay op should hold op_role_var attribute"
                     f"but the {op.type} op does not hold op_role_var"
                 )
             op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
index 5d2f561ca974d..298e84ace66f1 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -217,7 +217,7 @@ def _get_hybrid_dp_mode(self):
         # pipeline: communication across nodes, and therefore should insert in update segment,
         #           conduct just once per global step.
         dp_mode = None
-        # dp here is the pure dp as the outest parallelism
+        # dp here is the pure dp as the outermost parallelism
         if self.hybrid_dp:
             if self.pp_degree > 1:
                 dp_mode = "pp_hybrid_dp"
@@ -598,8 +598,8 @@ def _adapt_amp_clip_without_sharding(self):
             rings = [self.mp_ring_id, self.pp_ring_id]
             FP16Utils.sync_amp_check_nan_inf(main_block, rings)

-        gradientclip_helper = GradientClipHelper(None)
-        gradientclip_helper.sync_global_norm(
+        gradient_clip_helper = GradientClipHelper(None)
+        gradient_clip_helper.sync_global_norm(
             main_block, [self.mp_ring_id, self.pp_ring_id], self.mp_rank
         )

@@ -996,8 +996,8 @@ def _prune_main_program(self, block, shard, rings):
         4. prune optimizer op + param + gradient

         """
-        weightdecay_helper = WeightDecayHelper()
-        weightdecay_helper.prune_weight_decay(block, shard)
+        weight_decay_helper = WeightDecayHelper()
+        weight_decay_helper.prune_weight_decay(block, shard)

         # FIXME(wangxi): mp should prune duplicated param_grads
         # NOTE (JZ-LIANG) the sync of FoundInfinite should among one entire Model Parallelism
@@ -1006,8 +1006,8 @@ def _prune_main_program(self, block, shard, rings):
         FP16Utils.prune_fp16(block, shard, self._reduced_grads_to_param, rings)

         # clipbyglobalnorm should only use the Model parallelism group (mp-sharding-pp)
-        gradientclip_helper = GradientClipHelper(None)
-        gradientclip_helper.prune_gradient_clip(block, shard, rings)
+        gradient_clip_helper = GradientClipHelper(None)
+        gradient_clip_helper.prune_gradient_clip(block, shard, rings)

         # build prog deps
         reduced_grads = []
@@ -1645,7 +1645,7 @@ def _build_groups(self):

         # global group
         # use for gen_nccl_comm_sync, amp check nan inf, clip by global norm
-        # NOTE (JZ-LIANG) when use global ring for calc global norm and dp_degree > 1, the allreduce result should be devided by dp_degree
+        # NOTE (JZ-LIANG) when use global ring for calc global norm and dp_degree > 1, the allreduce result should be divided by dp_degree
         self.global_ring_id = 3

         logger.info(f"global word size: {self.global_word_size}")
@@ -1727,7 +1727,7 @@ def recreate_not_persist_param_as_var(program):

     def _initialization_broadcast(self):
         """
-        this funtion is to ensure the initialization between dp group to be
+        this function is to ensure the initialization between dp group to be
         identical when hybrid-dp is used, and the initialization of not
         distributed param between mp group to be identical.
         """
diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
index 21e5dbfbffefc..384d89b4d9c12 100644
--- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
@@ -170,7 +170,7 @@ def __init__(self, layers, hcg, strategy):
             'accumulate_steps'
         ]
         # If sent tensor are not the same from different hosts,
-        # they shouldn't been sent partially and then concated as a whole tensor.
+        # they shouldn't been sent partially and then concatenated as a whole tensor.
         self._enable_partial_send_recv = self._strategy.pipeline_configs[
             'enable_partial_send_recv'
         ]
@@ -640,7 +640,7 @@ def _prepare_training(self, data, optimizer, lr_scheduler):

     def _wrap_data(self, data):
         """
-        for backward compatibilty, wrap data to Fake FakeMicroDataset if it is of type list or tuple
+        for backward compatibility, wrap data to Fake FakeMicroDataset if it is of type list or tuple
         """
         if (not isinstance(data, tuple)) and (not isinstance(data, list)):
             return data
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 2a691c2c4d4fc..046143c79842f 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -342,6 +342,6 @@ def cvt_to_device(x, dev_id, blocking=True):
         place = paddle.XPUPlace(dev_id)
     else:
         raise OSError(
-            "Only supported compiled paddle with gpu/rocm and xpu , but current verison is compiled with cpu."
+            "Only supported compiled paddle with gpu/rocm and xpu, but current version is compiled with cpu."
         )
     return x._copy_to(place, blocking)
diff --git a/python/paddle/distributed/fleet/recompute/recompute.py b/python/paddle/distributed/fleet/recompute/recompute.py
index 8cfa7fbec353d..b59f304d69a42 100644
--- a/python/paddle/distributed/fleet/recompute/recompute.py
+++ b/python/paddle/distributed/fleet/recompute/recompute.py
@@ -93,7 +93,7 @@ def check_recompute_necessary(inputs):


 @contextlib.contextmanager
-def swith_rng_state_tracker(rng_state, tracker):
+def switch_rng_state_tracker(rng_state, tracker):
     orig_rng_state = paddle.get_rng_state()
     orig_rng_tracker = get_rng_state_tracker().get_states_tracker()
     paddle.set_rng_state(rng_state)
@@ -155,8 +155,8 @@ def forward(ctx, run_function, preserve_rng_state, *args, **kwargs):
                 ctx.inputs.append(arg)

         ctx.save_for_backward(*tensor_inputs)
-        # NOTE recompute with restore RNG only support one senario where one process for one cuda gpu.
-        # one process with multiple gpu and mix-gpu-cpu senarios are not support
+        # NOTE recompute with restore RNG only support one scenario where one process for one cuda gpu.
+        # one process with multiple gpu and mix-gpu-cpu scenarios are not support
         if ctx.preserve_rng_state:
             ctx.fw_rng_state = paddle.get_rng_state()
             ctx.fwd_rng_state_tracker = (
@@ -208,7 +208,7 @@ def backward(ctx, *args):
             # NOTE support AMP
             # need restore auto_cast state as well as w/b list
             if ctx.preserve_rng_state:
-                with swith_rng_state_tracker(
+                with switch_rng_state_tracker(
                     ctx.fw_rng_state, ctx.fwd_rng_state_tracker
                 ):
                     with paddle.amp.auto_cast(
@@ -273,7 +273,7 @@ def backward(ctx, *args):
                     # all tensors in the tuple doesn't need grad, only return a None for the whole tuple
                     grads.append(None)
                 else:
-                    # all tensors in the tuple nees grad, should return a tuple of grads
+                    # all tensors in the tuple need grad, should return a tuple of grads
                     grads.append(tuple(i._grad_ivar() for i in inp))

         if in_dynamic_mode():
@@ -303,7 +303,7 @@ def _recompute_without_reentrant(
         fw_cuda_rng_state = paddle.get_rng_state(cur_device)
     else:
         raise RuntimeError(
-            "Recompute with RNG perserve is not support current device: {}.".format(
+            "Recompute with RNG preserve is not support current device: {}.".format(
                 cur_device
             )
         )
@@ -358,10 +358,10 @@ def inner_pack(inner_x):
            return

        def inner_unpack(inner_x):
-            raise Exception("An unexcepted backward called on a tensor!")
+            raise Exception("An unexpected backward called on a tensor!")

        if preserve_rng_state:
-            with swith_rng_state_tracker(
+            with switch_rng_state_tracker(
                fw_cuda_rng_state, fwd_cuda_rng_state_tracker
            ):
                with paddle.set_grad_enabled(True):
diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
index 789f0cac73d94..29e7c73459854 100644
--- a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
+++ b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
@@ -22,7 +22,7 @@
 from .recompute import (
     check_recompute_necessary,
     detach_variable,
-    swith_rng_state_tracker,
+    switch_rng_state_tracker,
 )

 __all__ = []
@@ -198,7 +198,7 @@ def backward(ctx, *args):
         tracer._has_grad = True

         # need restore auto_cast state as well as w/b list
-        with swith_rng_state_tracker(
+        with switch_rng_state_tracker(
             ctx.fwd_rng_state, ctx.fwd_rng_state_tracker
         ):
             if ctx.is_fw_autocast:
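Both recompute.py and recompute_hybrid.py above now use the renamed switch_rng_state_tracker, a context manager that installs a saved RNG state and restores the original on exit so the recomputed forward pass reproduces the same dropout masks. A minimal sketch of that save/swap/restore pattern, using only the public paddle RNG calls that already appear in the hunks (the helper name is illustrative, not the patched API):

    import contextlib
    import paddle

    @contextlib.contextmanager
    def restore_rng_state(rng_state):
        """Temporarily install `rng_state`, restoring the original state on exit."""
        orig_state = paddle.get_rng_state()
        paddle.set_rng_state(rng_state)
        try:
            yield
        finally:
            paddle.set_rng_state(orig_state)

    # Capture the state before the first forward pass...
    saved = paddle.get_rng_state()
    first = paddle.rand([3])
    # ...and replay it so the recomputed forward produces identical randomness.
    with restore_rng_state(saved):
        replayed = paddle.rand([3])
    assert bool(paddle.all(first == replayed))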
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index 865571cfeca6f..f69470397e1d9 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -43,7 +43,7 @@ def _set_basic_info(self, context):
         self.origin_main_program = context["origin_main_program"]
         self.origin_startup_program = context["origin_startup_program"]
         self.async_strategy = self._get_distributed_strategy()
-        self.compiled_strategy = self.build_compiled_startegy()
+        self.compiled_strategy = self.build_compiled_strategy()

     def _get_distributed_strategy(self):
         strategy = None
@@ -69,7 +69,7 @@ def _get_distributed_strategy(self):

         return strategy

-    def build_compiled_startegy(self):
+    def build_compiled_strategy(self):
         from paddle.incubate.distributed.fleet.parameter_server.ir.public import (
             CompileTimeStrategy,
         )
@@ -203,7 +203,7 @@ def get_sparse_attrs():

            if len(dist_varnames) != 0:
                raise ValueError(
-                    "GeoStrategy can not support large scale embeding now, please use paddle.static.nn.embedding"
+                    "GeoStrategy can not support large scale embedding now, please use paddle.static.nn.embedding"
                )

            init_attrs = []
@@ -354,11 +354,11 @@ def _init_server(self, *args, **kwargs):
            sparse_related_optimize_varnames = list(
                set(sparse_related_optimize_varnames)
            )
-            distribtued_varnames = self.compiled_strategy.get_sparse_varname_on_ps(
+            distributed_varnames = self.compiled_strategy.get_sparse_varname_on_ps(
                True
            )
            distributed_related_optimize_varnames = []
-            for var_name in distribtued_varnames:
+            for var_name in distributed_varnames:
                distributed_related_optimize_varnames += (
                    self.compiled_strategy.get_optimize_varname_on_ps(var_name)
                )
@@ -370,7 +370,7 @@ def _init_server(self, *args, **kwargs):
            filter(
                ParameterServerRuntime.__exclude_vars(
                    sparse_varnames
-                    + distribtued_varnames
+                    + distributed_varnames
                    + sparse_related_optimize_varnames
                    + distributed_related_optimize_varnames
                ),
@@ -402,7 +402,7 @@ def _init_server(self, *args, **kwargs):
            # load large scale
            self._load_distributed_params(
                dirname=model_dirname,
-                varnames=distribtued_varnames
+                varnames=distributed_varnames
                + distributed_related_optimize_varnames,
            )
diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py
index a14c337a4fad1..94d403765b1a0 100644
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -684,7 +684,7 @@ def _set_basic_info(self, context):
         self.origin_main_program = context["origin_main_program"]
         self.origin_startup_program = context["origin_startup_program"]
         self.async_strategy = self._get_distributed_strategy()
-        self.compiled_strategy = self.build_compiled_startegy()
+        self.compiled_strategy = self.build_compiled_strategy()

     def _get_distributed_strategy(self):
         strategy = None
@@ -712,7 +712,7 @@ def _get_distributed_strategy(self):
             strategy.use_ps_gpu = True
         return strategy

-    def build_compiled_startegy(self):
+    def build_compiled_strategy(self):
         from paddle.incubate.distributed.fleet.parameter_server.ir.public import (
             CompileTimeStrategy,
         )
@@ -1125,8 +1125,8 @@ def _get_tables():
             if len(tensor_table_dict) > 0:
                 tables = _add_tensor_table(tables)
             else:
-                empty_porgram = Program()
-                self._server_sub_program.append(empty_porgram.desc)
+                empty_program = Program()
+                self._server_sub_program.append(empty_program.desc)

             barrier_table = _build_barrier_table(len(tables))
             tables.append(barrier_table)
diff --git a/python/paddle/distributed/fleet/scaler.py b/python/paddle/distributed/fleet/scaler.py
index 40e182e7f2e40..977b336eb31bb 100755
--- a/python/paddle/distributed/fleet/scaler.py
+++ b/python/paddle/distributed/fleet/scaler.py
@@ -139,7 +139,7 @@ def unscale_method(self, optimizer):
             self._found_inf = self._found_inf.cast("int32")

         # TODO(shenliang03) Since dp allreduce in the optimizer is
-        # after the gradscaler, check_finite needs to synchronize global
+        # after the grad scaler, check_finite needs to synchronize global
         # information. In the future, we should use check_group to speed.
         paddle.distributed.all_reduce(
             self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
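The comment fixed in scaler.py (and in hybrid_parallel_gradscaler.py earlier) describes how the grad scaler agrees on overflow across ranks: each rank's local found_inf flag is combined with a MAX all-reduce, so if any rank saw inf/nan, every rank skips the update. A toy, single-process sketch of that agreement logic; the function below is a stand-in for the collective call, not the Paddle API.

    # Each rank computes a local overflow flag; a MAX reduction makes the
    # decision global: if any rank overflowed, all ranks skip the update.
    def sync_found_inf(local_flags):
        """Stand-in for paddle.distributed.all_reduce(found_inf, op=ReduceOp.MAX)."""
        global_flag = max(local_flags)
        return [global_flag] * len(local_flags)

    local = [0, 0, 1, 0]           # rank 2 saw inf/nan after unscaling
    synced = sync_found_inf(local)
    assert synced == [1, 1, 1, 1]  # every rank now skips optimizer.step()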
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index aa7ec2e544efe..5c2ec7fece24d 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -994,7 +994,7 @@ def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
             fs_src_path(str): Name of the file or directory, that's needed to be moved.
             fs_dst_path(str): Name of the file or directory to which to move to.
             overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False.
-            test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption.
+            test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception.

         Examples:

@@ -1177,7 +1177,7 @@ def _split_files(self, files, trainer_id, trainers):
             trainer_id(int): trainer mpi rank id
             trainers(int): all trainers num
         Returns:
-            fileist(list): file list of current trainer
+            filelist(list): file list of current trainer
         """
         remainder = len(files) % trainers
         blocksize = len(files) // trainers
@@ -1200,7 +1200,7 @@ def list_files_info(self, path_list):
         Args:
             path_list(list): file list
         Returns:
-            fileist(list): file list with file path and size
+            filelist(list): file list with file path and size
         """
         if len(path_list) <= 0:
             return []
@@ -1650,7 +1650,7 @@ def _split_files(self, files, trainer_id, trainers):
             trainer_id(int): trainer mpi rank id
             trainers(int): all trainers num
         Returns:
-            fileist(list): file list of current trainer
+            filelist(list): file list of current trainer
         """
         remainder = len(files) % trainers
         blocksize = len(files) // trainers
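The _split_files docstrings corrected above describe a static sharding contract: split a file list across trainers, with the first len(files) % trainers trainers taking one extra file each. A small illustrative sketch of that contract (not the patched implementation, names invented):

    def split_files(files, trainer_id, trainers):
        """Return the slice of `files` owned by `trainer_id` out of `trainers` workers."""
        remainder = len(files) % trainers
        blocksize = len(files) // trainers
        # The first `remainder` trainers take one extra file each.
        if trainer_id < remainder:
            begin = trainer_id * (blocksize + 1)
            end = begin + blocksize + 1
        else:
            begin = remainder * (blocksize + 1) + (trainer_id - remainder) * blocksize
            end = begin + blocksize
        return files[begin:end]

    files = [f"part-{i:05d}" for i in range(10)]
    shards = [split_files(files, t, 4) for t in range(4)]
    assert sum(len(s) for s in shards) == len(files)  # shard sizes: 3, 3, 2, 2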
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
index d8142b7081f2b..38e6eeca008d6 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
@@ -63,7 +63,7 @@ class HybridParallelInferenceHelper:
             ...     with paddle.base.device_guard(f'{device}:all'):
             ...         # read data from global lod_tensor_array
             ...         element_in_arr = paddle.tensor.array_read(array=arr, i=step_idx)
-            ...         # write placehold data to global lod_tensor_array,
+            ...         # write placeholder data to global lod_tensor_array,
             ...         # it need for send_v2 of lod_tensor_array
             ...         paddle.increment(x=step_idx, value=1.0)
             ...         paddle.tensor.array_write(element_in_arr, i=step_idx, array=arr)
@@ -455,7 +455,7 @@ def _find_prev_op(self, index, var_name):

     def _add_op_device_attr(self, block):
         """
-        Add op_device attrribute for ops in block that have
+        Add op_device attribute for ops in block that have
         not that attribute set.

         Args:
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
index fc0f897b1454c..27aa4c9f54074 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
@@ -263,7 +263,7 @@ def fused_allreduce_gradients(parameter_list, hcg):


 def broadcast_sharding_parameters(model, hcg):
-    # TODO TO save memory, use un-fused broadcast to avoid potentional OOM
+    # TODO TO save memory, use un-fused broadcast to avoid potential OOM
     logger.debug("sharding start init parameters sync")
     sharding_parallel_group = hcg.get_sharding_parallel_group()
     src_rank = hcg.get_sharding_parallel_group_src_rank()
@@ -273,7 +273,7 @@ def broadcast_sharding_parameters(model, hcg):


 def broadcast_sep_parameters(model, hcg):
-    # TODO TO save memory, use un-fused broadcast to avoid potentional OOM
+    # TODO TO save memory, use un-fused broadcast to avoid potential OOM
     logger.debug("sep start init parameters sync")
     sep_group = hcg.get_sep_parallel_group()
     src_rank = hcg.get_sep_parallel_group_src_rank()
diff --git a/python/paddle/distributed/fleet/utils/mix_precision_utils.py b/python/paddle/distributed/fleet/utils/mix_precision_utils.py
index 7b4ff7a0410e5..bbc632029a59b 100644
--- a/python/paddle/distributed/fleet/utils/mix_precision_utils.py
+++ b/python/paddle/distributed/fleet/utils/mix_precision_utils.py
@@ -47,7 +47,7 @@ def __init__(self, layers, dtype="float16"):
                 param._register_grad_hook(self._update_main_grad_hook(param))

     def _update_main_grad_hook(self, param):
-        """Create the update_main_grad hook for backprop."""
+        """Create the update_main_grad hook for back-prop."""
         # Hook used for back-prop and grad-merge.

         @paddle.autograd.no_grad()
diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
index 959f9eb49f40f..dff62c1a22db1 100644
--- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
+++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
@@ -783,7 +783,7 @@ def fused_parameters(
     :param fuse_param: fuse param or not
     :param scale_after_comm: if enable comm overlap, specify the location of grad scale
     :param group_params: the format of the input parameters is param group
-    :param apply_decay_param_fun: the funtion to filter decay param
+    :param apply_decay_param_fun: the function to filter decay param
     :return: param storage if fused, comm buffers if comm overlap, param groups if use group params
     """
     if act is None:
diff --git a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py
index 9ca0a7fdfc89f..88cb6ff27b1aa 100644
--- a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py
+++ b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py
@@ -44,7 +44,7 @@ def tensor_parallel_sync_filter_fn(
     param, pos_emb=True, layer_norm=True, bias=True
 ):
     """
-    Layer fliter function for tensor parallelism transformer.
+    Layer filter function for tensor parallelism transformer.
     In tensor parallelism of transformer like model, there is 4 kind of param
     that are supposed to be the same in all tensor parallel peers:
@@ -111,7 +111,7 @@ def copy_parameters(block_, params):
         )
         assert (
             param.is_distributed is False
-        ), f"Try to sync Distribted Parameter: {param}"
+        ), f"Try to sync Distributed Parameter: {param}"
         new_p.is_distributed = False

         block_.vars[new_p.name] = new_p
@@ -291,7 +291,7 @@ def add_extra_synchronization(
         sync_mode(string): select from
             "broadcast": parameter is sync by broadcasted from 'src_rank' to all other ranks.
-            "average": paramter is sync by average amonge all ranks
+            "average": parameter is sync by average among all ranks

         src_rank(int): the src used in broadcast sync_mode.

@@ -324,7 +324,7 @@ def add_extra_synchronization(
         if params_filter_fn(param):
             params_to_sync.append(param)
     logger.info(
-        "The following param are goning to be synchronization everytime the optimizer update phase of the program is runned: "
+        "The following param are going to be synchronization everytime the optimizer update phase of the program is runned: "
     )
     logger.info([p.name for p in params_to_sync])

diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py
index 0c326c91f5cc6..3bee69f5d7deb 100644
--- a/python/paddle/distributed/launch/context/__init__.py
+++ b/python/paddle/distributed/launch/context/__init__.py
@@ -91,7 +91,7 @@ def get_logger(self, level=logging.INFO):
         logger.addHandler(ch)
         return logger

-    def continous_log(self) -> bool:
+    def continuous_log(self) -> bool:
         if self.args.log_level.upper() in ['DEBUG', 'ERROR']:
             return True
         else:
@@ -102,6 +102,6 @@ def set_env_in_args(self):
             attr, attr_type = v
             if k in self.envs:
                 print(
-                    f"LAUNCH WARNNING args {attr} will be overridden by env: {k} value: {self.envs[k]}"
+                    f"LAUNCH WARNING args {attr} will be overridden by env: {k} value: {self.envs[k]}"
                 )
                 setattr(self.args, attr, attr_type(self.envs[k]))
diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py
index 4553ea1bb776b..e6eae1a94e3f6 100644
--- a/python/paddle/distributed/launch/controllers/controller.py
+++ b/python/paddle/distributed/launch/controllers/controller.py
@@ -95,7 +95,7 @@ def watch(self) -> bool:
         while not self.ctx.status.is_done():
             status = self.pod.watch(timeout=2)

-            # if self.ctx.continous_log():
+            # if self.ctx.continuous_log():
             # default to print log
             self.pod.logs()

diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py
index 5a2009b2fd0f2..4ebda8bc64c25 100644
--- a/python/paddle/distributed/models/moe/utils.py
+++ b/python/paddle/distributed/models/moe/utils.py
@@ -59,7 +59,7 @@ def _number_count(numbers, upper_range):
 def _assign_pos(x, cum_count):
     """
     Assign pos decides which tokens should be fetched belong to
-    specially expert orderingly.
+    specially expert orderly.

     Args:
         x (Tensor): Tensor. Every element in the list must be a Tensor whose data type
diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
index f2b2c140cd6cf..c820a3d882274 100644
--- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
+++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
@@ -63,7 +63,7 @@ class DataParallelOptimizationPass(PassBase):

     def __init__(self):
         super().__init__()
-        # NOTE not use depence on loss and param_grads
+        # NOTE not use dependence on loss and param_grads
         self.set_attr("dist_context", None)
         self.set_attr("global_rank", -1)
         self.set_attr("use_sharding", False)
diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py
index 9fe72c8aabd75..822bdbd6801b2 100644
--- a/python/paddle/distributed/passes/auto_parallel_recompute.py
+++ b/python/paddle/distributed/passes/auto_parallel_recompute.py
@@ -293,7 +293,7 @@ def _check_self(self):
     def _check_conflict(self, other_pass):
         return True

-    def get_ops_per_device(self, ops, all_ops_process_meshs, sr=0):
+    def get_ops_per_device(self, ops, all_ops_process_meshes, sr=0):
         """
         Get ops and op_names of each process mesh excluding ops within the first "sr" chunks
         """
@@ -302,7 +302,7 @@ def reset_recompute_op(op):
             if is_recompute_op(op) or is_recompute_exclude_op(op):
                 op._set_attr("op_namescope", "")

-        all_process_meshes_count = len(all_ops_process_meshs)
+        all_process_meshes_count = len(all_ops_process_meshes)
         ops_of_stages = [[] for _ in range(all_process_meshes_count)]
         op_names_of_stages = [[] for _ in range(all_process_meshes_count)]
         pushed_ops_count = 0
@@ -321,7 +321,7 @@ def reset_recompute_op(op):
             if chunk_id // all_process_meshes_count < sr:
                 continue

-            for id, process_mesh in enumerate(all_ops_process_meshs):
+            for id, process_mesh in enumerate(all_ops_process_meshes):
                 if op.dist_attr.process_mesh == process_mesh:
                     pushed_ops_count += 1
                     ops_of_stages[id].append(op)
@@ -346,15 +346,15 @@ def _apply_single_impl(self, main_program, startup_program, context):
         op_path = _find_op_path(main_program, loss, no_grad_set)

         # 1. mark exclude ops for refined-recompute according to ops-patterns(mainly linear and flash_attn)
-        # 1.1 get all process_meshs in op_path
-        all_ops_process_meshs = []
+        # 1.1 get all process_meshes in op_path
+        all_ops_process_meshes = []
         for op in op_path:
-            if op.dist_attr.process_mesh not in all_ops_process_meshs:
-                all_ops_process_meshs.append(op.dist_attr.process_mesh)
+            if op.dist_attr.process_mesh not in all_ops_process_meshes:
+                all_ops_process_meshes.append(op.dist_attr.process_mesh)

         # 1.2 get ops_devices and op_names_devices
         ops_devices, op_names_devices = self.get_ops_per_device(
-            op_path, all_ops_process_meshs, self._sr
+            op_path, all_ops_process_meshes, self._sr
         )
         all_ops_len = len(op_path)
         all_exclude_ops_ids = [[] for _ in op_names_devices]
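get_ops_per_device in the hunk above buckets the ops on the chosen path by the process mesh they belong to; the renamed all_ops_process_meshes is just the ordered list of distinct meshes. A stripped-down sketch of that bucketing, with plain dictionaries standing in for dist ops (all names illustrative):

    def group_by_mesh(ops, meshes):
        """Bucket `ops` into one list per process mesh, preserving op order."""
        buckets = [[] for _ in meshes]
        for op in ops:
            for idx, mesh in enumerate(meshes):
                if op["process_mesh"] == mesh:
                    buckets[idx].append(op["name"])
                    break
        return buckets

    ops = [
        {"name": "matmul_0", "process_mesh": "mesh_0"},
        {"name": "relu_0", "process_mesh": "mesh_1"},
        {"name": "matmul_1", "process_mesh": "mesh_0"},
    ]
    print(group_by_mesh(ops, ["mesh_0", "mesh_1"]))
    # [['matmul_0', 'matmul_1'], ['relu_0']]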
diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py
index adddb37d26b43..617425158dd89 100644
--- a/python/paddle/distributed/passes/auto_parallel_sharding.py
+++ b/python/paddle/distributed/passes/auto_parallel_sharding.py
@@ -1187,7 +1187,7 @@ def _overlap_grad_comm(
             2.2 insert after communication dependencies only when need
         3. there is not need to add explicit dependencies for non-coalesce gradient communication

-        P.S. this overlap pass is ONLY adapted for standalone executor (graph based) and stream awared allocator.
+        P.S. this overlap pass is ONLY adapted for standalone executor (graph based) and stream award allocator.
         """
         if not self.enable_overlap:
             return
@@ -1309,7 +1309,7 @@ def _overlap_grad_comm(
         # hierarchical grad comm
         if self.enable_hierarchical_comm:
             # NOTE so far we only support Isomorphic cluster with 8 ranks per node
-            # TODO unifiy here create communicators
+            # TODO unify here create communicators
             # create communicators
             nranks_per_node = 8
             assert self.sharding_world_size % nranks_per_node == 0
diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py
index 113f5275d8e7b..eb3e0368c49a8 100755
--- a/python/paddle/distributed/passes/ps_trainer_pass.py
+++ b/python/paddle/distributed/passes/ps_trainer_pass.py
@@ -890,8 +890,8 @@ def _create_heter_program(
     #     joint_var.0_1 -> slice -> reshape -> origin_var
     #     origin_var -> origin_program
     #     reshape -> concat -> joint_var.1_2
-    #   d) copy send op from origin program for var@grad which loacted in current heter block
-    #   e) re-check every op in current blcok if its device is not current heter devie
+    #   d) copy send op from origin program for var@grad which located in current heter block
+    #   e) re-check every op in current block if its device is not current heter device
     # 2. Create send op for step counter in last heter-block
     # 3. Create Listen&Serv OP and Send&Recv OP for distributed training
     # 4. update CompileTimeStrategy for heter_program
diff --git a/python/paddle/distributed/ps/utils/collective_transpiler.py b/python/paddle/distributed/ps/utils/collective_transpiler.py
index 7f398842fd701..8d0ff9a53e551 100644
--- a/python/paddle/distributed/ps/utils/collective_transpiler.py
+++ b/python/paddle/distributed/ps/utils/collective_transpiler.py
@@ -357,7 +357,7 @@ def _insert_allreduce_ops(self):
                     )
                     offset += 1

-                # As we search ops reversedly, we should insert c_allreduce_sum
+                # As we search ops reversely, we should insert c_allreduce_sum
                 # op in the same way to keep the ring_id alternate
                 ring_id = (ring_id + 1) % self.nrings
                 block._insert_op(
@@ -631,7 +631,7 @@ def _insert_allgather_ops(self):
                     )
                     offset += 1

-                # As we search ops reversedly, we should insert c_allgather
+                # As we search ops reversely, we should insert c_allgather
                 # op in the same way to keep the ring_id alternate
                 ring_id = (ring_id + 1) % self.nrings
                 block._insert_op(