From 3f00547fb9bbe9779e268e52e65f85b0374b5e97 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 23 Feb 2024 14:29:40 +0800
Subject: [PATCH] Fix some typos (dst_strategys, etc.) (#62003)

---
 .../auto_parallel/static/auto_align_tool.py    | 12 ++++++------
 .../auto_parallel/static/completion.py         |  2 +-
 .../auto_parallel/static/converter.py          |  6 +++---
 .../auto_parallel/static/cost/base_cost.py     |  2 +-
 .../auto_parallel/static/cost_model.py         | 10 +++++-----
 .../distributed/auto_parallel/static/helper.py |  4 ++--
 .../auto_parallel/static/parallelizer_v2.py    |  2 +-
 .../fleet/base/distributed_strategy.py         |  2 +-
 python/paddle/distributed/fleet/fleet.py       |  4 ++--
 python/paddle/distributed/fleet/launch.py      |  2 +-
 .../paddle/distributed/fleet/launch_utils.py   | 14 +++++++-------
 .../hybrid_parallel_gradscaler.py              |  2 +-
 .../fleet/meta_optimizers/sharding/utils.py    |  2 +-
 .../sharding/weight_decay_helper.py            |  2 +-
 .../meta_optimizers/sharding_optimizer.py      | 18 +++++++++---------
 .../fleet/meta_parallel/pipeline_parallel.py   |  4 ++--
 .../sharding/group_sharded_utils.py            |  2 +-
 .../distributed/fleet/recompute/recompute.py   | 16 ++++++++--------
 .../fleet/recompute/recompute_hybrid.py        |  4 ++--
 .../fleet/runtime/parameter_server_runtime.py  | 14 +++++++-------
 .../distributed/fleet/runtime/the_one_ps.py    |  8 ++++----
 python/paddle/distributed/fleet/scaler.py      |  2 +-
 python/paddle/distributed/fleet/utils/fs.py    |  8 ++++----
 .../fleet/utils/hybrid_parallel_inference.py   |  4 ++--
 .../fleet/utils/hybrid_parallel_util.py        |  4 ++--
 .../fleet/utils/mix_precision_utils.py         |  2 +-
 .../fleet/utils/tensor_fusion_helper.py        |  2 +-
 .../fleet/utils/tensor_parallel_utils.py       |  8 ++++----
 .../distributed/launch/context/__init__.py     |  4 ++--
 .../launch/controllers/controller.py           |  2 +-
 python/paddle/distributed/models/moe/utils.py  |  2 +-
 ...auto_parallel_data_parallel_optimization.py |  2 +-
 .../passes/auto_parallel_recompute.py          | 16 ++++++++--------
 .../passes/auto_parallel_sharding.py           |  4 ++--
 .../distributed/passes/ps_trainer_pass.py      |  4 ++--
 .../ps/utils/collective_transpiler.py          |  4 ++--
 36 files changed, 100 insertions(+), 100 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/static/auto_align_tool.py b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py
index d7d98f75d80f1..b1ced07b8b24e 100644
--- a/python/paddle/distributed/auto_parallel/static/auto_align_tool.py
+++ b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py
@@ -352,13 +352,13 @@ def convert_src_tensor_2_dst_tensor(vars_list, src_attr_map, dst_attr_map):
     if src_attr_map is None or len(src_attr_map) == 0:
         return vars_list[0]

-    dst_strategys = {}
-    src_strategys = {}
+    dst_strategies = {}
+    src_strategies = {}
     tensors_dict = {}

     convert_tensor_dict = None
     for var_name in src_attr_map.keys():
-        assert var_name not in dst_strategys
+        assert var_name not in dst_strategies
         dist_vars = []
         for vars in vars_list:
             if var_name in vars.keys():
@@ -367,13 +367,13 @@ def convert_src_tensor_2_dst_tensor(vars_list, src_attr_map, dst_attr_map):
                 continue

         if var_name in dst_attr_map and var_name in src_attr_map:
-            dst_strategys[var_name] = copy.deepcopy(dst_attr_map[var_name])
-            src_strategys[var_name] = copy.deepcopy(src_attr_map[var_name])
+            dst_strategies[var_name] = copy.deepcopy(dst_attr_map[var_name])
+            src_strategies[var_name] = copy.deepcopy(src_attr_map[var_name])
             tensors_dict[var_name] = dist_vars

     if src_attr_map == dst_attr_map:
         return tensors_dict

-    converter = Converter(tensors_dict, src_strategys, dst_strategys)
+    converter = Converter(tensors_dict, src_strategies, dst_strategies)
     convert_tensor_dict = converter.convert()

     return convert_tensor_dict
diff --git a/python/paddle/distributed/auto_parallel/static/completion.py b/python/paddle/distributed/auto_parallel/static/completion.py
index a671582a3293f..900b90a0f6496 100644
--- a/python/paddle/distributed/auto_parallel/static/completion.py
+++ b/python/paddle/distributed/auto_parallel/static/completion.py
@@ -202,7 +202,7 @@ def _update_op_dims_mapping_and_distoperatorimpl(
         updated = dist_op_container.update_dims_mapping(dist_op)
         changed = updated or changed

-        # TODO(ljz) remove the below code once we introduce general reshard to replace specifc distopimpls
+        # TODO(ljz) remove the below code once we introduce general reshard to replace specific distopimpls
         reverted = dist_op_container.mapping_to_dist_operator_impl(
             dist_op, original_op_dist_attr
         )
diff --git a/python/paddle/distributed/auto_parallel/static/converter.py b/python/paddle/distributed/auto_parallel/static/converter.py
index c7cd4e32d6e42..241a83aaf4f5d 100644
--- a/python/paddle/distributed/auto_parallel/static/converter.py
+++ b/python/paddle/distributed/auto_parallel/static/converter.py
@@ -105,9 +105,9 @@ def convert(self, strict=True):
                 >>> import numpy as np
                 >>> from paddle.distributed.auto_parallel.static.converter import Converter
                 >>> complete_tensors = np.arange(4).reshape([2, 2])
-                >>> partitial_tensors = np.split(complete_tensors, 2, axis=0)
+                >>> partial_tensors = np.split(complete_tensors, 2, axis=0)
                 >>> name = "tmp_0"
-                >>> tensors_dict = {name: partitial_tensors}
+                >>> tensors_dict = {name: partial_tensors}
                 >>> strategy_1 = {
                 ...     name: {
                 ...         "process_shape": [2],
@@ -345,7 +345,7 @@ def slice_with_dist_attr(tensor, dist_attr):
     @staticmethod
     def merge(partition_tensor_list, tensor, partition_index, complete_shape):
         """
-        Merge partitial tensors to a complete.
+        Merge partial tensors to a complete.

         Returns:
             None
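For readers skimming the Converter changes above, the following standalone sketch (not part of the patch) shows what merging "partial" tensors back into a complete tensor means. It only uses numpy and mirrors the doctest kept in converter.py; every name in it is illustrative.

    import numpy as np

    # Split a complete tensor into two "partial" tensors along axis 0,
    # as a 2-way sharding over processes [0, 1] would produce.
    complete = np.arange(4).reshape([2, 2])
    partial_tensors = np.split(complete, 2, axis=0)

    # Merging is the inverse: concatenate the shards along the dimension
    # that was sharded (a dims_mapping of [0, -1] means axis 0).
    merged = np.concatenate(partial_tensors, axis=0)
    assert np.array_equal(merged, complete)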
diff --git a/python/paddle/distributed/auto_parallel/static/cost/base_cost.py b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py
index 957e5dba46bf0..495cff26844d7 100644
--- a/python/paddle/distributed/auto_parallel/static/cost/base_cost.py
+++ b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py
@@ -784,7 +784,7 @@ def comm_count(self):
         shape = None
         if self.op is not None:
             vars = self.op.block.vars
-            # NOTE: The tensor communicated input_name is "X" in default. Otherwise, this function should be overrided
+            # NOTE: The tensor communicated input_name is "X" in default. Otherwise, this function should be overridden
            try:
                var_name = self.op.input("X")[0]
            except:
diff --git a/python/paddle/distributed/auto_parallel/static/cost_model.py b/python/paddle/distributed/auto_parallel/static/cost_model.py
index 55690e4f3de8f..ad0f353815772 100644
--- a/python/paddle/distributed/auto_parallel/static/cost_model.py
+++ b/python/paddle/distributed/auto_parallel/static/cost_model.py
@@ -98,18 +98,18 @@ def init_comm_cost(self, cluster=None):
         # should get from `cluster`
         BANDWIDTH = 32 * 1024 / 1000  # MB/ms, V100 PCIe
         num_ranks = len(self.ranks)
-        comm_volumn = np.prod(self.input_shape) * 4
+        comm_volume = np.prod(self.input_shape) * 4

         if 'allreduce' in self.comm_type:
-            self._cost = comm_volumn / (
+            self._cost = comm_volume / (
                 BANDWIDTH * num_ranks / (2 * (num_ranks - 1))
             )
         elif 'gather' in self.comm_type:
-            self._cost = comm_volumn / (BANDWIDTH * num_ranks / (num_ranks - 1))
+            self._cost = comm_volume / (BANDWIDTH * num_ranks / (num_ranks - 1))
         elif 'broadcast' in self.comm_type:
-            self._cost = comm_volumn / BANDWIDTH
+            self._cost = comm_volume / BANDWIDTH
         elif 'send' in self.comm_type or 'recv' in self.comm_type:
-            self._cost = comm_volumn / BANDWIDTH
+            self._cost = comm_volume / BANDWIDTH
         else:
             self._cost = 0

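The cost_model.py hunk above renames comm_volumn to comm_volume inside the analytic communication-cost model. As a quick sanity check of those formulas, here is a hedged, self-contained restatement; the constants and unit conventions are copied from the hunk as written, and the function name is invented for illustration, so treat the outputs as relative costs rather than calibrated milliseconds.

    import numpy as np

    BANDWIDTH = 32 * 1024 / 1000  # MB/ms, V100 PCIe (value taken from the hunk)

    def comm_cost(comm_type, input_shape, num_ranks):
        comm_volume = np.prod(input_shape) * 4  # fp32 elements, as in the hunk
        if 'allreduce' in comm_type:
            # ring allreduce: each rank moves roughly 2*(n-1)/n of the data
            return comm_volume / (BANDWIDTH * num_ranks / (2 * (num_ranks - 1)))
        if 'gather' in comm_type:
            return comm_volume / (BANDWIDTH * num_ranks / (num_ranks - 1))
        # broadcast / send / recv move the full volume once
        return comm_volume / BANDWIDTH

    # allreduce across 8 ranks costs ~1.75x a plain broadcast of the same tensor
    ratio = comm_cost('allreduce', [1024, 1024], 8) / comm_cost('broadcast', [1024, 1024], 8)
    print(round(ratio, 2))  # 1.75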
diff --git a/python/paddle/distributed/auto_parallel/static/helper.py b/python/paddle/distributed/auto_parallel/static/helper.py
index c730a68e6ae49..e7bd7553d5094 100644
--- a/python/paddle/distributed/auto_parallel/static/helper.py
+++ b/python/paddle/distributed/auto_parallel/static/helper.py
@@ -211,8 +211,8 @@ class ProgramHelper:

     def __init__(self, layer, loss_func, metrics, inputs_spec, labels_spec):
         # original model config information
-        # TODO(Aurelius84): Implenet append_backward and optimizer in ProxyLayer
-        # after distribute engine satisify basic condition.
+        # TODO(Aurelius84): Implement append_backward and optimizer in ProxyLayer
+        # after distribute engine satisfy basic condition.
         self.proxy_layer = ProxyLayer(layer, loss_func, metrics)
         self.inputs_spec = inputs_spec
         self.labels_spec = labels_spec
diff --git a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
index fb924288988d1..27a13fd1d9107 100644
--- a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
+++ b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
@@ -231,7 +231,7 @@ def _generate_backward(
         # NOTE(zhaoyinglia):
         # Guarantee the order of params_grads is same between dynamic mode and static mode
         # by making parameter_list equal to model.parameters(),
-        # because the order affact the result of ClipGradByGLobalNorm.
+        # because the order affect the result of ClipGradByGLobalNorm.
         # If parameter_list is not None, the order of params_grads is same with parameter_list.
         # If parameter_list is None, params_grads will be as prog.global_block().all_parameters().
         with program_guard(main_program, startup_program):
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index 2c3c4728d4f2e..62b79302f32dd 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -1498,7 +1498,7 @@ def sharding_configs(self):
                       This configuration will affect the communication speed in sharding training, and should be an empirical value decided by your model size and network topology.
                       Only enable when sharding_segment_strategy = segment_broadcast_MB. Default is 32.0 .

-        segment_anchors(list): list of anchors used to segment the program, which allows a finner control of program segmentation.
+        segment_anchors(list): list of anchors used to segment the program, which allows a finer control of program segmentation.
                       this strategy is experimental by now. Only enable when sharding_segment_strategy = segment_anchors.

         sharding_degree(int, optional): specific the number of gpus within each sharding parallelism group; and sharding will be turn off if sharding_degree=1. Default is 8.
diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py
index 81547d24878d5..c9ea552815a83 100755
--- a/python/paddle/distributed/fleet/fleet.py
+++ b/python/paddle/distributed/fleet/fleet.py
@@ -1194,7 +1194,7 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0):
             dirname(str, optional): The saving directory path.
                                 When you need to save the parameter to the memory, set it to None.
-            main_program(Program, optional): The program whose persistbale tensors will
+            main_program(Program, optional): The program whose persistable tensors will
                                              be saved. Default: None.

@@ -1419,7 +1419,7 @@ def amp_init(
             ...     init_loss_scaling=128.0,
             ...     use_dynamic_loss_scaling=True,
             ...     use_pure_fp16=True)
-            ...     # If you don't use the default_startup_program(), you sholud pass
+            ...     # If you don't use the default_startup_program(), you should pass
             ...     # your defined `startup_program` into `minimize`.
             ...     optimizer.minimize(loss)
             ...     exe.run(paddle.static.default_startup_program())
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index dcb5e55f0c25a..146d8a627e5c5 100755
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -115,7 +115,7 @@ def _parse_args():
         "--backend",
         type=str,
         default=os.environ.get('PADDLE_DISTRI_BACKEND', 'auto'),
-        help="Specifize the backend, can be gloo|nccl|bkcl|auto|heter. "
+        help="Specify the backend, can be gloo|nccl|bkcl|auto|heter. "
         "Default value is auto which prefers nccl or bkcl.",
     )
     base_group.add_argument(
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 0b87df4a9c3af..c0a01d43fd688 100755
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -339,12 +339,12 @@ def terminate_local_procs(procs):
                 p.log_fn.close()
             logger.debug(f"terminate process id:{p.proc.pid}")

-    # wait all process terminiated
+    # wait all process terminated
    time.sleep(3)

    for step in range(0, 50):
        alive = False
        for p in procs:
-            if p.proc.poll() is None:  # not termniate
+            if p.proc.poll() is None:  # not terminate
                os.kill(p.proc.pid, signal.SIGKILL)
                alive = True
@@ -414,7 +414,7 @@ def __free_port():
            step += 1
            if step > 400:
                print(
-                    "can't find avilable port and use the specified static port now!"
+                    "can't find available port and use the specified static port now!"
                )
                return None

@@ -705,7 +705,7 @@ def get_gpus(gpus):
                for x in gpus.split(',')
            ]
            logger.info(
-                f"Change selected_gpus into reletive values. --ips:{gpus} "
+                f"Change selected_gpus into relative values. --ips:{gpus} "
                f"will change into relative_ips:{res_gpus} according to your "
                f"CUDA_VISIBLE_DEVICES:{cuda_visible_devices_list}"
            )
@@ -736,7 +736,7 @@ def get_xpus(xpus):
                for x in xpus.split(',')
            ]
            logger.info(
-                f"Change selected_xpus into reletive values. --ips:{xpus} "
+                f"Change selected_xpus into relative values. --ips:{xpus} "
                f"will change into relative_ips:{res_xpus} according to your "
                f"XPU_VISIBLE_DEVICES:{xpu_visible_devices_list}"
            )
@@ -859,9 +859,9 @@ def get_custom_endpoints(origin_endpoints, offset=0):
 #     assert paddle_pserver_endpoints != None
 #
 #     # hard code for paddlecloud custom-framework
-#     avilable_ports = os.getenv("TRAINER_PORTS", "").split(",")
+#     available_ports = os.getenv("TRAINER_PORTS", "").split(",")
 #     assert len(
-#         avilable_ports
+#         available_ports
 #     ) >= 2, "set paddle_ports_num >= 2 in config.ini for paddlecloud job submit"
 #
 #     # hard code for paddlecloud custom-framework
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
index 4924d523ded05..36833fd7b5a97 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
@@ -73,7 +73,7 @@ def _unscale(self, optimizer):
         if not self._use_dp_mode:
             self._found_inf = paddle.cast(self._found_inf, dtype="int32")
             # TODO(shenliang03) Since the minimize call in the optimizer is
-            # after the gradscaler, check_finite needs to synchronize global
+            # after the grad scaler, check_finite needs to synchronize global
             # information. In the future, we should use check_group
             paddle.distributed.all_reduce(
                 self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
index 05f2a4f2a28d6..852e7ced16e4a 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
@@ -103,7 +103,7 @@ def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1):
         - 1: sync_calc
         - 2: reduce_sum_sharding (allreduce --> reduce)
         - 3: sync_comm
-        - 4: allreuce_sum_dp (dp_grads)
+        - 4: allreduce_sum_dp (dp_grads)
         - 5: sync_comm (dp_grads)
         - 6: op that use Var (dp_grads & sum)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
index 2ff259be18b79..1c10efb340618 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
@@ -32,7 +32,7 @@ def prune_weight_decay(self, block, shard):
                 continue
             if OP_ROLE_VAR_KEY not in op.attr_names:
                 raise ValueError(
-                    "The Weight Dacay op should hold op_role_var attribute"
+                    "The Weight Decay op should hold op_role_var attribute"
                     f"but the {op.type} op does not hold op_role_var"
                 )
             op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
index 5d2f561ca974d..298e84ace66f1 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -217,7 +217,7 @@ def _get_hybrid_dp_mode(self):
         # pipeline: communication across nodes, and therefore should insert in update segment,
         #           conduct just once per global step.
         dp_mode = None
-        # dp here is the pure dp as the outest parallelism
+        # dp here is the pure dp as the outermost parallelism
         if self.hybrid_dp:
             if self.pp_degree > 1:
                 dp_mode = "pp_hybrid_dp"
@@ -598,8 +598,8 @@ def _adapt_amp_clip_without_sharding(self):
             rings = [self.mp_ring_id, self.pp_ring_id]
             FP16Utils.sync_amp_check_nan_inf(main_block, rings)

-        gradientclip_helper = GradientClipHelper(None)
-        gradientclip_helper.sync_global_norm(
+        gradient_clip_helper = GradientClipHelper(None)
+        gradient_clip_helper.sync_global_norm(
             main_block, [self.mp_ring_id, self.pp_ring_id], self.mp_rank
         )

@@ -996,8 +996,8 @@ def _prune_main_program(self, block, shard, rings):
         4. prune optimizer op + param + gradient

         """
-        weightdecay_helper = WeightDecayHelper()
-        weightdecay_helper.prune_weight_decay(block, shard)
+        weight_decay_helper = WeightDecayHelper()
+        weight_decay_helper.prune_weight_decay(block, shard)

         # FIXME(wangxi): mp should prune duplicated param_grads
         # NOTE (JZ-LIANG) the sync of FoundInfinite should among one entire Model Parallelism
@@ -1006,8 +1006,8 @@ def _prune_main_program(self, block, shard, rings):
         FP16Utils.prune_fp16(block, shard, self._reduced_grads_to_param, rings)

         # clipbyglobalnorm should only use the Model parallelism group (mp-sharding-pp)
-        gradientclip_helper = GradientClipHelper(None)
-        gradientclip_helper.prune_gradient_clip(block, shard, rings)
+        gradient_clip_helper = GradientClipHelper(None)
+        gradient_clip_helper.prune_gradient_clip(block, shard, rings)

         # build prog deps
         reduced_grads = []
@@ -1645,7 +1645,7 @@ def _build_groups(self):

         # global group
         # use for gen_nccl_comm_sync, amp check nan inf, clip by global norm
-        # NOTE (JZ-LIANG) when use global ring for calc global norm and dp_degree > 1, the allreduce result should be devided by dp_degree
+        # NOTE (JZ-LIANG) when use global ring for calc global norm and dp_degree > 1, the allreduce result should be divided by dp_degree
         self.global_ring_id = 3

         logger.info(f"global word size: {self.global_word_size}")
@@ -1727,7 +1727,7 @@ def recreate_not_persist_param_as_var(program):

     def _initialization_broadcast(self):
         """
-        this funtion is to ensure the initialization between dp group to be
+        this function is to ensure the initialization between dp group to be
         identical when hybrid-dp is used, and the initialization of not
         distributed param between mp group to be identical.
         """
diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
index 21e5dbfbffefc..384d89b4d9c12 100644
--- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
@@ -170,7 +170,7 @@ def __init__(self, layers, hcg, strategy):
             'accumulate_steps'
         ]
         # If sent tensor are not the same from different hosts,
-        # they shouldn't been sent partially and then concated as a whole tensor.
+        # they shouldn't been sent partially and then concatenated as a whole tensor.
         self._enable_partial_send_recv = self._strategy.pipeline_configs[
             'enable_partial_send_recv'
         ]
@@ -640,7 +640,7 @@ def _prepare_training(self, data, optimizer, lr_scheduler):

     def _wrap_data(self, data):
         """
-        for backward compatibilty, wrap data to Fake FakeMicroDataset if it is of type list or tuple
+        for backward compatibility, wrap data to Fake FakeMicroDataset if it is of type list or tuple
         """
         if (not isinstance(data, tuple)) and (not isinstance(data, list)):
             return data
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 2a691c2c4d4fc..046143c79842f 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -342,6 +342,6 @@ def cvt_to_device(x, dev_id, blocking=True):
         place = paddle.XPUPlace(dev_id)
     else:
         raise OSError(
-            "Only supported compiled paddle with gpu/rocm and xpu , but current verison is compiled with cpu."
+            "Only supported compiled paddle with gpu/rocm and xpu, but current version is compiled with cpu."
         )
     return x._copy_to(place, blocking)
diff --git a/python/paddle/distributed/fleet/recompute/recompute.py b/python/paddle/distributed/fleet/recompute/recompute.py
index 8cfa7fbec353d..b59f304d69a42 100644
--- a/python/paddle/distributed/fleet/recompute/recompute.py
+++ b/python/paddle/distributed/fleet/recompute/recompute.py
@@ -93,7 +93,7 @@ def check_recompute_necessary(inputs):


 @contextlib.contextmanager
-def swith_rng_state_tracker(rng_state, tracker):
+def switch_rng_state_tracker(rng_state, tracker):
     orig_rng_state = paddle.get_rng_state()
     orig_rng_tracker = get_rng_state_tracker().get_states_tracker()
     paddle.set_rng_state(rng_state)
@@ -155,8 +155,8 @@ def forward(ctx, run_function, preserve_rng_state, *args, **kwargs):
                 ctx.inputs.append(arg)

         ctx.save_for_backward(*tensor_inputs)
-        # NOTE recompute with restore RNG only support one senario where one process for one cuda gpu.
-        # one process with multiple gpu and mix-gpu-cpu senarios are not support
+        # NOTE recompute with restore RNG only support one scenario where one process for one cuda gpu.
+        # one process with multiple gpu and mix-gpu-cpu scenarios are not support
         if ctx.preserve_rng_state:
             ctx.fw_rng_state = paddle.get_rng_state()
             ctx.fwd_rng_state_tracker = (
@@ -208,7 +208,7 @@ def backward(ctx, *args):
             # NOTE support AMP
             # need restore auto_cast state as well as w/b list
             if ctx.preserve_rng_state:
-                with swith_rng_state_tracker(
+                with switch_rng_state_tracker(
                     ctx.fw_rng_state, ctx.fwd_rng_state_tracker
                 ):
                     with paddle.amp.auto_cast(
@@ -273,7 +273,7 @@ def backward(ctx, *args):
                     # all tensors in the tuple doesn't need grad, only return a None for the whole tuple
                     grads.append(None)
                 else:
-                    # all tensors in the tuple nees grad, should return a tuple of grads
+                    # all tensors in the tuple need grad, should return a tuple of grads
                     grads.append(tuple(i._grad_ivar() for i in inp))

         if in_dynamic_mode():
@@ -303,7 +303,7 @@ def _recompute_without_reentrant(
         fw_cuda_rng_state = paddle.get_rng_state(cur_device)
     else:
         raise RuntimeError(
-            "Recompute with RNG perserve is not support current device: {}.".format(
+            "Recompute with RNG preserve is not support current device: {}.".format(
                 cur_device
             )
         )
@@ -358,10 +358,10 @@ def inner_pack(inner_x):
            return

        def inner_unpack(inner_x):
-            raise Exception("An unexcepted backward called on a tensor!")
+            raise Exception("An unexpected backward called on a tensor!")

        if preserve_rng_state:
-            with swith_rng_state_tracker(
+            with switch_rng_state_tracker(
                fw_cuda_rng_state, fwd_cuda_rng_state_tracker
            ):
                with paddle.set_grad_enabled(True):
diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
index 789f0cac73d94..29e7c73459854 100644
--- a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
+++ b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
@@ -22,7 +22,7 @@
 from .recompute import (
     check_recompute_necessary,
     detach_variable,
-    swith_rng_state_tracker,
+    switch_rng_state_tracker,
 )

 __all__ = []
@@ -198,7 +198,7 @@ def backward(ctx, *args):
         tracer._has_grad = True

         # need restore auto_cast state as well as w/b list
-        with swith_rng_state_tracker(
+        with switch_rng_state_tracker(
             ctx.fwd_rng_state, ctx.fwd_rng_state_tracker
         ):
             if ctx.is_fw_autocast:
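Both recompute.py and recompute_hybrid.py above now use the renamed switch_rng_state_tracker, a context manager that installs a saved RNG state and restores the original on exit so the recomputed forward pass reproduces the same dropout masks. A minimal sketch of that save/swap/restore pattern, using only the public paddle RNG calls that already appear in the hunks (the helper name is illustrative, not the patched API):

    import contextlib
    import paddle

    @contextlib.contextmanager
    def restore_rng_state(rng_state):
        """Temporarily install `rng_state`, restoring the original state on exit."""
        orig_state = paddle.get_rng_state()
        paddle.set_rng_state(rng_state)
        try:
            yield
        finally:
            paddle.set_rng_state(orig_state)

    # Capture the state before the first forward pass...
    saved = paddle.get_rng_state()
    first = paddle.rand([3])
    # ...and replay it so the recomputed forward produces identical randomness.
    with restore_rng_state(saved):
        replayed = paddle.rand([3])
    assert bool(paddle.all(first == replayed))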
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index 865571cfeca6f..f69470397e1d9 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -43,7 +43,7 @@ def _set_basic_info(self, context):
         self.origin_main_program = context["origin_main_program"]
         self.origin_startup_program = context["origin_startup_program"]
         self.async_strategy = self._get_distributed_strategy()
-        self.compiled_strategy = self.build_compiled_startegy()
+        self.compiled_strategy = self.build_compiled_strategy()

     def _get_distributed_strategy(self):
         strategy = None
@@ -69,7 +69,7 @@ def _get_distributed_strategy(self):

         return strategy

-    def build_compiled_startegy(self):
+    def build_compiled_strategy(self):
         from paddle.incubate.distributed.fleet.parameter_server.ir.public import (
             CompileTimeStrategy,
         )
@@ -203,7 +203,7 @@ def get_sparse_attrs():

            if len(dist_varnames) != 0:
                raise ValueError(
-                    "GeoStrategy can not support large scale embeding now, please use paddle.static.nn.embedding"
+                    "GeoStrategy can not support large scale embedding now, please use paddle.static.nn.embedding"
                )

            init_attrs = []
@@ -354,11 +354,11 @@ def _init_server(self, *args, **kwargs):
            sparse_related_optimize_varnames = list(
                set(sparse_related_optimize_varnames)
            )
-            distribtued_varnames = self.compiled_strategy.get_sparse_varname_on_ps(
+            distributed_varnames = self.compiled_strategy.get_sparse_varname_on_ps(
                True
            )
            distributed_related_optimize_varnames = []
-            for var_name in distribtued_varnames:
+            for var_name in distributed_varnames:
                distributed_related_optimize_varnames += (
                    self.compiled_strategy.get_optimize_varname_on_ps(var_name)
                )
@@ -370,7 +370,7 @@ def _init_server(self, *args, **kwargs):
            filter(
                ParameterServerRuntime.__exclude_vars(
                    sparse_varnames
-                    + distribtued_varnames
+                    + distributed_varnames
                    + sparse_related_optimize_varnames
                    + distributed_related_optimize_varnames
                ),
@@ -402,7 +402,7 @@ def _init_server(self, *args, **kwargs):
            # load large scale
            self._load_distributed_params(
                dirname=model_dirname,
-                varnames=distribtued_varnames
+                varnames=distributed_varnames
                + distributed_related_optimize_varnames,
            )
diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py
index a14c337a4fad1..94d403765b1a0 100644
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -684,7 +684,7 @@ def _set_basic_info(self, context):
         self.origin_main_program = context["origin_main_program"]
         self.origin_startup_program = context["origin_startup_program"]
         self.async_strategy = self._get_distributed_strategy()
-        self.compiled_strategy = self.build_compiled_startegy()
+        self.compiled_strategy = self.build_compiled_strategy()

     def _get_distributed_strategy(self):
         strategy = None
@@ -712,7 +712,7 @@ def _get_distributed_strategy(self):
             strategy.use_ps_gpu = True
         return strategy

-    def build_compiled_startegy(self):
+    def build_compiled_strategy(self):
         from paddle.incubate.distributed.fleet.parameter_server.ir.public import (
             CompileTimeStrategy,
         )
@@ -1125,8 +1125,8 @@ def _get_tables():
             if len(tensor_table_dict) > 0:
                 tables = _add_tensor_table(tables)
             else:
-                empty_porgram = Program()
-                self._server_sub_program.append(empty_porgram.desc)
+                empty_program = Program()
+                self._server_sub_program.append(empty_program.desc)

             barrier_table = _build_barrier_table(len(tables))
             tables.append(barrier_table)
diff --git a/python/paddle/distributed/fleet/scaler.py b/python/paddle/distributed/fleet/scaler.py
index 40e182e7f2e40..977b336eb31bb 100755
--- a/python/paddle/distributed/fleet/scaler.py
+++ b/python/paddle/distributed/fleet/scaler.py
@@ -139,7 +139,7 @@ def unscale_method(self, optimizer):
             self._found_inf = self._found_inf.cast("int32")

         # TODO(shenliang03) Since dp allreduce in the optimizer is
-        # after the gradscaler, check_finite needs to synchronize global
+        # after the grad scaler, check_finite needs to synchronize global
         # information. In the future, we should use check_group to speed.
         paddle.distributed.all_reduce(
             self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
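The comment fixed in scaler.py (and in hybrid_parallel_gradscaler.py earlier) describes how the grad scaler agrees on overflow across ranks: each rank's local found_inf flag is combined with a MAX all-reduce, so if any rank saw inf/nan, every rank skips the update. A toy, single-process sketch of that agreement logic; the function below is a stand-in for the collective call, not the Paddle API.

    # Each rank computes a local overflow flag; a MAX reduction makes the
    # decision global: if any rank overflowed, all ranks skip the update.
    def sync_found_inf(local_flags):
        """Stand-in for paddle.distributed.all_reduce(found_inf, op=ReduceOp.MAX)."""
        global_flag = max(local_flags)
        return [global_flag] * len(local_flags)

    local = [0, 0, 1, 0]           # rank 2 saw inf/nan after unscaling
    synced = sync_found_inf(local)
    assert synced == [1, 1, 1, 1]  # every rank now skips optimizer.step()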
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index aa7ec2e544efe..5c2ec7fece24d 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -994,7 +994,7 @@ def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
             fs_src_path(str): Name of the file or directory, that's needed to be moved.
             fs_dst_path(str): Name of the file or directory to which to move to.
             overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False.
-            test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption.
+            test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception.

         Examples:

@@ -1177,7 +1177,7 @@ def _split_files(self, files, trainer_id, trainers):
             trainer_id(int): trainer mpi rank id
             trainers(int): all trainers num
         Returns:
-            fileist(list): file list of current trainer
+            filelist(list): file list of current trainer
         """
         remainder = len(files) % trainers
         blocksize = len(files) // trainers
@@ -1200,7 +1200,7 @@ def list_files_info(self, path_list):
         Args:
             path_list(list): file list
         Returns:
-            fileist(list): file list with file path and size
+            filelist(list): file list with file path and size
         """
         if len(path_list) <= 0:
             return []
@@ -1650,7 +1650,7 @@ def _split_files(self, files, trainer_id, trainers):
             trainer_id(int): trainer mpi rank id
             trainers(int): all trainers num
         Returns:
-            fileist(list): file list of current trainer
+            filelist(list): file list of current trainer
         """
         remainder = len(files) % trainers
         blocksize = len(files) // trainers
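The _split_files docstrings corrected above describe a static sharding contract: split a file list across trainers, with the first len(files) % trainers trainers taking one extra file each. A small illustrative sketch of that contract (not the patched implementation, names invented):

    def split_files(files, trainer_id, trainers):
        """Return the slice of `files` owned by `trainer_id` out of `trainers` workers."""
        remainder = len(files) % trainers
        blocksize = len(files) // trainers
        # The first `remainder` trainers take one extra file each.
        if trainer_id < remainder:
            begin = trainer_id * (blocksize + 1)
            end = begin + blocksize + 1
        else:
            begin = remainder * (blocksize + 1) + (trainer_id - remainder) * blocksize
            end = begin + blocksize
        return files[begin:end]

    files = [f"part-{i:05d}" for i in range(10)]
    shards = [split_files(files, t, 4) for t in range(4)]
    assert sum(len(s) for s in shards) == len(files)  # shard sizes: 3, 3, 2, 2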
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
index d8142b7081f2b..38e6eeca008d6 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
@@ -63,7 +63,7 @@ class HybridParallelInferenceHelper:
             ...     with paddle.base.device_guard(f'{device}:all'):
             ...         # read data from global lod_tensor_array
             ...         element_in_arr = paddle.tensor.array_read(array=arr, i=step_idx)
-            ...         # write placehold data to global lod_tensor_array,
+            ...         # write placeholder data to global lod_tensor_array,
             ...         # it need for send_v2 of lod_tensor_array
             ...         paddle.increment(x=step_idx, value=1.0)
             ...         paddle.tensor.array_write(element_in_arr, i=step_idx, array=arr)
@@ -455,7 +455,7 @@ def _find_prev_op(self, index, var_name):

     def _add_op_device_attr(self, block):
         """
-        Add op_device attrribute for ops in block that have
+        Add op_device attribute for ops in block that have
         not that attribute set.

         Args:
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
index fc0f897b1454c..27aa4c9f54074 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
@@ -263,7 +263,7 @@ def fused_allreduce_gradients(parameter_list, hcg):


 def broadcast_sharding_parameters(model, hcg):
-    # TODO TO save memory, use un-fused broadcast to avoid potentional OOM
+    # TODO TO save memory, use un-fused broadcast to avoid potential OOM
     logger.debug("sharding start init parameters sync")
     sharding_parallel_group = hcg.get_sharding_parallel_group()
     src_rank = hcg.get_sharding_parallel_group_src_rank()
@@ -273,7 +273,7 @@ def broadcast_sharding_parameters(model, hcg):


 def broadcast_sep_parameters(model, hcg):
-    # TODO TO save memory, use un-fused broadcast to avoid potentional OOM
+    # TODO TO save memory, use un-fused broadcast to avoid potential OOM
     logger.debug("sep start init parameters sync")
     sep_group = hcg.get_sep_parallel_group()
     src_rank = hcg.get_sep_parallel_group_src_rank()
diff --git a/python/paddle/distributed/fleet/utils/mix_precision_utils.py b/python/paddle/distributed/fleet/utils/mix_precision_utils.py
index 7b4ff7a0410e5..bbc632029a59b 100644
--- a/python/paddle/distributed/fleet/utils/mix_precision_utils.py
+++ b/python/paddle/distributed/fleet/utils/mix_precision_utils.py
@@ -47,7 +47,7 @@ def __init__(self, layers, dtype="float16"):
                 param._register_grad_hook(self._update_main_grad_hook(param))

     def _update_main_grad_hook(self, param):
-        """Create the update_main_grad hook for backprop."""
+        """Create the update_main_grad hook for back-prop."""
         # Hook used for back-prop and grad-merge.

         @paddle.autograd.no_grad()
diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
index 959f9eb49f40f..dff62c1a22db1 100644
--- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
+++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
@@ -783,7 +783,7 @@ def fused_parameters(
     :param fuse_param: fuse param or not
     :param scale_after_comm: if enable comm overlap, specify the location of grad scale
     :param group_params: the format of the input parameters is param group
-    :param apply_decay_param_fun: the funtion to filter decay param
+    :param apply_decay_param_fun: the function to filter decay param
     :return: param storage if fused, comm buffers if comm overlap, param groups if use group params
     """
     if act is None:
diff --git a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py
index 9ca0a7fdfc89f..88cb6ff27b1aa 100644
--- a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py
+++ b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py
@@ -44,7 +44,7 @@ def tensor_parallel_sync_filter_fn(
     param, pos_emb=True, layer_norm=True, bias=True
 ):
     """
-    Layer fliter function for tensor parallelism transformer.
+    Layer filter function for tensor parallelism transformer.
     In tensor parallelism of transformer like model, there is 4 kind of param
     that are supposed to be the same in all tensor parallel peers:
@@ -111,7 +111,7 @@ def copy_parameters(block_, params):
         )
         assert (
             param.is_distributed is False
-        ), f"Try to sync Distribted Parameter: {param}"
+        ), f"Try to sync Distributed Parameter: {param}"
         new_p.is_distributed = False

         block_.vars[new_p.name] = new_p
@@ -291,7 +291,7 @@ def add_extra_synchronization(
         sync_mode(string): select from
             "broadcast": parameter is sync by broadcasted from 'src_rank' to all other ranks.
-            "average": paramter is sync by average amonge all ranks
+            "average": parameter is sync by average among all ranks

         src_rank(int): the src used in broadcast sync_mode.

@@ -324,7 +324,7 @@ def add_extra_synchronization(
         if params_filter_fn(param):
             params_to_sync.append(param)
     logger.info(
-        "The following param are goning to be synchronization everytime the optimizer update phase of the program is runned: "
+        "The following param are going to be synchronization everytime the optimizer update phase of the program is runned: "
     )
     logger.info([p.name for p in params_to_sync])

diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py
index 0c326c91f5cc6..3bee69f5d7deb 100644
--- a/python/paddle/distributed/launch/context/__init__.py
+++ b/python/paddle/distributed/launch/context/__init__.py
@@ -91,7 +91,7 @@ def get_logger(self, level=logging.INFO):
         logger.addHandler(ch)
         return logger

-    def continous_log(self) -> bool:
+    def continuous_log(self) -> bool:
         if self.args.log_level.upper() in ['DEBUG', 'ERROR']:
             return True
         else:
@@ -102,6 +102,6 @@ def set_env_in_args(self):
             attr, attr_type = v
             if k in self.envs:
                 print(
-                    f"LAUNCH WARNNING args {attr} will be overridden by env: {k} value: {self.envs[k]}"
+                    f"LAUNCH WARNING args {attr} will be overridden by env: {k} value: {self.envs[k]}"
                 )
                 setattr(self.args, attr, attr_type(self.envs[k]))
diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py
index 4553ea1bb776b..e6eae1a94e3f6 100644
--- a/python/paddle/distributed/launch/controllers/controller.py
+++ b/python/paddle/distributed/launch/controllers/controller.py
@@ -95,7 +95,7 @@ def watch(self) -> bool:
         while not self.ctx.status.is_done():
             status = self.pod.watch(timeout=2)

-            # if self.ctx.continous_log():
+            # if self.ctx.continuous_log():
             # default to print log
             self.pod.logs()

diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py
index 5a2009b2fd0f2..4ebda8bc64c25 100644
--- a/python/paddle/distributed/models/moe/utils.py
+++ b/python/paddle/distributed/models/moe/utils.py
@@ -59,7 +59,7 @@ def _number_count(numbers, upper_range):
 def _assign_pos(x, cum_count):
     """
     Assign pos decides which tokens should be fetched belong to
-    specially expert orderingly.
+    specially expert orderly.

     Args:
         x (Tensor): Tensor. Every element in the list must be a Tensor whose data type
diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
index f2b2c140cd6cf..c820a3d882274 100644
--- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
+++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
@@ -63,7 +63,7 @@ class DataParallelOptimizationPass(PassBase):

     def __init__(self):
         super().__init__()
-        # NOTE not use depence on loss and param_grads
+        # NOTE not use dependence on loss and param_grads
         self.set_attr("dist_context", None)
         self.set_attr("global_rank", -1)
         self.set_attr("use_sharding", False)
diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py
index 9fe72c8aabd75..822bdbd6801b2 100644
--- a/python/paddle/distributed/passes/auto_parallel_recompute.py
+++ b/python/paddle/distributed/passes/auto_parallel_recompute.py
@@ -293,7 +293,7 @@ def _check_self(self):
     def _check_conflict(self, other_pass):
         return True

-    def get_ops_per_device(self, ops, all_ops_process_meshs, sr=0):
+    def get_ops_per_device(self, ops, all_ops_process_meshes, sr=0):
         """
         Get ops and op_names of each process mesh excluding ops within the first "sr" chunks
         """
@@ -302,7 +302,7 @@ def reset_recompute_op(op):
             if is_recompute_op(op) or is_recompute_exclude_op(op):
                 op._set_attr("op_namescope", "")

-        all_process_meshes_count = len(all_ops_process_meshs)
+        all_process_meshes_count = len(all_ops_process_meshes)
         ops_of_stages = [[] for _ in range(all_process_meshes_count)]
         op_names_of_stages = [[] for _ in range(all_process_meshes_count)]
         pushed_ops_count = 0
@@ -321,7 +321,7 @@ def reset_recompute_op(op):
             if chunk_id // all_process_meshes_count < sr:
                 continue

-            for id, process_mesh in enumerate(all_ops_process_meshs):
+            for id, process_mesh in enumerate(all_ops_process_meshes):
                 if op.dist_attr.process_mesh == process_mesh:
                     pushed_ops_count += 1
                     ops_of_stages[id].append(op)
@@ -346,15 +346,15 @@ def _apply_single_impl(self, main_program, startup_program, context):
         op_path = _find_op_path(main_program, loss, no_grad_set)

         # 1. mark exclude ops for refined-recompute according to ops-patterns(mainly linear and flash_attn)
-        # 1.1 get all process_meshs in op_path
-        all_ops_process_meshs = []
+        # 1.1 get all process_meshes in op_path
+        all_ops_process_meshes = []
         for op in op_path:
-            if op.dist_attr.process_mesh not in all_ops_process_meshs:
-                all_ops_process_meshs.append(op.dist_attr.process_mesh)
+            if op.dist_attr.process_mesh not in all_ops_process_meshes:
+                all_ops_process_meshes.append(op.dist_attr.process_mesh)

         # 1.2 get ops_devices and op_names_devices
         ops_devices, op_names_devices = self.get_ops_per_device(
-            op_path, all_ops_process_meshs, self._sr
+            op_path, all_ops_process_meshes, self._sr
         )
         all_ops_len = len(op_path)
         all_exclude_ops_ids = [[] for _ in op_names_devices]
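get_ops_per_device in the hunk above buckets the ops on the chosen path by the process mesh they belong to; the renamed all_ops_process_meshes is just the ordered list of distinct meshes. A stripped-down sketch of that bucketing, with plain dictionaries standing in for dist ops (all names illustrative):

    def group_by_mesh(ops, meshes):
        """Bucket `ops` into one list per process mesh, preserving op order."""
        buckets = [[] for _ in meshes]
        for op in ops:
            for idx, mesh in enumerate(meshes):
                if op["process_mesh"] == mesh:
                    buckets[idx].append(op["name"])
                    break
        return buckets

    ops = [
        {"name": "matmul_0", "process_mesh": "mesh_0"},
        {"name": "relu_0", "process_mesh": "mesh_1"},
        {"name": "matmul_1", "process_mesh": "mesh_0"},
    ]
    print(group_by_mesh(ops, ["mesh_0", "mesh_1"]))
    # [['matmul_0', 'matmul_1'], ['relu_0']]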
diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py
index adddb37d26b43..617425158dd89 100644
--- a/python/paddle/distributed/passes/auto_parallel_sharding.py
+++ b/python/paddle/distributed/passes/auto_parallel_sharding.py
@@ -1187,7 +1187,7 @@ def _overlap_grad_comm(
             2.2 insert after communication dependencies only when need
         3. there is not need to add explicit dependencies for non-coalesce gradient communication

-        P.S. this overlap pass is ONLY adapted for standalone executor (graph based) and stream awared allocator.
+        P.S. this overlap pass is ONLY adapted for standalone executor (graph based) and stream award allocator.
         """
         if not self.enable_overlap:
             return
@@ -1309,7 +1309,7 @@ def _overlap_grad_comm(
         # hierarchical grad comm
         if self.enable_hierarchical_comm:
             # NOTE so far we only support Isomorphic cluster with 8 ranks per node
-            # TODO unifiy here create communicators
+            # TODO unify here create communicators
             # create communicators
             nranks_per_node = 8
             assert self.sharding_world_size % nranks_per_node == 0
diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py
index 113f5275d8e7b..eb3e0368c49a8 100755
--- a/python/paddle/distributed/passes/ps_trainer_pass.py
+++ b/python/paddle/distributed/passes/ps_trainer_pass.py
@@ -890,8 +890,8 @@ def _create_heter_program(
     #     joint_var.0_1 -> slice -> reshape -> origin_var
     #     origin_var -> origin_program
     #     reshape -> concat -> joint_var.1_2
-    #   d) copy send op from origin program for var@grad which loacted in current heter block
-    #   e) re-check every op in current blcok if its device is not current heter devie
+    #   d) copy send op from origin program for var@grad which located in current heter block
+    #   e) re-check every op in current block if its device is not current heter device
     # 2. Create send op for step counter in last heter-block
     # 3. Create Listen&Serv OP and Send&Recv OP for distributed training
     # 4. update CompileTimeStrategy for heter_program
diff --git a/python/paddle/distributed/ps/utils/collective_transpiler.py b/python/paddle/distributed/ps/utils/collective_transpiler.py
index 7f398842fd701..8d0ff9a53e551 100644
--- a/python/paddle/distributed/ps/utils/collective_transpiler.py
+++ b/python/paddle/distributed/ps/utils/collective_transpiler.py
@@ -357,7 +357,7 @@ def _insert_allreduce_ops(self):
                     )
                     offset += 1

-                # As we search ops reversedly, we should insert c_allreduce_sum
+                # As we search ops reversely, we should insert c_allreduce_sum
                 # op in the same way to keep the ring_id alternate
                 ring_id = (ring_id + 1) % self.nrings
                 block._insert_op(
@@ -631,7 +631,7 @@ def _insert_allgather_ops(self):
                     )
                     offset += 1

-                # As we search ops reversedly, we should insert c_allgather
+                # As we search ops reversely, we should insert c_allgather
                 # op in the same way to keep the ring_id alternate
                 ring_id = (ring_id + 1) % self.nrings
                 block._insert_op(