PaddlePaddle · heavyrain-lzy · Feb 19, 2024 · Feb 18, 2024
diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h
@@ -83,10 +83,16 @@ inline phi::DataType GetDtypeWithPlace(
   for (const auto& tensors : amp_tensors_vector) {
     for (const auto& tensor : tensors) {
       auto place = tensor.place();
-      is_right_place = (paddle::platform::is_gpu_place(place) ||
-                        paddle::platform::is_cuda_pinned_place(place) ||
-                        paddle::platform::is_xpu_place(place) ||
-                        paddle::platform::is_custom_place(place));
+      // TODO(lizhiyu): If the tensor is a dist-tensor, its place may be
+      // `unknown` on a no-calculation rank right now. As a temporary
+      // workaround, `is_dist_tensor()` is accepted here; the proper fix is
+      // for the dist-tensor on a no-calculation rank to carry the right
+      // place.
+      is_right_place =
+          (tensor.is_dist_tensor() || paddle::platform::is_gpu_place(place) ||
+           paddle::platform::is_cuda_pinned_place(place) ||
+           paddle::platform::is_xpu_place(place) ||
+           paddle::platform::is_custom_place(place));
       if (is_right_place) {
         break;
       }

diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc
@@ -777,9 +777,11 @@ std::string EagerUtils::TensorStr(const paddle::Tensor& t) {
                 "%s, Local Shape: %s", t.dims(), dist_t->local_dims()),
             dist_t->dist_attr());
       } else {
+        // NOTE: If the tensor is a dist-tensor, its place may be `unknown` on
+        // a no-calculation rank.
         tensor_info_str += paddle::string::Sprintf(DIST_TENSOR_INFO_TEMPLATE,
                                                    t.impl()->type_info().name(),
-                                                   "Unknown",
+                                                   t.dtype(),
                                                    "Unknown",
                                                    dist_t->defined(),
                                                    dist_t->initialized(),

diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc
@@ -473,6 +473,7 @@ void BindTensor(pybind11::module &m) {  // NOLINT
              return common::DataLayoutToString(self.layout());
            })
       .def("_share_data_with", &phi::DenseTensor::ShareDataWith)
+      .def("_share_data_nocheck_with", &phi::DenseTensor::ShareDataNoCheckWith)
       .def("__getitem__", PySliceTensor, py::return_value_policy::reference)
       .def("__str__",
            [](const phi::DenseTensor &self) {
@@ -1065,6 +1066,13 @@ void BindTensor(pybind11::module &m) {  // NOLINT
              self.unsafe_mutable_value()->ShareDataWith(src.value());
              return self;
            })
+      .def("_share_data_nocheck_with",
+           [](DistTensor &self, const DistTensor &src) {
+             self.unsafe_set_dims(src.dims());
+             self.unsafe_set_dist_attr(src.dist_attr());
+             self.unsafe_mutable_value()->ShareDataNoCheckWith(src.value());
+             return self;
+           })
       .def("_share_data_with", [](DistTensor &self, const DistTensor &src) {
         self.unsafe_set_dims(src.dims());
         self.unsafe_set_dist_attr(src.dist_attr());

diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl
@@ -88,6 +88,9 @@ InplaceVersion& InplaceVersionCounter() { return *inplace_version_counter_; }
 /*! The internal of two tensors share the same memory block. */
 DenseTensor& ShareDataWith(const DenseTensor& src);
 
+/*! Share memory with src without checking memory size (for dist-tensor). */
+DenseTensor& ShareDataNoCheckWith(const DenseTensor& src);
+
 /*! The internal of two tensors share the same inplace version counter. */
 DenseTensor& ShareInplaceVersionCounterWith(const DenseTensor& src);
 

diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc
@@ -420,6 +420,14 @@ DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) {
   return *this;
 }
 
+DenseTensor& DenseTensor::ShareDataNoCheckWith(const DenseTensor& src) {
+  // Shares src's holder/meta; deliberately skips the memory size check.
+  holder_ = src.holder_;
+  set_meta(src.meta());
+  storage_properties_ = CopyStorageProperties(src.storage_properties_);
+  return *this;
+}
+
 DenseTensor& DenseTensor::ShareInplaceVersionCounterWith(
     const DenseTensor& src) {
   PADDLE_ENFORCE_NOT_NULL(

diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
@@ -114,6 +114,9 @@ Placements ToPlacements(const TensorDistAttr& dist_attr) {
 
 DistTensor::DistTensor() : value_(std::make_shared<DenseTensor>()) {}
 
+DistTensor::DistTensor(phi::DataType dtype)
+    : value_(std::make_shared<DenseTensor>(dtype)) {}
+
 DistTensor::DistTensor(const std::shared_ptr<phi::DenseTensor>& global_value,
                        const TensorDistAttr& dist_attr)
     : global_dims_(global_value->dims()), dist_attr_(dist_attr) {

diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h
@@ -45,6 +45,10 @@ class DistTensor final
   /// will be set by reshard later.
   DistTensor();
 
+  /// \brief Construct a dist tensor based on dtype.
+  /// \param dtype The dtype of the current tensor.
+  explicit DistTensor(phi::DataType dtype);
+
   /// \brief Construct a dist tensor based dense tensor.
   /// \param global_value The global dense tensor of the current tensor.
   /// \param dist_attr The distributed attributes of the current tensor.

diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc
@@ -290,7 +290,9 @@ void CrossNdMeshReshardFunction::Eval(DeviceContext* dev_ctx,
   VLOG(3) << "Call CrossNdMeshReshardFunction Eval";
   const auto& in_dist_attr = in.dist_attr();
 
-  DistTensor tmp_result;
+  // Construct the `DistTensor` with the dtype of `in` instead of the default
+  // dtype `float32`, because the default `float32` may cause errors in amp.
+  DistTensor tmp_result(in.dtype());
   TensorDistAttr in_dist_attr_shard = in_dist_attr;
   in_dist_attr_shard.set_partial_status(out_dist_attr.partial_status());
   in_dist_attr_shard.set_dims_mapping(out_dist_attr.dims_mapping());

diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py
@@ -2264,7 +2264,11 @@ def _transform(self, t, device, dtype, blocking):
         # 4. share Tensor to origin param / Tensor
         dst_tensor = t.value().get_tensor()
         src_tensor = new_t.value().get_tensor()
-        dst_tensor._share_data_with(src_tensor)
+        if t._is_initialized():
+            dst_tensor._share_data_with(src_tensor)
+        else:
+            # If the tensor is not initialized, we can't check the memory size.
+            dst_tensor._share_data_nocheck_with(src_tensor)
 
         return t
 

diff --git a/test/auto_parallel/hybrid_strategy/semi_auto_llama.py b/test/auto_parallel/hybrid_strategy/semi_auto_llama.py
@@ -206,8 +206,21 @@ def run_llama(self, to_static=0):
             for epoch_idx in range(1):
                 for step, inputs in enumerate(dist_loader()):
                     input_ids, labels = inputs
+                    custom_black_list = [
+                        "reduce_sum",
+                        "c_softmax_with_cross_entropy",
+                    ]
+                    custom_white_list = []
+                    if self.amp_level == "O2":
+                        custom_white_list.extend(
+                            ["lookup_table", "lookup_table_v2"]
+                        )
                     with paddle.amp.auto_cast(
-                        self.amp, level=self.amp_level, dtype=self.amp_dtype
+                        self.amp,
+                        custom_black_list=set(custom_black_list),
+                        custom_white_list=set(custom_white_list),
+                        level=self.amp_level,
+                        dtype=self.amp_dtype,
                     ):
                         logits = model(input_ids)
                         tr_loss_step = criterion(logits, labels)