fix data type logging (pytorch#53162)
Summary:
Pull Request resolved: pytorch#53162

There may be multiple parameter data types in mixed precision training, so log the data types as a list of data type names.
ghstack-source-id: 123452626

Test Plan: unit test

Reviewed By: SciPioneer

Differential Revision: D26769256

fbshipit-source-id: 8f7d73821e89864fedbbce723f301fe8fbad5685
zhaojuanmao authored and facebook-github-bot committed Mar 10, 2021
1 parent 7d4b229 commit d032287
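
Not part of the commit itself, but for context: the sketch below shows how a mixed-precision model ends up with multiple parameter dtypes and how the new dtypes field can be read. The toy model and process-group setup are hypothetical; the get_ddp_logging_data() call and the "double"/"float" names mirror the updated unit test in this diff.

import torch.nn as nn

# Assumes torch.distributed.init_process_group(...) has already been called;
# DistributedDataParallel requires an initialized process group.
model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 1))
model.float()
model[0].double()  # mixed precision: one submodule uses a different dtype

model_DDP = nn.parallel.DistributedDataParallel(model)
ddp_logging_data = model_DDP.get_ddp_logging_data()

# Previously ddp_logging_data.dtype was a single string such as "float".
# With this change, ddp_logging_data.dtypes is a list of dtype names,
# e.g. ["double", "float"] for the model above.
print(ddp_logging_data.dtypes)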
Showing 4 changed files with 21 additions and 9 deletions.
12 changes: 9 additions & 3 deletions c10/util/Logging.h
@@ -345,8 +345,9 @@ struct DDPLoggingData {
std::vector<int> device_ids = std::vector<int>();
int output_device = -1;
std::string backend_name = "";
-// Parameter's data type
-std::string dtype = "";
+// Parameters' data types, there may be multiple data
+// types for mixed precision training.
+std::vector<std::string> dtypes = std::vector<std::string>();
// Total parameters size (Bytes)
int64_t total_parameter_size_bytes = -1;
// The number of parameter tensors
@@ -431,12 +432,17 @@ struct DDPLoggingData {

std::string devicesStr = toString(deviceIdsStream, ddp_logging_data.device_ids);
std::string bucketSizesStr = toString(bucketSizesStream, ddp_logging_data.bucket_sizes);
+std::string dtypesStr;
+for (const auto & dtype : ddp_logging_data.dtypes) {
+dtypesStr += dtype;
+dtypesStr += " ";
+}

std::string ddpLoggingDataInfo = c10::str(
"world_size: ", ddp_logging_data.world_size, ", module_name: ",
ddp_logging_data.module_name, ", device_ids: ", devicesStr, ", output_device: ",
ddp_logging_data.output_device, ", backend_name: ", ddp_logging_data.backend_name,
-", parameter_dtype: ", ddp_logging_data.dtype, ", total_parameter_size_in_bytes: ",
+", parameter_dtype: ", dtypesStr, ", total_parameter_size_in_bytes: ",
ddp_logging_data.total_parameter_size_bytes, ", num_parameter_tensors: ",
ddp_logging_data.num_parameter_tensors, " bucket_sizes: ", bucketSizesStr,
", CUDA_VISIBLE_DEVICES: ", ddp_logging_data.cuda_visible_devices, ", broadcast_buffers: ",
2 changes: 1 addition & 1 deletion torch/csrc/distributed/c10d/init.cpp
@@ -1301,12 +1301,12 @@ py::class_<c10::DDPLoggingData>(module, "DDPLoggingData")
&c10::DDPLoggingData::gradient_as_bucket_view)
.def_readwrite("backend_name", &c10::DDPLoggingData::backend_name)
.def_readwrite("iteration", &c10::DDPLoggingData::iteration)
.def_readwrite("dtype", &c10::DDPLoggingData::dtype)
.def_readwrite(
"total_parameter_size_bytes",
&c10::DDPLoggingData::total_parameter_size_bytes)
.def_readwrite(
"num_parameter_tensors", &c10::DDPLoggingData::num_parameter_tensors)
.def_readwrite("dtypes", &c10::DDPLoggingData::dtypes)
.def_readwrite("bucket_sizes", &c10::DDPLoggingData::bucket_sizes)
.def_readwrite("master_port", &c10::DDPLoggingData::master_port)
.def_readwrite("master_addr", &c10::DDPLoggingData::master_addr)
9 changes: 6 additions & 3 deletions torch/lib/c10d/logger.cpp
@@ -65,9 +65,14 @@ void Logger::set_env_variables() {
void Logger::set_parameter_stats() {
ddp_logging_data_->num_parameter_tensors = reducer_->replicas_[0].size();
ddp_logging_data_->total_parameter_size_bytes = 0;
-for (const auto& t : reducer_->replicas_[0]) {
+std::set<std::string> unique_dtypes;
+for (auto t : reducer_->replicas_[0]) {
ddp_logging_data_->total_parameter_size_bytes +=
t.numel() * t.element_size();
+unique_dtypes.insert(std::string(t.dtype().name()));
}
+for (auto dtype : unique_dtypes) {
+ddp_logging_data_->dtypes.push_back(dtype);
+}
}

@@ -100,8 +105,6 @@ void Logger::set_construction_data_and_log(
ddp_logging_data_->world_size = reducer_->process_group_->getSize();
ddp_logging_data_->rank = reducer_->process_group_->getRank();
ddp_logging_data_->iteration = 0;
-ddp_logging_data_->dtype =
-std::string(reducer_->replicas_[0][0].dtype().name());
ddp_logging_data_->is_multi_device_module = reducer_->is_multi_device_module_;

set_parameter_stats();
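Side note, not part of the diff: because Logger::set_parameter_stats() now collects the names into a std::set, the logged list is deduplicated and iterated in sorted order, which is why the test change below can assert the fixed order 'double,float'. A rough Python analogue of that collection step, using a hypothetical model object:

# Illustrative only: deduplicate and sort parameter dtype names, mirroring the
# std::set-based collection in Logger::set_parameter_stats above. The C++ side
# records TypeMeta names such as "float" or "double", while str(p.dtype) in
# Python gives e.g. "torch.float32"; only the dedup-and-sort behavior matters here.
unique_dtypes = sorted({str(p.dtype) for p in model.parameters()})
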
7 changes: 5 additions & 2 deletions torch/testing/_internal/distributed/distributed_test.py
@@ -3637,7 +3637,7 @@ def parse_env(var):
for p in params:
num_params += 1
param_size += p.numel() * p.element_size()
-self.assertEqual(ddp_logging_data.dtype, "float")
+self.assertEqual(ddp_logging_data.dtypes, ["float"])
self.assertEqual(ddp_logging_data.total_parameter_size_bytes, param_size)
self.assertEqual(ddp_logging_data.num_parameter_tensors, num_params)
self.assertEqual(ddp_logging_data.bucket_sizes, [param_size])
@@ -3666,15 +3666,18 @@ def parse_env(var):
self.assertGreaterEqual(
ddp_logging_data.avg_backward_comm_time,
ddp_logging_data.avg_backward_compute_comm_overlap_time)
-# test larger net and verify multiple bucket sizes
+# test larger net with mixed data types, verify multiple bucket sizes
model = LargeNet()
+model.float()
+model.fc1.double()
model_DDP = nn.parallel.DistributedDataParallel(model, bucket_cap_mb=1.5)
ddp_logging_data = model_DDP.get_ddp_logging_data()
params = list(model_DDP.parameters())
self.assertEqual(ddp_logging_data.bucket_cap_mb, 1.5)
self.assertEqual(
ddp_logging_data.bucket_sizes,
[params[1].numel() * params[1].element_size(), params[0].numel() * params[0].element_size()])
+self.assertEqual(','.join(ddp_logging_data.dtypes), 'double,float')

@unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo',
"Only Nccl & Gloo backend support DistributedDataParallel")
