Skip to content

Commit

Permalink
Bug fix: training got stuck when launched with dist_train.sh; add support for tcp_port (o…
Browse files Browse the repository at this point in the history
  • Loading branch information
sshaoshuai authored Feb 2, 2022
1 parent 274c90c commit 7ce6a2b
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion pcdet/utils/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,11 @@ def init_dist_slurm(tcp_port, local_rank, backend='nccl'):
def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
if mp.get_start_method(allow_none=True) is None:
mp.set_start_method('spawn')

os.environ['MASTER_PORT'] = str(tcp_port)
os.environ['MASTER_ADDR'] = 'localhost'
num_gpus = torch.cuda.device_count()
torch.cuda.set_device(local_rank % num_gpus)

dist.init_process_group(
backend=backend,
# init_method='tcp://127.0.0.1:%d' % tcp_port,
Expand Down

0 comments on commit 7ce6a2b

Please sign in to comment.