Add TransformerLayer, TransformerBlock, C3TR modules #2333

Merged · 18 commits · Apr 1, 2021
Changes from 1 commit
Solve the problem of MA with DDP
dingyiwei committed Mar 31, 2021
commit e6e5f0ea1704f66c883563fd0e5c7939851cd708
train.py (5 changes: 4 additions & 1 deletion)
@@ -218,7 +218,10 @@ def train(hyp, opt, device, tb_writer=None):

     # DDP mode
     if cuda and rank != -1:
-        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)
+        # `find_unused_parameters=True` must be passed to DDP because nn.MultiheadAttention is otherwise
+        # incompatible with it; see https://github.com/pytorch/pytorch/issues/26698
+        find_unused_params = any(isinstance(layer, nn.MultiheadAttention) for layer in model.modules())
+        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank, find_unused_parameters=find_unused_params)

     # Model parameters
     hyp['box'] *= 3. / nl  # scale to layers
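For context outside train.py, below is a minimal standalone sketch of the same pattern. The helper names (`needs_unused_param_handling`, `wrap_ddp`) and the process-group assumptions are illustrative, not part of this PR; only the `any(isinstance(..., nn.MultiheadAttention))` check and the `find_unused_parameters` argument mirror the change above.

```python
# Sketch only (not from this PR): enable find_unused_parameters just when the model
# actually contains nn.MultiheadAttention, mirroring the train.py change above.
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP


def needs_unused_param_handling(model: nn.Module) -> bool:
    # True if any submodule is nn.MultiheadAttention; DDP then needs
    # find_unused_parameters=True (https://github.com/pytorch/pytorch/issues/26698).
    return any(isinstance(m, nn.MultiheadAttention) for m in model.modules())


def wrap_ddp(model: nn.Module, local_rank: int) -> DDP:
    # Assumes torch.distributed.init_process_group() was already called by the launcher
    # and that local_rank is a valid CUDA device ordinal for this process.
    model = model.to(local_rank)
    return DDP(model,
               device_ids=[local_rank],
               output_device=local_rank,
               find_unused_parameters=needs_unused_param_handling(model))
```

Keeping `find_unused_parameters` conditional avoids the extra graph traversal DDP performs when the flag is set, so models without attention layers pay no overhead.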