Add TransformerLayer, TransformerBlock, C3TR modules #2333

Merged (18 commits) on Apr 1, 2021
Changes from 1 commit
cleanup find_unused_parameters
glenn-jocher committed Apr 1, 2021
commit d38b59b7101734d456051c680db87708a6783d07
train.py: 16 changes (7 additions, 9 deletions)
@@ -1,21 +1,20 @@
 import argparse
 import logging
 import math
-import numpy as np
 import os
 import random
 import time
+from copy import deepcopy
+from pathlib import Path
+from threading import Thread
 
+import numpy as np
 import torch.distributed as dist
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 import torch.optim.lr_scheduler as lr_scheduler
 import torch.utils.data
 import yaml
-from copy import deepcopy
-from pathlib import Path
-from threading import Thread
 from torch.cuda import amp
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
@@ -218,10 +217,9 @@ def train(hyp, opt, device, tb_writer=None):

     # DDP mode
     if cuda and rank != -1:
-        # `find_unused_parameters=True` should be passed for the incompatibility of nn.MultiheadAttention with DDP,
-        # according to https://github.com/pytorch/pytorch/issues/26698
-        find_unused_params = False if not [type(layer) for layer in model.modules() if isinstance(layer, nn.MultiheadAttention)] else True
-        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank, find_unused_parameters=find_unused_params)
+        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank,
+                    # nn.MultiheadAttention incompatibility with DDP https://github.com/pytorch/pytorch/issues/26698
+                    find_unused_parameters=any(isinstance(layer, nn.MultiheadAttention) for layer in model.modules()))
 
     # Model parameters
     hyp['box'] *= 3. / nl  # scale to layers
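The cleanup above collapses the two-line comment and the roundabout `False if not [...] else True` list comprehension into a single `any(...)` generator expression passed inline to DDP. A minimal, self-contained sketch of the same check, assuming only that torch is installed (the toy models and the `needs_unused_param_handling` helper below are illustrative, not part of train.py):

# Sketch of the find_unused_parameters check with hypothetical toy modules.
# DDP only needs find_unused_parameters=True when some parameters may never
# receive gradients, which this PR treats as the case whenever the model
# contains nn.MultiheadAttention (https://github.com/pytorch/pytorch/issues/26698).
import torch.nn as nn

plain = nn.Sequential(nn.Conv2d(3, 16, 3), nn.ReLU())
with_attn = nn.Sequential(nn.Linear(32, 32), nn.MultiheadAttention(32, 4))

def needs_unused_param_handling(model: nn.Module) -> bool:
    # any() over a generator short-circuits at the first MultiheadAttention found,
    # unlike the old list comprehension, which always walked every module.
    return any(isinstance(layer, nn.MultiheadAttention) for layer in model.modules())

print(needs_unused_param_handling(plain))      # False
print(needs_unused_param_handling(with_attn))  # True

# In train.py the equivalent boolean is passed straight to the DDP constructor:
# model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank,
#             find_unused_parameters=needs_unused_param_handling(model))

Because the generator short-circuits, the module scan stops at the first match and never builds the intermediate list of layer types that the old expression only used for its truthiness.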