yolotr

ultralytics · glenn-jocher · Apr 1, 2021 · Feb 6, 2021 · Feb 11, 2021 · Feb 11, 2021
commit 4fa932698b6a3b5e2acc914c86a869a352f512fe
diff --git a/data/scripts/get_coco.sh b/data/scripts/get_coco.sh
@@ -11,7 +11,8 @@
 d='../' # unzip directory
 url=https://github.com/ultralytics/yolov5/releases/download/v1.0/
 f='coco2017labels.zip'                                                                 # 68 MB
-echo 'Downloading' $url$f ' ...' && curl -L $url$f -o $f && unzip -q $f -d $d && rm $f # download, unzip, remove
+#echo 'Downloading' $url$f ' ...' && curl -L $url$f -o $f && unzip -q $f -d $d && rm $f # download, unzip, remove
+echo 'Downloading' $url$f ' ...' && wget $url$f && unzip -q $f -d $d && rm $f # download (wget saves to ./$f, which unzip then reads), unzip, remove
 
 # Download/unzip images
 d='../coco/images' # unzip directory
@@ -20,7 +21,8 @@ f1='train2017.zip' # 19G, 118k images
 f2='val2017.zip'   # 1G, 5k images
 f3='test2017.zip'  # 7G, 41k images (optional)
 for f in $f1 $f2; do
-  echo 'Downloading' $url$f '...' && curl -L $url$f -o $f # download, (unzip, remove in background)
-  unzip -q $f -d $d && rm $f &
+  # Fetch each archive in the foreground, then unzip and delete it in the background so the next download can start
+  echo 'Downloading' $url$f '...' && wget $url$f # download (wget saves to ./$f, which unzip reads next)
+  unzip -q $f -d $d && rm $f & # unzip, remove in background; the 'wait' after the loop collects these jobs
 done
 wait # finish background tasks
diff --git a/data/scripts/get_voc.sh b/data/scripts/get_voc.sh
@@ -18,7 +18,8 @@ f1=VOCtrainval_06-Nov-2007.zip # 446MB, 5012 images
 f2=VOCtest_06-Nov-2007.zip     # 438MB, 4953 images
 f3=VOCtrainval_11-May-2012.zip # 1.95GB, 17126 images
 for f in $f3 $f2 $f1; do
-  echo 'Downloading' $url$f '...' && curl -L $url$f -o $f # download, (unzip, remove in background)
+  #echo 'Downloading' $url$f '...' && curl -L $url$f -o $f # download, (unzip, remove in background)
+  echo 'Downloading' $url$f '...' && wget $url$f # download (wget saves to ./$f); unzip and remove follow in background
   unzip -q $f -d $d && rm $f &
 done
 wait # finish background tasks

diff --git a/models/common.py b/models/common.py
@@ -40,6 +40,43 @@ def fuseforward(self, x):
         return self.act(self.conv(x))
 
 
+class Transformer(nn.Module):  # one self-attention + MLP layer applied over a flattened 4-D feature map
+    def __init__(self, c1, c2, num_heads):  # ch_in (token width), ch_out, attention heads
+        super(Transformer, self).__init__()
+
+        self.linear = nn.Linear(c1, c1)  # per-token projection that forward() adds back onto its input
+        self.ln1 = nn.LayerNorm(c1)
+        self.q = nn.Linear(c1, c1)  # query / key / value projections applied before self.ma
+        self.k = nn.Linear(c1, c1)
+        self.v = nn.Linear(c1, c1)
+        self.ma = nn.MultiheadAttention(embed_dim=c1, num_heads=num_heads)  # c1 must be divisible by num_heads
+        self.ln2 = nn.LayerNorm(c1)
+        self.fc1 = nn.Linear(c1, c2)  # MLP: c1 -> c2 -> c2 with GELU in between
+        self.fc2 = nn.Linear(c2, c2)
+        self.gelu = nn.GELU()
+        self.c2 = c2  # kept for the final reshape in forward()
+
+    def forward(self, x):  # 4-D input (b, c1, d2, d3) -> 4-D output (b, c2, d2, d3)
+        b, _, w, h = x.shape  # w, h are simply dims 2 and 3; if x is NCHW they hold (H, W), which is harmless here
+        p = x.flatten(2)  # (b, c1, w*h): merge the two trailing dims into one token axis
+        p = p.unsqueeze(0)  # (1, b, c1, w*h)
+        p = p.transpose(0, 3)  # (w*h, b, c1, 1)
+        p = p.squeeze(3)  # (w*h, b, c1): tokens first, the layout nn.MultiheadAttention takes by default
+        e = self.linear(p)  # per-token linear projection, c1 -> c1
+        x = p + e  # residual add of the projection onto the raw tokens
+
+        x_ = self.ln1(x)  # normalize before attention; the un-normalized x is kept for the skip connection
+        x = self.ma(self.q(x_), self.k(x_), self.v(x_))[0] + x  # [0] = attention output ([1] is the attention weights)
+        x = self.ln2(x)
+        x = self.fc1(x)  # c1 -> c2
+        x = self.gelu(x)
+        x = self.fc2(x)  # c2 -> c2; note there is no skip connection around this MLP
+        x = x.unsqueeze(3)  # (w*h, b, c2, 1)
+        x = x.transpose(0, 3)  # (1, b, c2, w*h)
+        x = x.reshape(b, self.c2, w, h)  # back to the input's 4-D layout, now with c2 channels
+        return x
+
+
 class Bottleneck(nn.Module):
     # Standard bottleneck
     def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
@@ -53,6 +90,13 @@ def forward(self, x):
         return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
 
 
+class BoT(Bottleneck):  # Bottleneck with its cv2 layer replaced by a Transformer
+    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
+        super().__init__(c1, c2, shortcut, g, e)
+        c_ = int(c2 * e)  # Transformer input width; presumably equals cv1's output channels - confirm
+        self.cv2 = Transformer(c_, c2, 4)  # overrides the parent's cv2; fixed at 4 attention heads
+
+
 class BottleneckCSP(nn.Module):
     # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
@@ -87,6 +131,13 @@ def forward(self, x):
         return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
 
 
+class C3T(C3):  # C3 whose block stack self.m is rebuilt from BoT blocks
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # NOTE(review): yolotrs.yaml args [1024, False] - confirm n is passed
+        super().__init__(c1, c2, n, shortcut, g, e)  # build the parent layers first; self.m is replaced below
+        c_ = int(c2 * e)  # width of every BoT block; presumably the parent's hidden width - confirm
+        self.m = nn.Sequential(*[BoT(c_, c_, shortcut, g, e=1.0) for _ in range(n)])  # n BoT blocks, c_ -> c_
+
+
 class SPP(nn.Module):
     # Spatial pyramid pooling layer used in YOLOv3-SPP
     def __init__(self, c1, c2, k=(5, 9, 13)):

diff --git a/models/yolo.py b/models/yolo.py
@@ -210,7 +210,7 @@ def parse_model(d, ch):  # model_dict, input_channels(3)
                 pass
 
         n = max(round(n * gd), 1) if n > 1 else n  # depth gain
-        if m in [Conv, Bottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP, C3]:
+        if m in [Conv, Bottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP, C3, C3T]:
             c1, c2 = ch[f], args[0]
 
             # Normal

diff --git a/models/yolotrs.yaml b/models/yolotrs.yaml
@@ -0,0 +1,48 @@
+# parameters
+nc: 80  # number of classes
+depth_multiple: 0.33  # model depth multiple
+width_multiple: 0.50  # layer channel multiple
+
+# anchors
+anchors:
+  - [10,13, 16,30, 33,23]  # P3/8
+  - [30,61, 62,45, 59,119]  # P4/16
+  - [116,90, 156,198, 373,326]  # P5/32
+
+# YOLOv5 backbone
+backbone:
+  # [from, number, module, args]
+  [[-1, 1, Focus, [64, 3]],  # 0-P1/2
+   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
+   [-1, 3, C3, [128]],
+   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
+   [-1, 9, C3, [256]],
+   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
+   [-1, 9, C3, [512]],
+   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
+   [-1, 1, SPP, [1024, [5, 9, 13]]],
+   [-1, 3, C3T, [1024, False]],  # 9
+  ]
+
+# YOLOv5 head
+head:
+  [[-1, 1, Conv, [512, 1, 1]],
+   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
+   [-1, 3, C3, [512, False]],  # 13
+
+   [-1, 1, Conv, [256, 1, 1]],
+   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
+   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)
+
+   [-1, 1, Conv, [256, 3, 2]],
+   [[-1, 14], 1, Concat, [1]],  # cat head P4
+   [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)
+
+   [-1, 1, Conv, [512, 3, 2]],
+   [[-1, 10], 1, Concat, [1]],  # cat head P5
+   [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)
+
+   [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
+  ]