diff --git a/.circleci/config.yml b/.circleci/config.yml index 78e654716..b599cacb7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -94,7 +94,7 @@ install_vissl_dep: &install_vissl_dep name: Install Dependencies working_directory: ~/vissl command: | - pip install --progress-bar off torch==1.5.0 torchvision==0.6.0 opencv-python==3.4.2.17 + pip install --progress-bar off torch==1.7.1 torchvision==0.8.2 opencv-python==3.4.2.17 pip install --progress-bar off -r requirements.txt install_apex_gpu: &install_apex_gpu diff --git a/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k.yaml b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k.yaml new file mode 100644 index 000000000..324122a28 --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k.yaml @@ -0,0 +1,13 @@ +# @package _global_ +config: + DATA: + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + OPTIMIZER: + num_epochs: 30 diff --git a/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_10percent.yaml b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_10percent.yaml new file mode 100644 index 000000000..6b361377f --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_10percent.yaml @@ -0,0 +1,13 @@ +# @package _global_ +config: + DATA: + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [google-imagenet1k-per10] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [google-imagenet1k-per10] + OPTIMIZER: + num_epochs: 30 diff --git a/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_1percent.yaml b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_1percent.yaml new file mode 100644 index 000000000..0138c7b01 --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_1percent.yaml @@ -0,0 +1,13 @@ +# @package _global_ +config: + DATA: + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [google-imagenet1k-per01] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [google-imagenet1k-per01] + OPTIMIZER: + num_epochs: 60 diff --git a/configs/config/benchmark/imagenet1k_fulltune/eval_vit_8gpu_transfer_in1k_finetune.yaml b/configs/config/benchmark/imagenet1k_fulltune/eval_vit_8gpu_transfer_in1k_finetune.yaml new file mode 100644 index 000000000..188c4e289 --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/eval_vit_8gpu_transfer_in1k_finetune.yaml @@ -0,0 +1,101 @@ +# @package _global_ +config: + VERBOSE: False + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." 
+ AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + TRANSFORMS: + - name: RandomResizedCrop + size: 384 + - name: RandomHorizontalFlip + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + TRANSFORMS: + - name: Resize + size: 384 + - name: CenterCrop + size: 384 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRAINER: + TRAIN_STEP_NAME: standard_train_step + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: sgd + weight_decay: 0.000 + momentum: 0.9 + num_epochs: 30 + nesterov: True + regularize_bn: False + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.01 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.1 + - name: cosine + start_value: 0.1 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 8 + INIT_METHOD: tcp + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/benchmark/imagenet1k_fulltune/models/vit_b16.yaml b/configs/config/benchmark/imagenet1k_fulltune/models/vit_b16.yaml new file mode 100644 index 000000000..7af586fc1 --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/models/vit_b16.yaml @@ -0,0 +1,40 @@ +# @package _global_ +config: + DATA: + TRAIN: + BATCHSIZE_PER_REPLICA: 32 # Fits on 16gb GPU + TEST: + BATCHSIZE_PER_REPLICA: 32 + MODEL: + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 384 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SKIP_LAYERS: [ + 'heads.0.clf.0.weight', + 'heads.0.clf.0.bias', + 'num_batches_tracked' + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} diff --git a/configs/config/benchmark/imagenet1k_fulltune/models/vit_s16.yaml b/configs/config/benchmark/imagenet1k_fulltune/models/vit_s16.yaml new file mode 100644 index 000000000..e3f4b2c65 --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/models/vit_s16.yaml @@ -0,0 +1,40 @@ +# @package _global_ +config: + DATA: + TRAIN: + BATCHSIZE_PER_REPLICA: 128 # Fits on 32gb GPU + TEST: + BATCHSIZE_PER_REPLICA: 128 + MODEL: + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 384 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 6 + HIDDEN_DIM: 384 + MLP_DIM: 1536 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [384, 
1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SKIP_LAYERS: [ + 'heads.0.clf.0.weight', + 'heads.0.clf.0.bias', + 'num_batches_tracked' + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/models/deit_s16.yaml b/configs/config/benchmark/linear_image_classification/imagenet1k/models/deit_s16.yaml new file mode 100644 index 000000000..2cc6adc40 --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/models/deit_s16.yaml @@ -0,0 +1,128 @@ +# @package _global_ +config: + VERBOSE: False + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 2048 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 2048 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: False + FEATURE_EVAL_SETTINGS: + EVAL_MODE_ON: True + FREEZE_TRUNK_ONLY: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 6 + HIDDEN_DIM: 384 + MLP_DIM: 1536 + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + DROP_PATH_RATE: 0.1 + HEAD: + PARAMS: [ + ["mlp", {"dims": [384, 1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: sgd + # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4 + weight_decay: 0 + momentum: 0.9 + num_epochs: 100 + nesterov: False + regularize_bn: True + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.1 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.1 + - name: cosine + start_value: 0.1 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b16.yaml 
b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b16.yaml new file mode 100644 index 000000000..6adb3d75b --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b16.yaml @@ -0,0 +1,126 @@ +# @package _global_ +config: + VERBOSE: False + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 2048 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 2048 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: False + FEATURE_EVAL_SETTINGS: + EVAL_MODE_ON: True + FREEZE_TRUNK_ONLY: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: sgd + # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4 + weight_decay: 0 + momentum: 0.9 + num_epochs: 100 + nesterov: False + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.1 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.1 + - name: cosine + start_value: 0.1 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b32.yaml b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b32.yaml new file mode 100644 index 000000000..caf5fa4c1 --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b32.yaml @@ -0,0 +1,126 @@ +# @package _global_ +config: + VERBOSE: False + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + 
TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 4096 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 4096 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: False + FEATURE_EVAL_SETTINGS: + EVAL_MODE_ON: True + FREEZE_TRUNK_ONLY: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 32 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: sgd + # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4 + weight_decay: 0 + momentum: 0.9 + num_epochs: 100 + nesterov: False + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.1 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.1 + - name: cosine + start_value: 0.1 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_l16.yaml b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_l16.yaml new file mode 100644 index 000000000..0a806c381 --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_l16.yaml @@ -0,0 +1,127 @@ +# @package _global_ +config: + VERBOSE: False + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." 
+ AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 1536 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + TEST: + DATA_SOURCES: [disk_folder] + DATA_PATHS: [""] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 1536 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: False + FEATURE_EVAL_SETTINGS: + EVAL_MODE_ON: True + FREEZE_TRUNK_ONLY: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 24 + NUM_HEADS: 16 + HIDDEN_DIM: 1024 + MLP_DIM: 4096 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [1024, 1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + # USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: sgd + # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4 + weight_decay: 0 + momentum: 0.9 + num_epochs: 100 + nesterov: False + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.1 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.1 + - name: cosine + start_value: 0.1 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/moco.yaml b/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/moco.yaml new file mode 100644 index 000000000..feef10df9 --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/moco.yaml @@ -0,0 +1,20 @@ +# @package _global_ +config: + OPTIMIZER: + name: sgd + weight_decay: 0.00 + momentum: 0.9 + num_epochs: 100 + nesterov: True + regularize_bn: True + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 30.0 + base_lr_batch_size: 256 + name: multistep + values: [30.0, 3.0, 0.3] + milestones: [60, 80] + update_interval: epoch diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/vit_high_lr.yaml b/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/vit_high_lr.yaml new file mode 100644 index 000000000..828f7e3a2 --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/vit_high_lr.yaml @@ -0,0 +1,28 @@ +# @package _global_ +config: + OPTIMIZER: + name: sgd + # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4 + 
weight_decay: 0.000001 + momentum: 0.9 + num_epochs: 100 + nesterov: False + regularize_bn: True + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.3 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.1 + end_value: 0.3 + - name: cosine + start_value: 0.3 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, .9] diff --git a/configs/config/debugging/pretrain/supervised/supervised_1gpu_vision_transformer_debug_integration_cutmix.yaml b/configs/config/debugging/pretrain/supervised/supervised_1gpu_vision_transformer_debug_integration_cutmix.yaml new file mode 100644 index 000000000..bb0460d17 --- /dev/null +++ b/configs/config/debugging/pretrain/supervised/supervised_1gpu_vision_transformer_debug_integration_cutmix.yaml @@ -0,0 +1,113 @@ +# @package _global_ +config: + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 1 + HOOKS: + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: "." + FLUSH_EVERY_N_MIN: 20 + DATA: + NUM_DATALOADER_WORKERS: 0 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + BATCHSIZE_PER_REPLICA: 32 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: VisslRandAugment + magnitude: 5 + weight_choice: 0 + - name: ToTensor + - name: RandomErasing + p: 1 + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: cutmixup_collator + COLLATE_FUNCTION_PARAMS: { + 'cutmix_alpha': 1.0, + 'label_smoothing': 0.2 + } + TEST: + DATA_SOURCES: [disk_folder] + # DATA_PATHS: [""] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + BATCHSIZE_PER_REPLICA: 32 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 32 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["vision_transformer_head", {"in_plane": 768, "hidden_dim": 3072, + "num_classes": 1000}], + ] + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: adamw + weight_decay: 0.3 + num_epochs: 90 + param_schedulers: + lr: + name: composite + schedulers: + - name: linear + start_value: 0.00001 + end_value: 0.003 + - name: cosine + start_value: 0.001 + end_value: 0.000001 + interval_scaling: [rescaled, fixed] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 1 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu + VERBOSE: True + LOG_FREQUENCY: 100 + TEST_ONLY: False + TEST_EVERY_NUM_EPOCH: 1 + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: fork diff --git a/configs/config/debugging/pretrain/supervised/supervised_deit_b_integration_debug.yaml b/configs/config/debugging/pretrain/supervised/supervised_deit_b_integration_debug.yaml new file mode 100644 index 000000000..cb62f7055 --- /dev/null +++ 
b/configs/config/debugging/pretrain/supervised/supervised_deit_b_integration_debug.yaml @@ -0,0 +1,144 @@ +# @package _global_ +config: + HOOKS: + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 1 + DATA: + NUM_DATALOADER_WORKERS: 0 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + BATCHSIZE_PER_REPLICA: 32 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: VisslRandAugment + magnitude: 9 + magnitude_std: 0.5 + increasing_severity: True + - name: ColorJitter + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - name: ToTensor + - name: RandomErasing + p: 1 + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: cutmixup_collator + COLLATE_FUNCTION_PARAMS: { + "mixup_alpha": 1.0, # mixup alpha value, mixup is active if > 0. + "cutmix_alpha": 1.0, # cutmix alpha value, cutmix is active if > 0. + "prob": 1.0, # probability of applying mixup or cutmix per batch or element + "switch_prob": 0.5, # probability of switching to cutmix instead of mixup when both are active + "mode": "batch", # how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + "correct_lam": True, # apply lambda correction when cutmix bbox clipped by image borders + "label_smoothing": 0.1, # apply label smoothing to the mixed target tensor + "num_classes": 1000 # number of classes for target + } + + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + BATCHSIZE_PER_REPLICA: 64 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + TRUNK: + NAME: convit + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + CLASSIFIER: token + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + CONVIT: + DROP_PATH_RATE: 0.1 # stochastic depth dropout probability + QKV_BIAS: False # Bias for QKV in attention layers. + QK_SCALE: False # Scale + N_GPSA_LAYERS: 0 # Number of gated positional self-attention layers + CLASS_TOKEN_IN_LOCAL_LAYERS: False # Whether to add class token + # Determines how much the positional attention is focused on the + # patch of maximal attention. "Alpha" in the paper. Equivalent to + # the temperature of positional attention softmax. + LOCALITY_STRENGTH: 1. 
+ # Dimensionality of the relative positional embeddings * 1/3 + LOCALITY_DIM: 10 + # Whether to initialize the positional self-attention to be local + # (equivalent to a convolution) + USE_LOCAL_INIT: True + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 1000]}], + ] + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 100 + # We don't want to regularize the position embedding or classification token + non_regularized_parameters: [pos_embedding, class_token] + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.0005 + base_lr_batch_size: 512 + name: composite + schedulers: + - name: linear + start_value: 0.00001 + end_value: 0.0005 + - name: cosine + start_value: 0.0005 + end_value: 0.000001 + interval_scaling: [rescaled, fixed] + update_interval: step + lengths: [0.05, 0.95] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 1 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu + VERBOSE: True + LOG_FREQUENCY: 100 + TEST_ONLY: False + TEST_EVERY_NUM_EPOCH: 1 + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: fork diff --git a/configs/config/debugging/pretrain/swav/swav_integration_debug.yaml b/configs/config/debugging/pretrain/swav/swav_integration_debug.yaml new file mode 100644 index 000000000..fa24d75fa --- /dev/null +++ b/configs/config/debugging/pretrain/swav/swav_integration_debug.yaml @@ -0,0 +1,128 @@ +# @package _global_ +config: + VERBOSE: True + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: False + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + PERF_STAT_FREQUENCY: 10 + ROLLING_BTIME_FREQ: 313 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." 
+ AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 0 + TRAIN: + DATA_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + LABEL_TYPE: zero + BATCHSIZE_PER_REPLICA: 2 + TRANSFORMS: + - name: ImgPilToMultiCrop + total_num_crops: 4 + size_crops: [224, 224] + num_crops: [2, 2] + crop_scales: [[0.14, 1], [0.05, 0.14]] + - name: RandomHorizontalFlip + p: 0.5 + - name: ImgPilColorDistortion + strength: 1.0 + - name: ImgPilGaussianBlur + p: 0.5 + radius_min: 0.1 + radius_max: 2.0 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: cutmixup_collator + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + DROP_LAST: True + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 384 + MLP_DIM: 1536 + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + DROP_PATH_RATE: 0.1 + HYBRID: + HEAD: + PARAMS: [ + ["swav_head", {"dims": [384, 2048, 256], "use_bn": True, "num_clusters": + [3000]}], + ] + TEMP_FROZEN_PARAMS_ITER_MAP: [ + ['module.heads.0.prototypes0.weight', 313], + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: True + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: swav_loss + swav_loss: + temperature: 0.1 + use_double_precision: False + normalize_last_layer: True + num_iters: 3 + epsilon: 0.04 + crops_for_assign: [0, 1] + queue: + queue_length: 3072 + start_iter: 0 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 300 + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.001 + base_lr_batch_size: 4096 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.001 + - name: cosine + start_value: 0.001 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.03, 0.97] + METERS: + name: "" + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 1 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/pretrain/vision_transformer/moco/vit_b16.yaml b/configs/config/pretrain/vision_transformer/moco/vit_b16.yaml new file mode 100644 index 000000000..8c0269f8e --- /dev/null +++ b/configs/config/pretrain/vision_transformer/moco/vit_b16.yaml @@ -0,0 +1,118 @@ +# @package _global_ +config: + VERBOSE: True + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: False + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 128 + LABEL_TYPE: sample_index # just an implementation detail. 
Label isn't used + TRANSFORMS: + - name: ImgReplicatePil + num_times: 2 + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + p: 0.5 + - name: ImgPilColorDistortion + strength: 1.0 + - name: ImgPilGaussianBlur + p: 0.5 + radius_min: 0.1 + radius_max: 2.0 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: moco_collator + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + DROP_LAST: True + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 2048], "use_relu": True}], + ["mlp", {"dims": [2048, 128]}], + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + # USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: moco_loss + moco_loss: + embedding_dim: 128 + queue_size: 65536 + momentum: 0.999 + temperature: 0.2 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 300 + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.0005 + base_lr_batch_size: 4096 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.003 + - name: cosine + start_value: 0.003 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: "" + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 4 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60215" + MACHINE: + DEVICE: gpu diff --git a/configs/config/pretrain/vision_transformer/simclr/vit_b16.yaml b/configs/config/pretrain/vision_transformer/simclr/vit_b16.yaml new file mode 100644 index 000000000..ae89d8422 --- /dev/null +++ b/configs/config/pretrain/vision_transformer/simclr/vit_b16.yaml @@ -0,0 +1,117 @@ +# @package _global_ +config: + VERBOSE: True + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: False + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: "/checkpoint/ito/vision_transformer/simclr/b16" + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "/checkpoint/ito/vision_transformer/simclr/b16" + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 64 + LABEL_TYPE: sample_index # just an implementation detail. 
Label isn't used + TRANSFORMS: + - name: ImgReplicatePil + num_times: 2 + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + p: 0.5 + - name: ImgPilColorDistortion + strength: 1.0 + - name: ImgPilGaussianBlur + p: 0.5 + radius_min: 0.1 + radius_max: 2.0 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: simclr_collator + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + DROP_LAST: True + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 3072], "use_relu": True}], + ["mlp", {"dims": [3072, 128]}], + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + # USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: simclr_info_nce_loss + simclr_info_nce_loss: + temperature: 0.1 + buffer_params: + embedding_dim: 128 + OPTIMIZER: + name: adamw + weight_decay: 0.3 + num_epochs: 300 + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.003 + base_lr_batch_size: 4096 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.003 + - name: cosine + start_value: 0.003 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: "" + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 4 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60215" + MACHINE: + DEVICE: gpu diff --git a/configs/config/pretrain/vision_transformer/simclr/vit_l16.yaml b/configs/config/pretrain/vision_transformer/simclr/vit_l16.yaml new file mode 100644 index 000000000..c7599cb6d --- /dev/null +++ b/configs/config/pretrain/vision_transformer/simclr/vit_l16.yaml @@ -0,0 +1,120 @@ +# @package _global_ +config: + VERBOSE: True + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: False + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: "/checkpoint/ito/vision_transformer/simclr/l32" + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "/checkpoint/ito/vision_transformer/simclr/l32" + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 16 + LABEL_TYPE: sample_index # just an implementation detail. 
Label isn't used + TRANSFORMS: + - name: ImgReplicatePil + num_times: 2 + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + p: 0.5 + - name: ImgPilColorDistortion + strength: 1.0 + - name: ImgPilGaussianBlur + p: 0.5 + radius_min: 0.1 + radius_max: 2.0 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: simclr_collator + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + DROP_LAST: True + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 24 + NUM_HEADS: 16 + HIDDEN_DIM: 1024 + MLP_DIM: 4096 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [1024, 4096], "use_relu": True}], + ["mlp", {"dims": [4096, 128]}], + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: simclr_info_nce_loss + simclr_info_nce_loss: + temperature: 0.1 + buffer_params: + embedding_dim: 128 + OPTIMIZER: + name: adamw + weight_decay: 0.3 + # momentum: 0.9 + num_epochs: 300 + # nesterov: True + # regularize_bn: False + # regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.003 + base_lr_batch_size: 4096 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.003 + - name: cosine + start_value: 0.003 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: "" + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 4 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60215" + MACHINE: + DEVICE: gpu diff --git a/configs/config/pretrain/vision_transformer/supervised/supervised_16gpu_deit_b_example.yaml b/configs/config/pretrain/vision_transformer/supervised/supervised_16gpu_deit_b_example.yaml new file mode 100644 index 000000000..14dca695a --- /dev/null +++ b/configs/config/pretrain/vision_transformer/supervised/supervised_16gpu_deit_b_example.yaml @@ -0,0 +1,130 @@ +# @package _global_ +config: + HOOKS: + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 1 + DATA: + NUM_DATALOADER_WORKERS: 8 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 32 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: VisslRandAugment + magnitude: 9 + magnitude_std: 0.5 + increasing_severity: True + - name: ColorJitter + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - name: ToTensor + - name: RandomErasing + p: 0.25 + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: cutmixup_collator + COLLATE_FUNCTION_PARAMS: { + "mixup_alpha": 0.8, # mixup alpha value, mixup is active if > 0. + "cutmix_alpha": 1.0, # cutmix alpha value, cutmix is active if > 0. 
+ "prob": 1.0, # probability of applying mixup or cutmix per batch or element + "switch_prob": 0.5, # probability of switching to cutmix instead of mixup when both are active + "mode": "batch", # how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + "correct_lam": True, # apply lambda correction when cutmix bbox clipped by image borders + "label_smoothing": 0.1, # apply label smoothing to the mixed target tensor + "num_classes": 1000 # number of classes for target + } + + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 64 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + CLASSIFIER: token + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + DROP_PATH_RATE: 0.1 # stochastic depth dropout probability + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 1000]}], + ] + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 100 + # We don't want to regularize the position embedding or classification token + non_regularized_parameters: [pos_embedding, class_token] + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.0005 + base_lr_batch_size: 512 + name: composite + schedulers: + - name: linear + start_value: 0.00001 + end_value: 0.0005 + - name: cosine + start_value: 0.0005 + end_value: 0.000001 + interval_scaling: [rescaled, fixed] + update_interval: step + lengths: [0.05, 0.95] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 2 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60521" + MACHINE: + DEVICE: gpu + VERBOSE: True + LOG_FREQUENCY: 100 + TEST_ONLY: False + TEST_EVERY_NUM_EPOCH: 1 + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: fork diff --git a/configs/config/pretrain/vision_transformer/supervised/supervised_1gpu_vit_example.yaml b/configs/config/pretrain/vision_transformer/supervised/supervised_1gpu_vit_example.yaml new file mode 100644 index 000000000..69f760ef9 --- /dev/null +++ b/configs/config/pretrain/vision_transformer/supervised/supervised_1gpu_vit_example.yaml @@ -0,0 +1,125 @@ +# @package _global_ +config: + VERBOSE: True + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 6 + TRAIN: + DATA_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + LABEL_SOURCES: [disk_folder] + LABEL_TYPE: sample_index # just an implementation detail. 
Label isn't used + BATCHSIZE_PER_REPLICA: 128 + TRANSFORMS: + - name: ColorJitter + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - name: RandomResizedCrop + size: 224 + - name: VisslAutoAugment + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + - name: RandomErasing + p: 0.25 + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 256 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + DROP_PATH_RATE: 0.1 + HEAD: + PARAMS: [ + ["vision_transformer_head", {"in_plane": 768, "hidden_dim": 3072, + "num_classes": 1000}], + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: True + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + # USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 300 + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.0005 + base_lr_batch_size: 1024 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.0005 + - name: cosine + start_value: 0.0005 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.017, 0.983] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 4 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60215" + MACHINE: + DEVICE: gpu diff --git a/configs/config/pretrain/vision_transformer/swav/swav_deit_b_2nodes.yaml b/configs/config/pretrain/vision_transformer/swav/swav_deit_b_2nodes.yaml new file mode 100644 index 000000000..3c82d2dfb --- /dev/null +++ b/configs/config/pretrain/vision_transformer/swav/swav_deit_b_2nodes.yaml @@ -0,0 +1,129 @@ +# @package _global_ +config: + HOOKS: + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: "/checkpoint/ito/vision_transformer/1gpu_test" + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 1 + DATA: + NUM_DATALOADER_WORKERS: 8 + TRAIN: + DATA_SOURCES: [disk_folder] + # DATA_PATHS: [""] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + LABEL_TYPE: "zero" + BATCHSIZE_PER_REPLICA: 16 + TRANSFORMS: + - name: ImgPilToMultiCrop + total_num_crops: 2 + size_crops: [224] + num_crops: [2] + crop_scales: [[0.14, 1]] + - name: RandomHorizontalFlip + - name: VisslRandAugment + magnitude: 9 + magnitude_std: 0.5 + increasing_severity: True + - name: ColorJitter + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - name: ToTensor + - name: RandomErasing + p: 1 + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: cutmixup_collator + COLLATE_FUNCTION_PARAMS: { + "mixup_alpha": 1.0, # mixup alpha value, mixup is active if > 0. + "cutmix_alpha": 1.0, # cutmix alpha value, cutmix is active if > 0. 
+ "prob": 1.0, # probability of applying mixup or cutmix per batch or element + "switch_prob": 0.5, # probability of switching to cutmix instead of mixup when both are active + "mode": "batch", # how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + "correct_lam": True, # apply lambda correction when cutmix bbox clipped by image borders + "label_smoothing": 0.1, # apply label smoothing to the mixed target tensor + "num_classes": 1000 # number of classes for target + } + MODEL: + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 16 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + CLASSIFIER: token + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + DROP_PATH_RATE: 0.1 # stochastic depth dropout probability + HEAD: + PARAMS: [ + ["swav_head", {"dims": [768, 2048, 128], "use_bn": True, "num_clusters": + [3000]}], + ] + TEMP_FROZEN_PARAMS_ITER_MAP: [ + ['module.heads.0.prototypes0.weight', 313], + ] + LOSS: + name: swav_loss + swav_loss: + temperature: 0.1 + use_double_precision: False + normalize_last_layer: True + num_iters: 3 + epsilon: 0.05 + crops_for_assign: [0, 1] + queue: + queue_length: 0 + start_iter: 0 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 300 + # We don't want to regularize the position embedding or classification token + non_regularized_parameters: [pos_embedding, class_token] + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.0005 + base_lr_batch_size: 512 + name: composite + schedulers: + - name: linear + start_value: 0.0005 + end_value: 0.0005 + - name: cosine + start_value: 0.0005 + end_value: 0.000001 + interval_scaling: [rescaled, fixed] + update_interval: step + lengths: [0.05, 0.95] + METERS: + name: "" + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 2 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60521" + MACHINE: + DEVICE: gpu + VERBOSE: True + LOG_FREQUENCY: 100 + TEST_ONLY: False + TEST_EVERY_NUM_EPOCH: 1 + TEST_MODEL: False + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: fork diff --git a/extra_scripts/experiment_spreadsheet_from_logs.py b/extra_scripts/experiment_spreadsheet_from_logs.py new file mode 100644 index 000000000..735704607 --- /dev/null +++ b/extra_scripts/experiment_spreadsheet_from_logs.py @@ -0,0 +1,303 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import argparse +import ast +import collections +import os +import pathlib +import re +import typing + +import pandas as pd + + +def parse_log(log_path: str, args) -> dict: + config = parse_config_from_log(log_path) + # Check to make sure config not empty + if config: + config = flatten(config) + if args.parse_date_time: + date_time = None + try: + date_time = parse_date_time( + config[args.date_time_param], + args.date_time_pattern, + args.date_time_split_char, + ) + except BaseException: + pass + if not date_time: + print("Unable to parse date/time") + date_time = [None, None] + update_config_date_time(config, date_time) + return config + + +def parse_config_from_log(log_path: str) -> dict: + # String prepending beginning of config + config_start_split_on = r"hydra_config.py: \d*: " + # String at start of config + config_start = "{'CHECKPOINT': " + # String on final line of config + config_end = "'VERBOSE': " + config = "" + # Flag to indicate the config portion of the log has been read + config_finished = False + + # World size info from config is not reliable. 
Use the + # String prepending beginning of world size info + world_size_string = "WORLD_SIZE:" + world_size_btwn = ("WORLD_SIZE:\t", "\n") + world_size = None + + train_losses = [] + train_loss_str = "loss:" + loss_string_btwn = ("loss: ", ";") + + latest_epoch = 0 + epoch_string = "[ep: " + epoch_regex = r"(?<=\[ep: )\d{1,5}(?=\])" + + accuracies = { + "train": {"string": "train_accuracy_list_meter", "values": []}, + "test": {"string": "test_accuracy_list_meter", "values": []}, + } + + with open(log_path) as reader: + store_line = False + # # There are some logs in which the config is printed multiple times. + # # config_read_complete is used to avoid reading more than one config + # # printing. + # config_read_complete = False + for line in reader: + if not store_line: + if world_size_string in line: + world_size = line + if train_loss_str in line: + train_losses.append(line) + for partition in accuracies.keys(): + if accuracies[partition]["string"] in line: + accuracies[partition]["values"].append(line) + if not config_finished: + if config_start in line: + store_line = True + line = re.split(config_start_split_on, line)[1] + if store_line: + config += line + if config_end in line: + store_line = False + config_finished = True + if epoch_string in line: + epoch = re.search(epoch_regex, line) + if epoch: + latest_epoch = int(epoch.group(0)) + + if config: + # Parse into dict + try: + config = ast.literal_eval(config) + config = collections.OrderedDict(config) + except BaseException: + print("Unable to parse dictionary") + config = {} + # Add latest epoch to config + config["latest_epoch"] = latest_epoch + # Parse world size from string + try: + world_size = world_size.split(world_size_btwn[0])[1] + world_size = world_size.split(world_size_btwn[1])[0] + world_size = int(world_size) + # Add to dict + config["WORLD_SIZE"] = world_size + except BaseException: + print("Unable to parse world size") + try: + final_loss = train_losses[-1] + final_loss = final_loss.split(loss_string_btwn[0])[1] + final_loss = final_loss.split(loss_string_btwn[1])[0] + config["final_train_loss"] = final_loss + except BaseException: + print("Unable to parse final training loss") + for partition, partition_contents in accuracies.items(): + if partition_contents["values"]: + try: + final_accuracy_string = partition_contents["values"][-1] + for top_string in ["top_1", "top_5"]: + acc = final_accuracy_string.split("value")[1].split(top_string) + acc = acc[1].split("0: ")[1] + acc = acc.split("}")[0] + param_str = f"final_{partition}_accuracy_{top_string}" + config[param_str] = float(acc) + except BaseException: + print(f"Unable to parse final {partition} accuracy") + else: + print("No information parsed from log file") + config = {} + + return config + + +def flatten(d: collections.abc.MutableMapping, parent_key: str = "", sep: str = "."): + items = [] + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, collections.abc.MutableMapping): + items.extend(flatten(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return collections.OrderedDict(items) + + +def parse_date_time( + str_to_parse: str = None, pattern: str = None, split_char: str = None +): + instances = re.findall(pattern, str_to_parse) + if instances: + date_time = instances[0].split(split_char) + return date_time + + +def update_config_date_time( + config: collections.OrderedDict, date_time: typing.Union[list, tuple] +): + config["date"] = date_time[0] + config["time"] = date_time[1] + 
config.move_to_end("time", last=False) + config.move_to_end("date", last=False) + + +def get_latest_checkpoint(directory: pathlib.PosixPath, args: argparse.Namespace): + latest_checkpoint = None + checkpoint_files = list(directory.glob(f"*{args.checkpoint_id_pattern}*")) + if checkpoint_files: + latest_checkpoint = 0 + for checkpoint_file in checkpoint_files: + checkpoint_file = str(checkpoint_file).split("/")[-1] + checkpoint_epoch = re.findall( + args.checkpoint_extract_pattern, checkpoint_file + ) + checkpoint_epoch = int(checkpoint_epoch[0]) + if checkpoint_epoch > latest_checkpoint: + latest_checkpoint = checkpoint_epoch + pass + else: + print("Unable to parse latest checkpoint information") + return latest_checkpoint + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--root_directory", + nargs="*", + type=str, + help="Directory or directories containing experiment " + "run or subdirectories of runs", + ) + parser.add_argument( + "--output_directory", + type=str, + default=os.getcwd(), + help="Where to save output.", + ) + parser.add_argument( + "--output_name", type=str, default="experiments.txt", help="Output filename" + ) + parser.add_argument( + "--parse_date_time", + type=bool, + default=True, + help="Parse date and time from config", + ) + parser.add_argument( + "--date_time_param", + type=str, + default="CHECKPOINT.DIR", + help="config param from whose value the date and time will be parsed", + ) + parser.add_argument( + "--date_time_pattern", + type=str, + default="[0-9]{4}-[0-9][0-9]-[0-9][0-9]/[0-9][0-9]-[0-9][0-9]-[0-9][0-9]", + help="Regex pattern for date and time format", + ) + parser.add_argument( + "--date_time_split_char", + type=str, + default="/", + help="character to split date and time string into " "separate strings", + ) + parser.add_argument( + "--log_file_name_pattern", + type=str, + default="log.txt", + help="pattern to match for log " "file names", + ) + parser.add_argument( + "--parse_checkpoint", + type=bool, + default=True, + help="Parse # training epochs from checkpoint file", + ) + parser.add_argument( + "--checkpoint_id_pattern", + type=str, + default="_phase", + help="pattern to match for " "checkpoint file names", + ) + parser.add_argument( + "--checkpoint_extract_pattern", + type=str, + default=r"phase([0-9]{1,4})\.torch", + help="pattern to extract epoch # from checkpoint file name", + ) + + args = parser.parse_args() + + log_files = [] + for directory in args.root_directory: + log_files.extend( + list(pathlib.Path(directory).rglob(args.log_file_name_pattern)) + ) + + configs_to_concat = [] + for f in log_files: + did_not_add = True + print(f"\nParsing {f}") + config = parse_log(str(f), args) + if args.parse_checkpoint: + last_checkpoint = get_latest_checkpoint(f.parent, args) + config["last_checkpoint_phase"] = last_checkpoint + if args.parse_checkpoint and config["last_checkpoint_phase"]: + configs_to_concat.append(config) + did_not_add = False + elif not args.parse_checkpoint: + configs_to_concat.append(config) + did_not_add = False + if did_not_add: + print(f"Did not add\n{f}\nto file") + if not did_not_add: + print(f"Added \n{f}\nto file") + df = pd.DataFrame(configs_to_concat) + # Sort columns + df = df.reindex(sorted(df.columns), axis=1) + # Move specific columns to beginning. Columns are listed here in reverse + # order. The final item in the list will be the first column. 
+ prepend_columns = [ + "final_train_loss", + "final_test_loss", + "final_train_accuracy_top_1", + "final_train_accuracy_top_5", + "final_test_accuracy_top_1", + "final_test_accuracy_top_5", + "latest_epoch", + "last_checkpoint_phase", + "time", + "date", + ] + for prepend_column in prepend_columns: + if prepend_column in df.columns: + df.insert(0, prepend_column, df.pop(prepend_column)) + output_full_path = os.path.join(args.output_directory, args.output_name) + df.to_csv(output_full_path) + print(f"Saved {output_full_path}") diff --git a/tests/test_mlp.py b/tests/test_mlp.py index 05bd70e22..1e5a226dd 100644 --- a/tests/test_mlp.py +++ b/tests/test_mlp.py @@ -49,9 +49,7 @@ def test_mlp_catch_bad_shapes(self): def test_eval_mlp_shape(self): eval_mlp = LinearEvalMLP( - self.MODEL_CONFIG, - in_channels=2048, - dims=[2048 * 2 * 2, 1000], + self.MODEL_CONFIG, in_channels=2048, dims=[2048 * 2 * 2, 1000] ) resnet_feature_map = torch.randn(size=(4, 2048, 2, 2)) diff --git a/vissl/config/defaults.yaml b/vissl/config/defaults.yaml index 0e350fbe4..880b75f29 100644 --- a/vissl/config/defaults.yaml +++ b/vissl/config/defaults.yaml @@ -166,10 +166,12 @@ config: DATA_PATHS: [] LABEL_SOURCES: [] LABEL_PATHS: [] - # either standard | sample_index + # either standard | sample_index | zero # sample_index is a common practice in self-supervised learning and sample_index = id of the # sample in the data. # standard label type is used for supervised learning and user specifis the labels to use. + # zero sets all labels to 0, which is necessary when using necessary + # when cutmixup_collator is being used for self-supervised training. LABEL_TYPE: "standard" # whether to memory map the input data. MMAP_MODE: True @@ -180,11 +182,69 @@ config: # number of unique samples in minibatch per gpu (or per device) BATCHSIZE_PER_REPLICA: 256 # list of data transforms to apply on the data + # Example: using RandAugment (https://arxiv.org/abs/1909.13719) + # :param magnitude: integer magnitude of rand augment + # :param magnitude_std: standard deviation of magnitude. If > 0, + # introduces random variability in the augmentation magnitude. + # :param num_layers: integer number of transforms + # :param increasing_severity: boolean that indicates whether to use + # augmentations that increase severity w/ increasing magnitude. Some + # augmentations do this by default. + # :param choice_weights: Index of pre-determined probability distribution + # over augmentations. Currently only one such distribution available (i.e. + # no valid values other than 0 or None), unclear if beneficial. Default = + # None. + # TRANSFORMS: + # - name: VisslRandAugment + # magnitude: 9 + # magnitude_std: 0.5 + # num_layers: 2 + # increasing_severity: True + # + # + # Example: using AutoAugment (https://arxiv.org/abs/1805.09501). This + # autoaugment differs from the torchvision implementation by allowing + # variability in the augmentation intensity. + # ":param policy_name: String. One of 'v0', 'v0r', 'original', 'originalr'. + # One of a set of learned augmentation sequences. + # :param magnitude_std: standard deviation of magnitude. If > 0, introduces + # random variability in the augmentation magnitude. + # TRANSFORMS: + # - name: VisslAutoAugment + # policy_name: v0 + # magnitude_std: 0 TRANSFORMS: [] # collator to use: either pytorch default or user defined custom collator. 
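+ # Example (illustrative sketch): minimal collator settings for a
+ # self-supervised run with CutMix/MixUp; the values are placeholders. See
+ # the notes and the COLLATE_FUNCTION_PARAMS example below for full options.
+ # LABEL_TYPE: zero                      # required for SSL cutmix/mixup
+ # COLLATE_FUNCTION: "cutmixup_collator"
+ # COLLATE_FUNCTION_PARAMS: {"ssl_method": "simclr"}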
+ # Using the cutmixup_collator in a supervised setting requires the use + # of the cross_entropy_multiple_output_single_target loss (see LOSS + # section below in order to accomodate label-smoothing. Using the + # cutmixup_collator in a self-supervised setting requires setting + # DATA.{TRAIN/TEST}.LABEL_TYPE: zero COLLATE_FUNCTION: "default_collate" # parameters taken by the collator function (if any). COLLATE_FUNCTION_PARAMS: {} + # Example: params for cutmixup_collator to implement CutMix and MixUp + # COLLATE_FUNCTION: "cutmixup_collator" + # COLLATE_FUNCTION_PARAMS: { + # # Adjust collator output to accomodate SSL method. + # # Currently supports "moco" or "simclr". + # # No argument needed if using vissl or supervised. + # "ssl_method": "moco" + # "mixup_alpha": 1.0, # mixup alpha value, mixup is active if > 0. + # "cutmix_alpha": 0.0, # cutmix alpha value, cutmix is active if > 0. + # "cutmix_minmax": None, # cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None. + # "prob": 1.0, # probability of applying mixup or cutmix per batch or element + # "switch_prob": 0.5, # probability of switching to cutmix instead of mixup when both are active + # "mode": "batch", # how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + # "correct_lam": True, # apply lambda correction when cutmix bbox clipped by image borders + # "label_smoothing": 0.1, # apply label smoothing to the mixed target tensor + # "num_classes": 1000 # number of classes for target + # } + # Also note that using the CutMixUp collator in a supervised context + # requires using the cross_entropy_multiple_output_single_target to + # accomodate the smoothed labels. See + # LOSS.cross_entropy_multiple_output_single_target for more information. + # # limit the amount of data used in training. If set to -1, full dataset is used. DATA_LIMIT: -1 # whether the data specified (whether file list or directory) should be copied locally @@ -352,6 +412,14 @@ config: # ] LINEAR_EVAL_FEAT_POOL_OPS_MAP: [] # ----------------------------------------------------------------------------------- # + # GRADIENT CLIPPING. Used by Dosovitskiy et al. in their Vision + # Transformer paper. + # ----------------------------------------------------------------------------------- # + GRAD_CLIP: # See TORCH.NN.UTILS.CLIP_GRAD_NORM_ + USE_GRAD_CLIP: False + NORM_TYPE: 2 # Float, int, or 'inf' + MAX_NORM: 1 + # ----------------------------------------------------------------------------------- # # MODEL TRUNK # ----------------------------------------------------------------------------------- # TRUNK: @@ -364,7 +432,12 @@ config: RESNETS: DEPTH: 50 WIDTH_MULTIPLIER: 1 - NORM: BatchNorm # BatchNorm | LayerNorm + NORM: BatchNorm # BatchNorm | LayerNorm | GroupNorm + # If using GroupNorm, this sets number of groups. Recommend 32 as a + # naive suggestion. GroupNorm only available for ResNe(X)t. + GROUPNORM_GROUPS: 32 + # Use weight-standardized convolutions + STANDARDIZE_CONVOLUTIONS: False GROUPS: 1 ZERO_INIT_RESIDUAL: False WIDTH_PER_GROUP: 64 @@ -383,6 +456,35 @@ config: # RegNet params # ------------------------------------------------------------- # REGNET: {} + + # ------------------------------------------------------------- # + # Vision Transformer/DeiT params. Using a name will + # override/ignore all other VISION_TRANSFORMERS parameters. Named + # options include vit_b_32, vit_b_16, vit_l_32, vit_l_16, vit_h_14. 
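+ # Example (illustrative): select a named variant instead of filling in the
+ # individual fields below, e.g.
+ # VISION_TRANSFORMERS:
+ #   name: vit_b_16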
+ # Using + # ------------------------------------------------------------- # + VISION_TRANSFORMERS: + name: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + # MLP and projection layer dropout rate + DROPOUT_RATE: 0 + # Attention dropout rate + ATTENTION_DROPOUT_RATE: 0 + # Use the token for classification. Currently no alternatives + # supported + CLASSIFIER: token + # Stochastic depth dropout rate. Turning on stochastic depth and + # using aggressive augmentation is essentially the difference + # between a DeiT and a ViT. + DROP_PATH_RATE: 0 + QKV_BIAS: False # Bias for QKV in attention layers. + QK_SCALE: False # Scale + # ----------------------------------------------------------------------------------- # # MODEL HEAD # ----------------------------------------------------------------------------------- # @@ -511,7 +613,16 @@ config: ignore_index: -1 # ----------------------------------------------------------------------------------- # - # Cross-Entropy Loss for multiple input and same target + # Cross-Entropy Loss for multiple outputs and same target. For a single + # output, this is equivalent to the cross-entropy loss. For multiple + # outputs, this computes the sum of the cross-entropy losses for each + # tensor in the list against the target. Can also accomodate target + # vectors in addition to single integer targets, for example when using + # label smoothing. Note that the internally, cross_entropy_multiple_output_single_target + # determines whether each sample is associated with a single target or + # whether each sample is associated with a target vector, and uses vanilla + # CrossEntropyLoss for the single-target case and a custom cross entropy + # function for the multi-target case. # ----------------------------------------------------------------------------------- # cross_entropy_multiple_output_single_target: weight: null @@ -663,10 +774,17 @@ config: nesterov: False # for how many epochs to do training. only counts training epochs. num_epochs: 90 + betas: [.9, .999] # for Adam/AdamW # whether to regularize batch norm. if set to False, weight decay of batch norm params is 0. regularize_bn: False # whether to regularize bias parameter. if set to False, weight decay of bias params is 0. regularize_bias: True + # Parameters to omit from regularization. Any named parameter whose name + # contains any of these strings will be omitted from regularization. + # For example, we don't want to regularize the class token or position + # embeddings in the vision transformer, so we pass: + # non_regularized_parameters: ['class_token', 'pos_embedding'] + non_regularized_parameters: [] # we support using a different LR and weight decay for head and trunk. # one needs to set the flag "use_different_values: True" in order to enable # this functionality. We use the same type of param scheduler for the trunk and head @@ -735,6 +853,7 @@ config: end_value: 0.0 # =====constant learning rate specific ======= value: 0.1 + # ----------------------------------------------------------------------------------- # # CLUSTERFIT APPROACH (https://arxiv.org/abs/1912.03330) # ----------------------------------------------------------------------------------- # diff --git a/vissl/data/collators/cutmixup_collator.py b/vissl/data/collators/cutmixup_collator.py new file mode 100644 index 000000000..4f5bfb394 --- /dev/null +++ b/vissl/data/collators/cutmixup_collator.py @@ -0,0 +1,502 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +""" +This implementation is based on +https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/mixup.py, +published under an Apache License 2.0, with modifications by Matthew Leavitt +(ito@fb.com; matthew.l.leavitt@gmail.com). Modifications are described here and +notated where present in the code. + +Modifications: +- _mix_batch.__call__() now checks device of data its passed, and passes +device argument accordingly. Previous behavior allowed called functions to +default to using cuda, which caused an error when using CPU-based data. + +COMMENT FROM ORIGINAL: +Mixup and Cutmix +Papers: +mixup: Beyond Empirical Risk Minimization (https://arxiv.org/abs/1710.09412) +CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features (https://arxiv.org/abs/1905.04899) # NOQA +Code Reference: +CutMix: https://github.com/clovaai/CutMix-PyTorch +Hacked together by / Copyright 2020 Ross Wightman +""" + +import collections.abc as abc +from typing import Any, Dict, Optional + +import numpy as np +import torch +from classy_vision.generic.util import convert_to_one_hot +from torch.distributions.beta import Beta +from vissl.data.collators import register_collator + +from .moco_collator import moco_collator +from .simclr_collator import simclr_collator + + +# TODO: Uncomment in future update when calling via ClassyVision +# from classy_vision.dataset.transforms import mixup as classy_cutmixup + + +# Modification/addition +@register_collator("cutmixup_collator") +def cutmixup_collator(batch, **kwargs): + """ + This collator implements CutMix (https://arxiv.org/abs/1905.04899) and/or + MixUp (https://arxiv.org/abs/1710.09412) via ClassyVision's + implementation (link when publicly available). + + kwargs: + :mixup_alpha (float): mixup alpha value, mixup is active if > 0. + :cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. + :cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active + and uses this vs alpha if not None. + :prob (float): probability of applying mixup or cutmix per batch or element + :switch_prob (float): probability of switching to cutmix instead of mixup + when both are active + :mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of + elements), 'elem' (element) + :correct_lam (bool): apply lambda correction when cutmix bbox clipped by + image borders + :label_smoothing (float): apply label smoothing to the mixed target tensor + :num_classes (int): number of classes for target + + + The collators collates the batch for the following input (assuming k-copies of image): + + Input: + batch: Example + batch = [ + {"data" : [img1_0, ..., img1_k], ..}, + {"data" : [img2_0, ..., img2_k], ...}, + ... + ] + + Returns: Example output: + output = { + "data": torch.tensor([img1_0, ..., imgN_0], + [img1_k, ..., imgN_k]) .. + } + """ + assert "data" in batch[0], "data not found in sample" + assert "label" in batch[0], "label not found in sample" + + data = [x["data"] for x in batch] + labels = [torch.tensor(x["label"]) for x in batch] + data_valid = [torch.tensor(x["data_valid"]) for x in batch] + data_idx = [torch.tensor(x["data_idx"]) for x in batch] + num_duplicates, num_images = len(data[0]), len(data) + + # Determine ssl method and adjust collator output accordingly + ssl_method = None + if "ssl_method" in kwargs.keys(): + ssl_method = kwargs.pop("ssl_method") + + # Instantiate CutMix + Mixup (CutMixUp!) 
object + cutmixup_transform_obj = Mixup(**kwargs) + # TODO: Uncomment in future update when calling via ClassyVision + # cutmixup_transform_obj = classy_cutmixup.Mixup(**kwargs) + + output_data, output_label, output_data_valid, output_data_idx = [], [], [], [] + for pos in range(num_duplicates): + cutmixup_data, cutmixup_labels = [], [] + for idx in range(num_images): + cutmixup_data.append(data[idx][pos]) + cutmixup_labels.append(labels[idx][pos]) + output_data_valid.append(data_valid[idx][pos]) + output_data_idx.append(data_idx[idx][pos]) + # Get data and labels into format accepted by Mixup + cutmixup_data = torch.stack(cutmixup_data) + cutmixup_labels = torch.tensor(cutmixup_labels) + cutmixup_output = cutmixup_transform_obj( + {"input": cutmixup_data, "target": cutmixup_labels} + ) + output_data.append(cutmixup_output["input"]) + output_label.append(cutmixup_output["target"]) + + # If using moco or simclr, first restructure the data back into the form + # in which it was originally input, then call the collator for that ssl + # method + if ssl_method == "moco" or ssl_method == "simclr": + output_batch = data_back_to_input_form( + output_data, output_label, output_data_valid, output_data_idx + ) + if ssl_method == "moco": + return moco_collator(output_batch) + elif ssl_method == "simclr": + return simclr_collator(output_batch) + output_batch = { + "data": [output_data], + "label": [torch.cat(output_label)], + "data_valid": [torch.stack(output_data_valid)], + "data_idx": [torch.stack(output_data_idx)], + } + return output_batch + + +# Modification/addition +def data_back_to_input_form(data, labels, data_valid, data_idx): + """ + "De"-collates data back into their form when originally passed. + """ + assert len(data) == len(labels) + assert len(data_idx) == len(data_valid) + data_input_form = [] + num_duplicates, num_images = len(data), len(data[0]) + for sample_i in range(num_images): + sample_input_form = {"data": [], "data_valid": [], "data_idx": [], "label": []} + for duplicate_i in range(num_duplicates): + sample_input_form["data"].append(data[duplicate_i][sample_i]) + sample_input_form["label"].append(labels[duplicate_i][sample_i]) + valid_and_idx_i = sample_i + (num_duplicates * duplicate_i) + sample_input_form["data_idx"].append(data_idx[valid_and_idx_i]) + sample_input_form["data_valid"].append(data_valid[valid_and_idx_i]) + data_input_form.append(sample_input_form) + return data_input_form + + +# TODO: Delete everything from here down in future update when calling via +# ClassyVision +# Everything from here down is copied directly from +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py +# unless otherwise noted. +def _recursive_mixup(sample: Any, permuted_indices: torch.Tensor, coeff: float): + if isinstance(sample, (tuple, list)): + mixed_sample = [] + for s in sample: + mixed_sample.append(_recursive_mixup(s, permuted_indices, coeff)) + + return mixed_sample if isinstance(sample, list) else tuple(mixed_sample) + elif isinstance(sample, abc.Mapping): + mixed_sample = {} + for key, val in sample.items(): + mixed_sample[key] = _recursive_mixup(val, permuted_indices, coeff) + + return mixed_sample + else: + assert torch.is_tensor(sample), "sample is expected to be a pytorch tensor" + # Assume training data is at least 3D tensor (i.e. 1D data). We only + # mixup content data tensor (e.g. video clip, audio spectrogram), and skip + # other tensors, such as frame_idx and timestamp in video clip samples. 
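+ # Content tensors (e.g. a batch of images or spectrograms) are at least
+ # 3-D including the batch dimension; they are replaced by the mixup convex
+ # combination coeff * x + (1 - coeff) * x[permuted_indices], while
+ # lower-dimensional tensors are returned unchanged.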
+ if sample.ndim >= 3: + sample = coeff * sample + (1.0 - coeff) * sample[permuted_indices, :] + + return sample + + +class MixupTransform: + """ + This implements the mixup data augmentation in the paper + "mixup: Beyond Empirical Risk Minimization" (https://arxiv.org/abs/1710.09412) + """ + + def __init__(self, alpha: float, num_classes: Optional[int] = None): + """ + Args: + alpha: the hyperparameter of Beta distribution used to sample mixup + coefficient. + num_classes: number of classes in the dataset. + """ + self.alpha = alpha + self.num_classes = num_classes + + def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]: + """ + Args: + sample: the batch data. + """ + if sample["target"].ndim == 1: + assert self.num_classes is not None, "num_classes is expected for 1D target" + sample["target"] = convert_to_one_hot( + sample["target"].view(-1, 1), self.num_classes + ) + else: + assert sample["target"].ndim == 2, "target tensor shape must be 1D or 2D" + + c = Beta(self.alpha, self.alpha).sample().to(device=sample["target"].device) + permuted_indices = torch.randperm(sample["target"].shape[0]) + + sample["target"] = ( + c * sample["target"] + (1.0 - c) * sample["target"][permuted_indices, :] + ) + sample["input"] = _recursive_mixup(sample["input"], permuted_indices, c) + + return sample + + +def one_hot(x, num_classes, on_value=1.0, off_value=0.0, device="cuda"): + x = x.long().view(-1, 1) + return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_( + 1, x, on_value + ) + + +def mixup_target(target, num_classes, lam=1.0, smoothing=0.0, device="cuda"): + off_value = smoothing / num_classes + on_value = 1.0 - smoothing + off_value + y1 = one_hot( + target, num_classes, on_value=on_value, off_value=off_value, device=device + ) + y2 = one_hot( + target.flip(0), + num_classes, + on_value=on_value, + off_value=off_value, + device=device, + ) + return y1 * lam + y2 * (1.0 - lam) + + +def rand_bbox(img_shape, lam, margin=0.0, count=None): + """Standard CutMix bounding-box + Generates a random square bbox based on lambda value. This impl includes + support for enforcing a border margin as percent of bbox dimensions. + Args: + img_shape (tuple): Image shape as tuple + lam (float): Cutmix lambda value + margin (float): Percentage of bbox dimension to enforce as margin + (reduce amount of box outside image) + count (int): Number of bbox to generate + """ + ratio = np.sqrt(1 - lam) + img_h, img_w = img_shape[-2:] + cut_h, cut_w = int(img_h * ratio), int(img_w * ratio) + margin_y, margin_x = int(margin * cut_h), int(margin * cut_w) + cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count) + cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count) + yl = np.clip(cy - cut_h // 2, 0, img_h) + yh = np.clip(cy + cut_h // 2, 0, img_h) + xl = np.clip(cx - cut_w // 2, 0, img_w) + xh = np.clip(cx + cut_w // 2, 0, img_w) + return yl, yh, xl, xh + + +def rand_bbox_minmax(img_shape, minmax, count=None): + """Min-Max CutMix bounding-box + Inspired by Darknet cutmix impl, generates a random rectangular bbox + based on min/max percent values applied to each dimension of the input image. + Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 + range for max. 
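+ For example (illustrative), with minmax=(0.2, 0.8) on a 224x224 image the
+ cut height and width are each drawn uniformly from
+ [int(0.2 * 224), int(0.8 * 224)) = [44, 179) pixels.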
+ Args: + img_shape (tuple): Image shape as tuple + minmax (tuple or list): Min and max bbox ratios (as percent of image + size) + count (int): Number of bbox to generate + """ + assert len(minmax) == 2 + img_h, img_w = img_shape[-2:] + cut_h = np.random.randint( + int(img_h * minmax[0]), int(img_h * minmax[1]), size=count + ) + cut_w = np.random.randint( + int(img_w * minmax[0]), int(img_w * minmax[1]), size=count + ) + yl = np.random.randint(0, img_h - cut_h, size=count) + xl = np.random.randint(0, img_w - cut_w, size=count) + yu = yl + cut_h + xu = xl + cut_w + return yl, yu, xl, xu + + +def cutmix_bbox_and_lam( + img_shape, lam, ratio_minmax=None, correct_lam=True, count=None +): + """Generate bbox and apply lambda correction.""" + if ratio_minmax is not None: + yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count) + else: + yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count) + if correct_lam or ratio_minmax is not None: + bbox_area = (yu - yl) * (xu - xl) + lam = 1.0 - bbox_area / float(img_shape[-2] * img_shape[-1]) + return (yl, yu, xl, xu), lam + + +class Mixup: + """Mixup/Cutmix that applies different params to each element or whole batch + Args: + mixup_alpha (float): mixup alpha value, mixup is active if > 0. + cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. + cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is + active and uses this vs alpha if not None. + prob (float): probability of applying mixup or cutmix per batch or + element + switch_prob (float): probability of switching to cutmix instead of + mixup when both are active + mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair + of elements), 'elem' (element) + correct_lam (bool): apply lambda correction when cutmix bbox clipped by + image borders + label_smoothing (float): apply label smoothing to the mixed target + tensor + num_classes (int): number of classes for target + """ + + def __init__( + self, + mixup_alpha=1.0, + cutmix_alpha=0.0, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode="batch", + correct_lam=True, + label_smoothing=0.1, + num_classes=1000, + ): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if self.cutmix_minmax is not None: + assert len(self.cutmix_minmax) == 2 + # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = ( + correct_lam # correct lambda based on clipped area for cutmix + ) + self.mixup_enabled = ( + True # set to false to disable mixing (intended tp be set by train loop) + ) + + def _params_per_elem(self, batch_size): + lam = np.ones(batch_size, dtype=np.float32) + use_cutmix = np.zeros(batch_size, dtype=np.bool) + if self.mixup_enabled: + if self.mixup_alpha > 0.0 and self.cutmix_alpha > 0.0: + use_cutmix = np.random.rand(batch_size) < self.switch_prob + lam_mix = np.where( + use_cutmix, + np.random.beta( + self.cutmix_alpha, self.cutmix_alpha, size=batch_size + ), + np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size), + ) + elif self.mixup_alpha > 0.0: + lam_mix = np.random.beta( + self.mixup_alpha, self.mixup_alpha, size=batch_size + ) + elif self.cutmix_alpha > 0.0: + use_cutmix = np.ones(batch_size, dtype=np.bool) + lam_mix = np.random.beta( + self.cutmix_alpha, self.cutmix_alpha, size=batch_size + ) + else: + 
assert AssertionError, ( + "One of mixup_alpha > 0., cutmix_alpha > 0.," + "cutmix_minmax not None should be true." + ) + lam = np.where( + np.random.rand(batch_size) < self.mix_prob, + lam_mix.astype(np.float32), + lam, + ) + return lam, use_cutmix + + def _params_per_batch(self): + lam = 1.0 + use_cutmix = False + if self.mixup_enabled and np.random.rand() < self.mix_prob: + if self.mixup_alpha > 0.0 and self.cutmix_alpha > 0.0: + use_cutmix = np.random.rand() < self.switch_prob + lam_mix = ( + np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + if use_cutmix + else np.random.beta(self.mixup_alpha, self.mixup_alpha) + ) + elif self.mixup_alpha > 0.0: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.cutmix_alpha > 0.0: + use_cutmix = True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + else: + assert AssertionError, ( + "One of mixup_alpha > 0., cutmix_alpha > 0.," + "cutmix_minmax not None should be true." + ) + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_elem(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.0: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam, + ) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_pair(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.0: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam, + ) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + x[j] = x[j] * lam + x_orig[i] * (1 - lam) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_batch(self, x): + lam, use_cutmix = self._params_per_batch() + if lam == 1.0: + return 1.0 + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x.shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam, + ) + x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh] + else: + x_flipped = x.flip(0).mul_(1.0 - lam) + x.mul_(lam).add_(x_flipped) + return lam + + def __call__(self, sample): + x = sample["input"] + target = sample["target"] + assert len(x) % 2 == 0, "Batch size should be even when using this" + if self.mode == "elem": + lam = self._mix_elem(x) + elif self.mode == "pair": + lam = self._mix_pair(x) + else: + lam = self._mix_batch(x) + # Modified to pass device argument based on target.device to prevent + # failure on CPU-based data. 
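+ # mixup_target one-hot encodes the integer targets, applies label
+ # smoothing, and blends each target with that of the sample it was mixed
+ # with, using the same lambda(s) applied to the inputs above.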
+ target = mixup_target( + target, self.num_classes, lam, self.label_smoothing, device=target.device + ) + return {"input": x, "target": target} diff --git a/vissl/data/ssl_dataset.py b/vissl/data/ssl_dataset.py index b35a631c5..2e966b9db 100644 --- a/vissl/data/ssl_dataset.py +++ b/vissl/data/ssl_dataset.py @@ -235,6 +235,15 @@ def __getitem__(self, idx): item["data_idx"].append(idx) item["data_valid"].append(1 if valid else -1) + # There are three types of label_type (data labels): "standard", + # "sample_index", and "zero". "standard" uses the labels associated + # with a data set (e.g. directory names). "sample_index" assigns each + # sample a label that corresponds to that sample's index in the + # dataset (first sample will have label == 0, etc.), and is used for + # SSL tasks in which the label is arbitrary. "zero" assigns + # each sample the label == 0, which is necessary when using the + # CutMixUp collator because of the label smoothing that is built in + # to its functionality. if (len(self.label_objs) > 0) or self.label_type == "standard": item["label"] = [] for source in self.label_objs: @@ -247,6 +256,10 @@ def __getitem__(self, idx): item["label"] = [] for _ in range(len(self.data_objs)): item["label"].append(idx) + elif self.label_type == "zero": + item["label"] = [] + for _ in range(len(self.data_objs)): + item["label"].append(0) else: raise ValueError(f"Unknown label type: {self.label_type}") diff --git a/vissl/data/ssl_transforms/rand_auto_aug.py b/vissl/data/ssl_transforms/rand_auto_aug.py new file mode 100644 index 000000000..00b49f3f4 --- /dev/null +++ b/vissl/data/ssl_transforms/rand_auto_aug.py @@ -0,0 +1,721 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +""" +This implementation is based on +https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py, +pulished under an Apache License 2.0, with modifications by Matthew Leavitt ( +ito@fb.com; matthew.l.leavitt@gmail.com). Modifications are described here and +notated where present in the code. + +Modifications: +-Removed AugMix functionality. +-Replaced AutoAugment and RandAugment classes, which are no longer passed a +single parameter string that needs to be parsed, but instead individual, +named parameters. + +COMMENT FROM ORIGINAL: +AutoAugment, RandAugment, and AugMix for PyTorch +This code implements the searched ImageNet policies with various tweaks and +improvements and does not include any of the search code. AA and RA +Implementation adapted from: + https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py +AugMix adapted from: + https://github.com/google-research/augmix +Papers: + AutoAugment: Learning Augmentation Policies from Data + https://arxiv.org/abs/1805.09501 + Learning Data Augmentation Strategies for Object Detection + https://arxiv.org/abs/1906.11172 + RandAugment: Practical automated data augmentation... 
+ https://arxiv.org/abs/1909.13719 + AugMix: A Simple Data Processing Method to Improve Robustness and + Uncertainty https://arxiv.org/abs/1912.02781 + +Hacked together by / Copyright 2020 Ross Wightman +""" +import math +import random +import re + +import numpy as np +import PIL +from classy_vision.dataset.transforms import register_transform +from classy_vision.dataset.transforms.classy_transform import ClassyTransform +from PIL import Image, ImageEnhance, ImageOps + + +# TODO: Uncomment in future update when calling via ClassyVision +# from classy_vision.dataset.transforms.timm_autoaugment import \ +# _RAND_TRANSFORMS, _RAND_INCREASING_TRANSFORMS, rand_augment_ops, \ +# _HPARAMS_DEFAULT, _select_rand_weights, auto_augment_policy + + +# TODO: Delete in future update when calling via ClassyVision +_PIL_VER = tuple(int(x) for x in PIL.__version__.split(".")[:2]) + +# TODO: Delete in future update when calling via ClassyVision +_FILL = (128, 128, 128) + +# TODO: Delete in future update when calling via ClassyVision +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10.0 + +# TODO: Delete in future update when calling via ClassyVision +_HPARAMS_DEFAULT = {"translate_const": 250, "img_mean": _FILL} + +# TODO: Delete in future update when calling via ClassyVision +_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + + +# Modification/Addition +@register_transform("VisslRandAugment") +class RandAugment(ClassyTransform): + """ + Create a RandAugment transform. + :param magnitude: integer magnitude of rand augment + :param magnitude_std: standard deviation of magnitude. If > 0, introduces + random variability in the augmentation magnitude. + :param num_layers: integer number of transforms + :param increasing_severity: boolean that indicates whether to use + augmentations that increase severity w/ increasing magnitude. Some + augmentations do this by default. + :param weight_choice: Index of pre-determined probability distribution + over augmentations. Currently only one such distribution available (i.e. + no valid values other than 0 or None), unclear if beneficial. Default = + None. + """ + + def __init__( + self, + magnitude=10, + magnitude_std=0, + num_layers=2, + increasing_severity=False, + weight_choice=None, + **kwargs + ): + hparams = kwargs + hparams.update(_HPARAMS_DEFAULT) + hparams["magnitude_std"] = magnitude_std + if increasing_severity: + transforms = _RAND_INCREASING_TRANSFORMS + else: + transforms = _RAND_TRANSFORMS + self.num_layers = num_layers + self.choice_weights = ( + None if weight_choice is None else _select_rand_weights(weight_choice) + ) + self.ops = rand_augment_ops( + magnitude=magnitude, hparams=hparams, transforms=transforms + ) + + def __call__(self, img): + # no replacement when using weighted choice + ops = np.random.choice( + self.ops, + self.num_layers, + replace=self.choice_weights is None, + p=self.choice_weights, + ) + for op in ops: + img = op(img) + return img + + +# Modification/Addition +@register_transform("VisslAutoAugment") +class AutoAugment(ClassyTransform): + """ + Create a AutoAugment transform. This autoaugment differs from the + torchvision implementation by allowing variability in the augmentation + intensity. + ":param policy_name: String. One of 'v0', 'v0r', 'original', 'originalr'. + One of a set of learned augmentation sequences. + :param magnitude_std: standard deviation of magnitude. If > 0, introduces + random variability in the augmentation magnitude. 
+ :kwargs: Other params for the AutoAugmentation scheme. See RandAugment + class above, or AugmentOp class in ClassyVision. Probability and + intensity are overwritten because they're determined by the learned + AutoAugment policy. + """ + + def __init__(self, policy_name="v0", magnitude_std=0, **kwargs): + hparams = kwargs + hparams.update(_HPARAMS_DEFAULT) + hparams["magnitude_std"] = magnitude_std + self.policy = auto_augment_policy(policy_name, hparams=hparams) + + def __call__(self, img): + sub_policy = random.choice(self.policy) + for op in sub_policy: + img = op(img) + return img + + +# TODO: Delete everything from here down in future update when calling via +# ClassyVision +# Everything from here down is copied directly from +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py +def _interpolation(kwargs): + interpolation = kwargs.pop("resample", Image.BILINEAR) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + + +def _check_args_tf(kwargs): + if "fillcolor" in kwargs and _PIL_VER < (5, 0): + kwargs.pop("fillcolor") + kwargs["resample"] = _interpolation(kwargs) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), **kwargs) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), **kwargs) + + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform( + -rotn_center[0] - post_trans[0], -rotn_center[1] - post_trans[1], matrix + ) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs["resample"]) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: 
+ return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, negate the value""" + return -v if random.random() > 0.5 else v + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = (level / _MAX_LEVEL) * 30.0 + level = _randomly_negate(level) + return (level,) + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return ((level / _MAX_LEVEL) * 1.8 + 0.1,) + + +def _enhance_increasing_level_to_arg(level, _hparams): + # the 'no change' level is 1.0, moving away from that towards 0. or 2.0 + # increases the enhancement blend range [0.1, 1.9] + level = (level / _MAX_LEVEL) * 0.9 + level = 1.0 + _randomly_negate(level) + return (level,) + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _MAX_LEVEL) * 0.3 + level = _randomly_negate(level) + return (level,) + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams["translate_const"] + level = (level / _MAX_LEVEL) * float(translate_const) + level = _randomly_negate(level) + return (level,) + + +def _translate_rel_level_to_arg(level, hparams): + # default range [-0.45, 0.45] + translate_pct = hparams.get("translate_pct", 0.45) + level = (level / _MAX_LEVEL) * translate_pct + level = _randomly_negate(level) + return (level,) + + +def _posterize_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + # intensity/severity of augmentation decreases with level + return (int((level / _MAX_LEVEL) * 4),) + + +def _posterize_increasing_level_to_arg(level, hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image', + # intensity/severity of augmentation increases with level + return (4 - _posterize_level_to_arg(level, hparams)[0],) + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + # intensity/severity of augmentation decreases with level + return (int((level / _MAX_LEVEL) * 4) + 4,) + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation decreases with level + return (int((level / _MAX_LEVEL) * 256),) + + +def _solarize_increasing_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation increases with level + return (256 - _solarize_level_to_arg(level, _hparams)[0],) + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return (int((level / _MAX_LEVEL) * 110),) + + +LEVEL_TO_ARG = { + "AutoContrast": None, + "Equalize": None, + "Invert": None, + "Rotate": _rotate_level_to_arg, + # There are several variations of the posterize level scaling in various + # Tensorflow/Google repositories/papers + "Posterize": _posterize_level_to_arg, + "PosterizeIncreasing": _posterize_increasing_level_to_arg, + "PosterizeOriginal": _posterize_original_level_to_arg, + "Solarize": _solarize_level_to_arg, + "SolarizeIncreasing": 
_solarize_increasing_level_to_arg, + "SolarizeAdd": _solarize_add_level_to_arg, + "Color": _enhance_level_to_arg, + "ColorIncreasing": _enhance_increasing_level_to_arg, + "Contrast": _enhance_level_to_arg, + "ContrastIncreasing": _enhance_increasing_level_to_arg, + "Brightness": _enhance_level_to_arg, + "BrightnessIncreasing": _enhance_increasing_level_to_arg, + "Sharpness": _enhance_level_to_arg, + "SharpnessIncreasing": _enhance_increasing_level_to_arg, + "ShearX": _shear_level_to_arg, + "ShearY": _shear_level_to_arg, + "TranslateX": _translate_abs_level_to_arg, + "TranslateY": _translate_abs_level_to_arg, + "TranslateXRel": _translate_rel_level_to_arg, + "TranslateYRel": _translate_rel_level_to_arg, +} + + +NAME_TO_OP = { + "AutoContrast": auto_contrast, + "Equalize": equalize, + "Invert": invert, + "Rotate": rotate, + "Posterize": posterize, + "PosterizeIncreasing": posterize, + "PosterizeOriginal": posterize, + "Solarize": solarize, + "SolarizeIncreasing": solarize, + "SolarizeAdd": solarize_add, + "Color": color, + "ColorIncreasing": color, + "Contrast": contrast, + "ContrastIncreasing": contrast, + "Brightness": brightness, + "BrightnessIncreasing": brightness, + "Sharpness": sharpness, + "SharpnessIncreasing": sharpness, + "ShearX": shear_x, + "ShearY": shear_y, + "TranslateX": translate_x_abs, + "TranslateY": translate_y_abs, + "TranslateXRel": translate_x_rel, + "TranslateYRel": translate_y_rel, +} + + +class AugmentOp: + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + self.aug_fn = NAME_TO_OP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = { + "fillcolor": hparams["img_mean"] if "img_mean" in hparams else _FILL, + "resample": hparams["interpolation"] + if "interpolation" in hparams + else _RANDOM_INTERPOLATION, + } + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + self.magnitude_std = self.hparams.get("magnitude_std", 0) + + def __call__(self, img): + if self.prob < 1.0 and random.random() > self.prob: + return img + magnitude = self.magnitude + if self.magnitude_std and self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range + level_args = ( + self.level_fn(magnitude, self.hparams) if self.level_fn is not None else () + ) + return self.aug_fn(img, *level_args, **self.kwargs) + + +def auto_augment_policy_v0(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference. 
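+ # Each sub-policy below is a list of (op_name, probability, magnitude)
+ # tuples on the 0-10 magnitude scale; AutoAugment picks one sub-policy at
+ # random per image and applies its ops in order via AugmentOp.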
+ policy = [ + [("Equalize", 0.8, 1), ("ShearY", 0.8, 4)], + [("Color", 0.4, 9), ("Equalize", 0.6, 3)], + [("Color", 0.4, 1), ("Rotate", 0.6, 8)], + [("Solarize", 0.8, 3), ("Equalize", 0.4, 7)], + [("Solarize", 0.4, 2), ("Solarize", 0.6, 2)], + [("Color", 0.2, 0), ("Equalize", 0.8, 8)], + [("Equalize", 0.4, 8), ("SolarizeAdd", 0.8, 3)], + [("ShearX", 0.2, 9), ("Rotate", 0.6, 8)], + [("Color", 0.6, 1), ("Equalize", 1.0, 2)], + [("Invert", 0.4, 9), ("Rotate", 0.6, 0)], + [("Equalize", 1.0, 9), ("ShearY", 0.6, 3)], + [("Color", 0.4, 7), ("Equalize", 0.6, 0)], + [("Posterize", 0.4, 6), ("AutoContrast", 0.4, 7)], + [("Solarize", 0.6, 8), ("Color", 0.6, 9)], + [("Solarize", 0.2, 4), ("Rotate", 0.8, 9)], + [("Rotate", 1.0, 7), ("TranslateYRel", 0.8, 9)], + [("ShearX", 0.0, 0), ("Solarize", 0.8, 4)], + [("ShearY", 0.8, 0), ("Color", 0.6, 4)], + [("Color", 1.0, 0), ("Rotate", 0.6, 2)], + [("Equalize", 0.8, 4), ("Equalize", 0.0, 8)], + [("Equalize", 1.0, 4), ("AutoContrast", 0.6, 2)], + [("ShearY", 0.4, 7), ("SolarizeAdd", 0.6, 7)], + [ + ("Posterize", 0.8, 2), + ("Solarize", 0.6, 10), + ], # This results in black image with Tpu posterize + [("Solarize", 0.6, 8), ("Equalize", 0.6, 1)], + [("Color", 0.8, 6), ("Rotate", 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_v0r(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, with variation of Posterize used + # in Google research implementation (number of bits discarded increases with magnitude) + policy = [ + [("Equalize", 0.8, 1), ("ShearY", 0.8, 4)], + [("Color", 0.4, 9), ("Equalize", 0.6, 3)], + [("Color", 0.4, 1), ("Rotate", 0.6, 8)], + [("Solarize", 0.8, 3), ("Equalize", 0.4, 7)], + [("Solarize", 0.4, 2), ("Solarize", 0.6, 2)], + [("Color", 0.2, 0), ("Equalize", 0.8, 8)], + [("Equalize", 0.4, 8), ("SolarizeAdd", 0.8, 3)], + [("ShearX", 0.2, 9), ("Rotate", 0.6, 8)], + [("Color", 0.6, 1), ("Equalize", 1.0, 2)], + [("Invert", 0.4, 9), ("Rotate", 0.6, 0)], + [("Equalize", 1.0, 9), ("ShearY", 0.6, 3)], + [("Color", 0.4, 7), ("Equalize", 0.6, 0)], + [("PosterizeIncreasing", 0.4, 6), ("AutoContrast", 0.4, 7)], + [("Solarize", 0.6, 8), ("Color", 0.6, 9)], + [("Solarize", 0.2, 4), ("Rotate", 0.8, 9)], + [("Rotate", 1.0, 7), ("TranslateYRel", 0.8, 9)], + [("ShearX", 0.0, 0), ("Solarize", 0.8, 4)], + [("ShearY", 0.8, 0), ("Color", 0.6, 4)], + [("Color", 1.0, 0), ("Rotate", 0.6, 2)], + [("Equalize", 0.8, 4), ("Equalize", 0.0, 8)], + [("Equalize", 1.0, 4), ("AutoContrast", 0.6, 2)], + [("ShearY", 0.4, 7), ("SolarizeAdd", 0.6, 7)], + [("PosterizeIncreasing", 0.8, 2), ("Solarize", 0.6, 10)], + [("Solarize", 0.6, 8), ("Equalize", 0.6, 1)], + [("Color", 0.8, 6), ("Rotate", 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_original(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 + policy = [ + [("PosterizeOriginal", 0.4, 8), ("Rotate", 0.6, 9)], + [("Solarize", 0.6, 5), ("AutoContrast", 0.6, 5)], + [("Equalize", 0.8, 8), ("Equalize", 0.6, 3)], + [("PosterizeOriginal", 0.6, 7), ("PosterizeOriginal", 0.6, 6)], + [("Equalize", 0.4, 7), ("Solarize", 0.2, 4)], + [("Equalize", 0.4, 4), ("Rotate", 0.8, 8)], + [("Solarize", 0.6, 3), ("Equalize", 0.6, 7)], + [("PosterizeOriginal", 0.8, 5), ("Equalize", 1.0, 2)], + [("Rotate", 0.2, 3), ("Solarize", 0.6, 8)], + [("Equalize", 0.6, 8), ("PosterizeOriginal", 0.4, 6)], + [("Rotate", 0.8, 8), ("Color", 0.4, 0)], + [("Rotate", 0.4, 9), ("Equalize", 0.6, 
2)], + [("Equalize", 0.0, 7), ("Equalize", 0.8, 8)], + [("Invert", 0.6, 4), ("Equalize", 1.0, 8)], + [("Color", 0.6, 4), ("Contrast", 1.0, 8)], + [("Rotate", 0.8, 8), ("Color", 1.0, 2)], + [("Color", 0.8, 8), ("Solarize", 0.8, 7)], + [("Sharpness", 0.4, 7), ("Invert", 0.6, 8)], + [("ShearX", 0.6, 5), ("Equalize", 1.0, 9)], + [("Color", 0.4, 0), ("Equalize", 0.6, 3)], + [("Equalize", 0.4, 7), ("Solarize", 0.2, 4)], + [("Solarize", 0.6, 5), ("AutoContrast", 0.6, 5)], + [("Invert", 0.6, 4), ("Equalize", 1.0, 8)], + [("Color", 0.6, 4), ("Contrast", 1.0, 8)], + [("Equalize", 0.8, 8), ("Equalize", 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_originalr(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation + policy = [ + [("PosterizeIncreasing", 0.4, 8), ("Rotate", 0.6, 9)], + [("Solarize", 0.6, 5), ("AutoContrast", 0.6, 5)], + [("Equalize", 0.8, 8), ("Equalize", 0.6, 3)], + [("PosterizeIncreasing", 0.6, 7), ("PosterizeIncreasing", 0.6, 6)], + [("Equalize", 0.4, 7), ("Solarize", 0.2, 4)], + [("Equalize", 0.4, 4), ("Rotate", 0.8, 8)], + [("Solarize", 0.6, 3), ("Equalize", 0.6, 7)], + [("PosterizeIncreasing", 0.8, 5), ("Equalize", 1.0, 2)], + [("Rotate", 0.2, 3), ("Solarize", 0.6, 8)], + [("Equalize", 0.6, 8), ("PosterizeIncreasing", 0.4, 6)], + [("Rotate", 0.8, 8), ("Color", 0.4, 0)], + [("Rotate", 0.4, 9), ("Equalize", 0.6, 2)], + [("Equalize", 0.0, 7), ("Equalize", 0.8, 8)], + [("Invert", 0.6, 4), ("Equalize", 1.0, 8)], + [("Color", 0.6, 4), ("Contrast", 1.0, 8)], + [("Rotate", 0.8, 8), ("Color", 1.0, 2)], + [("Color", 0.8, 8), ("Solarize", 0.8, 7)], + [("Sharpness", 0.4, 7), ("Invert", 0.6, 8)], + [("ShearX", 0.6, 5), ("Equalize", 1.0, 9)], + [("Color", 0.4, 0), ("Equalize", 0.6, 3)], + [("Equalize", 0.4, 7), ("Solarize", 0.2, 4)], + [("Solarize", 0.6, 5), ("AutoContrast", 0.6, 5)], + [("Invert", 0.6, 4), ("Equalize", 1.0, 8)], + [("Color", 0.6, 4), ("Contrast", 1.0, 8)], + [("Equalize", 0.8, 8), ("Equalize", 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy(name="v0", hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + if name == "original": + return auto_augment_policy_original(hparams) + elif name == "originalr": + return auto_augment_policy_originalr(hparams) + elif name == "v0": + return auto_augment_policy_v0(hparams) + elif name == "v0r": + return auto_augment_policy_v0r(hparams) + else: + assert AssertionError, "Unknown AA policy (%s)" % name + + +def auto_augment_transform(config_str, hparams): + """ + Create a AutoAugment transform + :param config_str: String defining configuration of auto augmentation. + Consists of multiple sections separated by dashes ('-'). The first + section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', + 'originalr'). 
+ The remaining sections, not order sepecific determine + 'mstd' - float std deviation of magnitude noise applied + Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5 + :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme + :return: A PyTorch compatible Transform + """ + config = config_str.split("-") + policy_name = config[0] + config = config[1:] + for c in config: + cs = re.split(r"(\d.*)", c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == "mstd": + # noise param injected via hparams for now + hparams.setdefault("magnitude_std", float(val)) + else: + assert AssertionError, "Unknown AutoAugment config section" + aa_policy = auto_augment_policy(policy_name, hparams=hparams) + return AutoAugment(aa_policy) + + +_RAND_TRANSFORMS = [ + "AutoContrast", + "Equalize", + "Invert", + "Rotate", + "Posterize", + "Solarize", + "SolarizeAdd", + "Color", + "Contrast", + "Brightness", + "Sharpness", + "ShearX", + "ShearY", + "TranslateXRel", + "TranslateYRel", +] + + +_RAND_INCREASING_TRANSFORMS = [ + "AutoContrast", + "Equalize", + "Invert", + "Rotate", + "PosterizeIncreasing", + "SolarizeIncreasing", + "SolarizeAdd", + "ColorIncreasing", + "ContrastIncreasing", + "BrightnessIncreasing", + "SharpnessIncreasing", + "ShearX", + "ShearY", + "TranslateXRel", + "TranslateYRel", +] + + +# These experimental weights are based loosely on the relative improvements mentioned in paper. +# They may not result in increased performance, but could likely be tuned to so. +_RAND_CHOICE_WEIGHTS_0 = { + "Rotate": 0.3, + "ShearX": 0.2, + "ShearY": 0.2, + "TranslateXRel": 0.1, + "TranslateYRel": 0.1, + "Color": 0.025, + "Sharpness": 0.025, + "AutoContrast": 0.025, + "Solarize": 0.005, + "SolarizeAdd": 0.005, + "Contrast": 0.005, + "Brightness": 0.005, + "Equalize": 0.005, + "Posterize": 0, + "Invert": 0, +} + + +def _select_rand_weights(weight_idx=0, transforms=None): + transforms = transforms or _RAND_TRANSFORMS + assert weight_idx == 0 # only one set of weights currently + rand_weights = _RAND_CHOICE_WEIGHTS_0 + probs = [rand_weights[k] for k in transforms] + probs /= np.sum(probs) + return probs + + +def rand_augment_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _RAND_TRANSFORMS + return [ + AugmentOp(name, prob=0.5, magnitude=magnitude, hparams=hparams) + for name in transforms + ] diff --git a/vissl/hooks/__init__.py b/vissl/hooks/__init__.py index be02d7c28..132f8f4ea 100644 --- a/vissl/hooks/__init__.py +++ b/vissl/hooks/__init__.py @@ -5,9 +5,10 @@ from classy_vision.hooks.classy_hook import ClassyHook from vissl.hooks.deepclusterv2_hooks import ClusterMemoryHook, InitMemoryHook # noqa +from vissl.hooks.grad_clip_hooks import GradClipHook # noqa from vissl.hooks.log_hooks import ( # noqa - LogGpuStatsHook, LogGpuMemoryHook, + LogGpuStatsHook, LogLossLrEtaHook, LogLossMetricsCheckpointHook, LogPerfTimeMetricsHook, @@ -109,6 +110,15 @@ def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]: assert is_tensorboard_available(), "Tensorboard must be installed to use it." 
tb_hook = get_tensorboard_hook(cfg) hooks.extend([tb_hook]) + if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP: + hooks.extend( + [ + GradClipHook( + norm_type=cfg.MODEL.GRAD_CLIP.NORM_TYPE, + max_norm=cfg.MODEL.GRAD_CLIP.MAX_NORM, + ) + ] + ) # hooks that are used irrespective of workflow type rolling_btime_freq = ( diff --git a/vissl/hooks/grad_clip_hooks.py b/vissl/hooks/grad_clip_hooks.py new file mode 100644 index 000000000..7ad5926e8 --- /dev/null +++ b/vissl/hooks/grad_clip_hooks.py @@ -0,0 +1,37 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from typing import Union + +import torch.nn.utils as utils +from classy_vision import tasks +from classy_vision.hooks.classy_hook import ClassyHook + + +class GradClipHook(ClassyHook): + """ + Hook executed on a backward pass that clips gradients such that their + norm does not exceed a specific value. Dosovitskiy et al. found it + to be critical for training vision transformers + (https://arxiv.org/abs/2010.11929), but subsequent studies have been less + clear about its importance. Gradient clipping configuration is set in + config.MODEL.GRAD_CLIP + """ + + on_start = ClassyHook._noop + on_phase_start = ClassyHook._noop + on_forward = ClassyHook._noop + on_loss_and_meter = ClassyHook._noop + on_update = ClassyHook._noop + on_step = ClassyHook._noop + on_phase_end = ClassyHook._noop + on_end = ClassyHook._noop + + def __init__(self, norm_type: Union[int, float, str], max_norm: Union[int, float]): + super().__init__() + self.norm_type = norm_type + self.max_norm = max_norm + + def on_backward(self, task: tasks.ClassyTask) -> None: + utils.clip_grad_norm_( + task.model.parameters(), max_norm=self.max_norm, norm_type=self.norm_type + ) diff --git a/vissl/hooks/log_hooks.py b/vissl/hooks/log_hooks.py index 835019539..8ac63adbd 100644 --- a/vissl/hooks/log_hooks.py +++ b/vissl/hooks/log_hooks.py @@ -34,10 +34,7 @@ class LogGpuMemoryHook(ClassyHook): on_phase_end = ClassyHook._noop on_end = ClassyHook._noop - def __init__( - self, - log_iteration_num: int = 1, - ) -> None: + def __init__(self, log_iteration_num: int = 1) -> None: super().__init__() self.log_iteration_num = log_iteration_num diff --git a/vissl/hooks/tensorboard_hook.py b/vissl/hooks/tensorboard_hook.py index f9dc7a5f1..825c60395 100644 --- a/vissl/hooks/tensorboard_hook.py +++ b/vissl/hooks/tensorboard_hook.py @@ -112,6 +112,19 @@ def on_phase_end(self, task: "tasks.ClassyTask") -> None: Log model parameters and/or parameter gradients as set by user in the tensorboard configuration. Also resents the CUDA memory counter. """ + # Log train/test accuracy + if is_primary(): + phase_type = "Training" if task.train else "Testing" + for meter in task.meters: + if "accuracy" in meter.name: + for top_n, accuracies in meter.value.items(): + for i, acc in accuracies.items(): + tag_name = f"{phase_type}/Accuracy_" f" {top_n}_Output_{i}" + self.tb_writer.add_scalar( + tag=tag_name, + scalar_value=round(acc, 5), + global_step=task.train_phase_idx, + ) if not (self.log_params or self.log_params_gradients): return diff --git a/vissl/losses/cross_entropy_multiple_output_single_target.py b/vissl/losses/cross_entropy_multiple_output_single_target.py index 0e931a18e..ef4607f75 100644 --- a/vissl/losses/cross_entropy_multiple_output_single_target.py +++ b/vissl/losses/cross_entropy_multiple_output_single_target.py @@ -1,21 +1,40 @@ # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging from typing import List, Union import torch +import torch.nn.functional as F from classy_vision.generic.util import is_on_gpu from classy_vision.losses import ClassyLoss, register_loss -from torch import nn +from torch import Tensor, nn from vissl.utils.hydra_config import AttrDict +class SmoothCrossEntropy(torch.nn.modules.CrossEntropyLoss): + """ + Cross entropy loss that can accommodate smoothed labels + """ + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + if len(target.shape) > 1: + log_probs = F.log_softmax(input, 1) + # TODO: Implement weight and ignore_index + return -torch.mean(torch.sum(log_probs * target, dim=1)) + else: + return F.cross_entropy( + input, target, weight=self.weight, ignore_index=self.ignore_index + ) + + @register_loss("cross_entropy_multiple_output_single_target") class CrossEntropyMultipleOutputSingleTargetLoss(ClassyLoss): """ Intializer for the sum cross-entropy loss. For a single tensor, this is equivalent to the cross-entropy loss. For a list of tensors, this computes the sum of the cross-entropy - losses for each tensor in the list against the target. + losses for each tensor in the list against the target. Can accommodate + target vectors, e.g. smoothed labels. Config params: weight: weight of sample, optional @@ -53,10 +72,15 @@ def from_config(cls, loss_config: AttrDict): def _create_loss_function(self): copy_to_gpu = is_on_gpu(self._losses) + logging.info( + "Instantiating " + "CrossEntropyMultipleOutputSingleTargetLoss, which" + "internally uses SmoothCrossEntropy loss to accommodate" + "label smoothing, but defaults to vanilla cross-entropy " + "if provided single-target labels." + ) self._losses.append( - torch.nn.modules.CrossEntropyLoss( - weight=self._weight, ignore_index=self._ignore_index - ) + SmoothCrossEntropy(weight=self._weight, ignore_index=self._ignore_index) ) if copy_to_gpu: self._losses.cuda() diff --git a/vissl/losses/moco_loss.py b/vissl/losses/moco_loss.py index 5a438b163..758e32cf5 100644 --- a/vissl/losses/moco_loss.py +++ b/vissl/losses/moco_loss.py @@ -88,9 +88,11 @@ def _dequeue_and_enqueue(self, key: torch.Tensor): # for simplicity, removes the case where the batch overlaps with the end # of the queue - assert ( - self.loss_config.queue_size % batch_size == 0 - ), "The queue size needs to be a multiple of the batch size" + assert self.loss_config.queue_size % batch_size == 0, ( + f"The queue size needs to be a multiple of the batch size. " + f"Effective batch size: {batch_size}. Queue size:" + f" {self.loss_config.queue_size}." 
+ ) # replace the keys at ptr (dequeue and enqueue) ptr = int(self.queue_ptr) diff --git a/vissl/meters/mean_ap_list_meter.py b/vissl/meters/mean_ap_list_meter.py index 624535a6b..3b87aeac7 100644 --- a/vissl/meters/mean_ap_list_meter.py +++ b/vissl/meters/mean_ap_list_meter.py @@ -60,10 +60,7 @@ def value(self): for ind, meter in enumerate(self._meters): meter_val = meter.value sample_count = meter._scores.shape[0] - val_dict[ind] = { - "val": meter_val, - "sample_count": sample_count, - } + val_dict[ind] = {"val": meter_val, "sample_count": sample_count} output_dict = {} output_dict["mAP"] = {} output_dict["AP"] = {} @@ -98,9 +95,7 @@ def get_classy_state(self): meter_states = {} for ind, meter in enumerate(self._meters): state = meter.get_classy_state() - meter_states[ind] = { - "state": state, - } + meter_states[ind] = {"state": state} return meter_states def set_classy_state(self, state): diff --git a/vissl/models/heads/vision_transformer_head.py b/vissl/models/heads/vision_transformer_head.py new file mode 100644 index 000000000..c27e26215 --- /dev/null +++ b/vissl/models/heads/vision_transformer_head.py @@ -0,0 +1,71 @@ +# (c) Facebook, Inc. and its affiliates. Confidential and proprietary. + +""" +Code modified from https://github.com/google-research/vision_transformer +as per https://arxiv.org/abs/2010.11929 +""" + +import copy +from collections import OrderedDict + +import torch.nn as nn +from vissl.models.heads import register_model_head +from vissl.models.model_helpers import lecun_normal_init, trunc_normal_ +from vissl.utils.hydra_config import AttrDict + + +@register_model_head("vision_transformer_head") +class VisionTransformerHead(nn.Module): + """ + Code modified from https://github.com/google-research/vision_transformer + and https://www.internalfb.com/D24714842, as per https://arxiv.org/abs/2010.11929 + + Authors use a 2-layer MLP for pretraining and a single linear layer for + fine-tuning. Thus a pre-training head would be called with something like + ["vision_transformer_head", {"in_plane": D, "hidden_dim": D, + "num_classes": K}], where D = hidden dimensionality and K = number of + classes. A fine-tuning head would be called ["vision_transformer_head", + {"in_plane", D, "num_classes": K]. Not passing "hidden_dim" will result + in a single linear layer. + + """ + + def __init__(self, model_config: AttrDict, in_plane, num_classes, hidden_dim=None): + super().__init__() + if hidden_dim is None: + layers = [("head", nn.Linear(in_plane, num_classes))] + else: + layers = [ + ("pre_logits", nn.Linear(in_plane, hidden_dim)), + ("act", nn.Tanh()), + ("head", nn.Linear(hidden_dim, num_classes)), + ] + self.layers = nn.Sequential(OrderedDict(layers)) + self.init_weights() + + def init_weights(self): + if hasattr(self.layers, "pre_logits"): + lecun_normal_init( + self.layers.pre_logits.weight, fan_in=self.layers.pre_logits.in_features + ) + nn.init.zeros_(self.layers.pre_logits.bias) + trunc_normal_(self.layers.head.weight, std=0.02) + nn.init.zeros_(self.layers.head.bias) + + @classmethod + def from_config(cls, config): + """ + config is config.MODEL.HEAD.PARAMS, which is a list of the form: + [ + ["vision_transformer_head", {"in_plane": _, "hidden_dim": _, "num_classes": _}] + ] + Where in_plane is the input dimensionality to the head, hidden_dim is + the hidden layer width (omit if no hidden layer is desired), + and num_classes is the output dimensionality. 
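Note: the head builds one of two layouts depending on whether hidden_dim is given: a single linear classifier for fine-tuning, or pre_logits + Tanh + head for pre-training. A standalone sketch of the same construction, without the VISSL registry plumbing; the dimensions are illustrative:

from collections import OrderedDict
import torch
import torch.nn as nn

def build_head(in_plane, num_classes, hidden_dim=None):
    # Mirrors VisionTransformerHead.__init__.
    if hidden_dim is None:  # fine-tuning: single linear classifier
        layers = [("head", nn.Linear(in_plane, num_classes))]
    else:  # pre-training: pre_logits -> tanh -> head
        layers = [
            ("pre_logits", nn.Linear(in_plane, hidden_dim)),
            ("act", nn.Tanh()),
            ("head", nn.Linear(hidden_dim, num_classes)),
        ]
    return nn.Sequential(OrderedDict(layers))

finetune_head = build_head(in_plane=768, num_classes=1000)
pretrain_head = build_head(in_plane=768, num_classes=1000, hidden_dim=768)
assert finetune_head(torch.randn(2, 768)).shape == (2, 1000)
assert pretrain_head(torch.randn(2, 768)).shape == (2, 1000)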
+ """ + config = copy.deepcopy(config) + config.pop("unique_id") + return cls(**config) + + def forward(self, x): + return self.layers(x) diff --git a/vissl/models/model_helpers.py b/vissl/models/model_helpers.py index 265ecadd5..bb4445a79 100644 --- a/vissl/models/model_helpers.py +++ b/vissl/models/model_helpers.py @@ -1,11 +1,15 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved import logging +import math +import warnings from enum import Enum from typing import Dict, List, Tuple import torch import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.utils import _ntuple from torch.utils.checkpoint import checkpoint from vissl.utils.activation_checkpointing import checkpoint_trunk from vissl.utils.misc import is_apex_available @@ -244,15 +248,23 @@ class RESNET_NORM_LAYER(str, Enum): BatchNorm = "BatchNorm" LayerNorm = "LayerNorm" + GroupNorm = "GroupNorm" -def _get_norm(layer_name): +def _get_norm(trunk_config): """ return the normalization layer to use in the model based on the layer name """ + layer_name = trunk_config.NORM + n_groups = trunk_config.GROUPNORM_GROUPS + + def group_norm(num_channels): + return nn.GroupNorm(num_groups=n_groups, num_channels=num_channels) + return { RESNET_NORM_LAYER.BatchNorm: nn.BatchNorm2d, RESNET_NORM_LAYER.LayerNorm: LayerNorm2d, + RESNET_NORM_LAYER.GroupNorm: group_norm, }[layer_name] @@ -398,3 +410,154 @@ def get_trunk_forward_outputs( output_feats.append(unique_out_feats[key_name]) return output_feats + + +def lecun_normal_init(tensor, fan_in): + trunc_normal_(tensor, std=math.sqrt(1 / fan_in)) + + +# Contains code from https://github.com/rwightman/pytorch-image-models +# and https://github.com/facebookresearch/deit/blob/main/models.py, modified by +# Matthew # Leavitt (ito@fb.com, matthew.l.leavitt@gmail.com) and Vedanuj +# Goswami (vedanuj@fb.com). +# trunc_normal_ and _no_grad_trunc_normal_ from: +# https://github.com/rwightman/pytorch-image-models/blob/678ba4e0a2c0b52c5e7b2ec0ba689399840282ee/timm/models/layers/weight_init.py # NOQA +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + r"""Supposedly should be available in PyTorch soon. Replace when available. + Fills the input Tensor with values drawn + from a truncated normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. 
" + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +# Contains code from https://github.com/rwightman/pytorch-image-models +# and https://github.com/facebookresearch/deit/blob/main/models.py, modified by +# Matthew # Leavitt (ito@fb.com, matthew.l.leavitt@gmail.com) and Vedanuj +# Goswami (vedanuj@fb.com). +# Standardized convolution (Conv2d with Weight Standardization), as used in +# the paper, Big Transfer (BiT): General Visual Representation Learning - +# https://arxiv.org/abs/1912.11370 +class StandardizedConv2d(nn.Conv2d): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + ): + super(StandardizedConv2d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + ) + + def forward(self, x): + weight = self.weight + weight_mean = ( + weight.mean(dim=1, keepdim=True) + .mean(dim=2, keepdim=True) + .mean(dim=3, keepdim=True) + ) + weight = weight - weight_mean + std = weight.view(weight.size(0), -1).std(dim=1).view(-1, 1, 1, 1) + 1e-5 + weight = weight / std.expand_as(weight) + return F.conv2d( + x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups + ) + + +# drop_path and DropPath modified from +# https://github.com/facebookresearch/deit/blob/main/models.py +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). 
+ """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + # work with diff dim tensors, not just 2D ConvNets + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path + of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple diff --git a/vissl/models/trunks/alexnet_colorization.py b/vissl/models/trunks/alexnet_colorization.py index 0738e33fe..40376a83b 100644 --- a/vissl/models/trunks/alexnet_colorization.py +++ b/vissl/models/trunks/alexnet_colorization.py @@ -2,10 +2,7 @@ import torch import torch.nn as nn -from vissl.models.model_helpers import ( - Flatten, - get_trunk_forward_outputs_module_list, -) +from vissl.models.model_helpers import Flatten, get_trunk_forward_outputs_module_list from vissl.models.trunks import register_model_trunk from vissl.utils.hydra_config import AttrDict @@ -87,9 +84,6 @@ def forward(self, x, out_feat_keys=None): # along the channel dimension into [L, AB] and keep only L channel. feat = torch.split(feat, [1, 2], dim=1)[0] out_feats = get_trunk_forward_outputs_module_list( - feat, - out_feat_keys, - self._feature_blocks, - self.all_feat_names, + feat, out_feat_keys, self._feature_blocks, self.all_feat_names ) return out_feats diff --git a/vissl/models/trunks/alexnet_deepcluster.py b/vissl/models/trunks/alexnet_deepcluster.py index b73e9b723..dff97ad18 100644 --- a/vissl/models/trunks/alexnet_deepcluster.py +++ b/vissl/models/trunks/alexnet_deepcluster.py @@ -101,9 +101,6 @@ def forward(self, x, out_feat_keys=None): # we first apply sobel filter feat = self.sobel(feat) out_feats = get_trunk_forward_outputs_module_list( - feat, - out_feat_keys, - self._feature_blocks, - self.all_feat_names, + feat, out_feat_keys, self._feature_blocks, self.all_feat_names ) return out_feats diff --git a/vissl/models/trunks/alexnet_jigsaw.py b/vissl/models/trunks/alexnet_jigsaw.py index 7c07fc614..ff329628b 100644 --- a/vissl/models/trunks/alexnet_jigsaw.py +++ b/vissl/models/trunks/alexnet_jigsaw.py @@ -80,9 +80,6 @@ def __init__(self, model_config: AttrDict, model_name: str): def forward(self, x, out_feat_keys=None): feat = x out_feats = get_trunk_forward_outputs_module_list( - feat, - out_feat_keys, - self._feature_blocks, - self.all_feat_names, + feat, out_feat_keys, self._feature_blocks, self.all_feat_names ) return out_feats diff --git a/vissl/models/trunks/alexnet_rotnet.py b/vissl/models/trunks/alexnet_rotnet.py index 48b644cf1..62a9daf3d 100644 --- a/vissl/models/trunks/alexnet_rotnet.py +++ b/vissl/models/trunks/alexnet_rotnet.py @@ -63,9 +63,6 @@ def __init__(self, model_config: AttrDict, model_name: str): def forward(self, x, out_feat_keys=None): feat = x out_feats = get_trunk_forward_outputs_module_list( - feat, - out_feat_keys, - self._feature_blocks, - self.all_feat_names, + feat, out_feat_keys, self._feature_blocks, self.all_feat_names ) return out_feats diff --git a/vissl/models/trunks/resnext.py b/vissl/models/trunks/resnext.py index 60e0ccb73..1f260c4ee 100644 --- a/vissl/models/trunks/resnext.py +++ 
b/vissl/models/trunks/resnext.py @@ -67,7 +67,7 @@ def __init__(self, model_config: AttrDict, model_name: str): self.trunk_config = self.model_config.TRUNK.TRUNK_PARAMS.RESNETS self.depth = SUPPORTED_DEPTHS(self.trunk_config.DEPTH) self.width_multiplier = self.trunk_config.WIDTH_MULTIPLIER - self._norm_layer = _get_norm(self.trunk_config.NORM) + self._norm_layer = _get_norm(self.trunk_config) self.groups = self.trunk_config.GROUPS self.zero_init_residual = self.trunk_config.ZERO_INIT_RESIDUAL self.width_per_group = self.trunk_config.WIDTH_PER_GROUP diff --git a/vissl/models/trunks/vision_transformer.py b/vissl/models/trunks/vision_transformer.py new file mode 100644 index 000000000..d869ddaba --- /dev/null +++ b/vissl/models/trunks/vision_transformer.py @@ -0,0 +1,306 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +""" +Code modified from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py # NOQA +and https://github.com/facebookresearch/deit/blob/main/models.py by Matthew +Leavitt (ito@fb.com, matthew.l.leavitt@gmail.com) and Vedanuj Goswami +(vedanuj@fb.com). +""" + +import copy +import logging +import math +from typing import List + +import torch +import torch.nn as nn +from vissl.models.model_helpers import DropPath, to_2tuple, trunc_normal_ +from vissl.models.trunks import register_model_trunk +from vissl.utils.hydra_config import AttrDict + + +class Mlp(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, + # can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = ( + qkv[0], + qkv[1], + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + if drop_path > 0.0: + self.drop_path = DropPath(drop_path) + else: + self.drop_path = nn.Identity() + 
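Note: a shape walk-through of the Attention module above: qkv comes from one linear layer, is reshaped to (3, B, heads, N, head_dim), and scaled dot-product attention runs per head. The sizes below correspond to a ViT-B/16 at 224x224 (196 patch tokens plus one class token) and are illustrative:

import torch
import torch.nn as nn

B, N, C, num_heads = 2, 197, 768, 12
head_dim = C // num_heads
scale = head_dim ** -0.5

qkv_layer = nn.Linear(C, C * 3, bias=False)
x = torch.randn(B, N, C)

qkv = qkv_layer(x).reshape(B, N, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]          # each (B, num_heads, N, head_dim)
attn = ((q @ k.transpose(-2, -1)) * scale).softmax(dim=-1)  # (B, num_heads, N, N)
out = (attn @ v).transpose(1, 2).reshape(B, N, C)
assert out.shape == (B, N, C)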
self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding""" + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) + + def forward(self, x): + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +@register_model_trunk("vision_transformer") +class VisionTransformer(nn.Module): + """ + Vision transformer. Adding stochastic depth makes it a DeiT. + """ + + def __init__(self, model_config: AttrDict, model_name: str): + super().__init__() + + assert model_config.INPUT_TYPE in ["rgb", "bgr"], "Input type not supported" + trunk_config = copy.deepcopy( + model_config.TRUNK.TRUNK_PARAMS.VISION_TRANSFORMERS + ) + + logging.info("Building model: Vision Transformer from yaml config") + # Hacky workaround + trunk_config = AttrDict({k.lower(): v for k, v in trunk_config.items()}) + + img_size = trunk_config.image_size + patch_size = trunk_config.patch_size + in_chans = 3 + embed_dim = trunk_config.hidden_dim + depth = trunk_config.num_layers + num_heads = trunk_config.num_heads + mlp_ratio = 4.0 + qkv_bias = trunk_config.qkv_bias + qk_scale = trunk_config.qk_scale + drop_rate = trunk_config.dropout_rate + attn_drop_rate = trunk_config.attention_dropout_rate + drop_path_rate = trunk_config.drop_path_rate + hybrid_backbone_string = None + # TODO Implement hybrid backbones + if "HYBRID" in trunk_config.keys(): + hybrid_backbone_string = trunk_config.HYBRID + norm_layer = nn.LayerNorm + + self.num_features = ( + self.embed_dim + ) = embed_dim # num_features for consistency with other models + + # TODO : Enable Hybrid Backbones + if hybrid_backbone_string: + self.patch_embed = globals()[hybrid_backbone_string]( + out_dim=embed_dim, img_size=img_size + ) + # if hybrid_backbone is not None: + # self.patch_embed = HybridEmbed( + # hybrid_backbone, + # img_size=img_size, + # in_chans=in_chans, + # embed_dim=embed_dim, + # ) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + num_patches = self.patch_embed.num_patches + + self.class_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embedding = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.ModuleList( + [ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + self.norm = norm_layer(embed_dim) + + # NOTE as per official impl, we could have a pre-logits + # representation dense layer + tanh here + # self.repr = nn.Linear(embed_dim, representation_size) + # self.repr_act = nn.Tanh() + + 
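Note: bookkeeping for the trunk constructor above, using a ViT-B/16 geometry at 384x384 as an example: PatchEmbed yields a 24x24 grid of tokens, the position embedding reserves one extra slot for the class token, and the stochastic depth rates grow linearly with block index. The drop_path_rate value is illustrative:

import torch

image_size, patch_size, hidden_dim, depth = 384, 16, 768, 12
num_patches = (image_size // patch_size) ** 2
assert num_patches == 576

pos_embedding = torch.zeros(1, num_patches + 1, hidden_dim)
assert pos_embedding.shape == (1, 577, 768)  # 576 patch tokens + 1 class token

drop_path_rate = 0.1  # illustrative value
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
assert len(dpr) == depth and dpr[0] == 0.0 and abs(dpr[-1] - drop_path_rate) < 1e-6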
trunc_normal_(self.pos_embedding, std=0.02) + trunc_normal_(self.class_token, std=0.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {"pos_embedding", "class_token"} + + def forward_features(self, x): + B = x.shape[0] + x = self.patch_embed(x) + + class_tokens = self.class_token.expand( + B, -1, -1 + ) # stole class_tokens impl from Phil Wang, thanks + x = torch.cat((class_tokens, x), dim=1) + pos_embed = self.interpolate_pos_encoding(x, self.pos_embedding) + x = x + pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0] + + def forward( + self, x: torch.Tensor, out_feat_keys: List[str] = None + ) -> List[torch.Tensor]: + x = self.forward_features(x) + x = x.unsqueeze(0) + return x + + def interpolate_pos_encoding(self, x, pos_embed): + npatch = x.shape[1] - 1 + N = pos_embed.shape[1] - 1 + if npatch == N: + return pos_embed + class_emb = pos_embed[:, 0] + pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + pos_embed = nn.functional.interpolate( + pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute( + 0, 3, 1, 2 + ), + scale_factor=math.sqrt(npatch / N), + mode="bicubic", + ) + pos_embed = pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_emb.unsqueeze(0), pos_embed), dim=1) diff --git a/vissl/optimizers/optimizer_helper.py b/vissl/optimizers/optimizer_helper.py index 3d8aa708e..732d375e4 100644 --- a/vissl/optimizers/optimizer_helper.py +++ b/vissl/optimizers/optimizer_helper.py @@ -52,6 +52,53 @@ def _filter_trainable(param_list: List[Any]) -> List[Any]: return list(filter(lambda x: x.requires_grad, param_list)) +def _assign_regularized_params( + regularized_param_list=None, + unregularized_param_list=None, + parameters_to_unregularize=None, +): + """ + Takes a list parameters_to_unregularize (a list of parameters to ensure are + not regularized) and compares it to regularized_param_list, a list of + regularized parameters. Any parameters in parameters_to_unregularize that + are present in regularized_param_list are removed from + regularized_param_list. Will also check against an optional + unregularized_param_list (pre-existing list of parameters not to regularize) + and remove any items from parameters_to_unregularize that are in + unregularized_param_list. Used for when we have parameters that we don't + want to regularize (e.g. the class token and position embeddings for the + vision transformer). See config.OPTIMIZER.non_regularized_params. Needs + to be called separately for head, trunk, and remaining params. 
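Note: interpolate_pos_encoding keeps the class-token slot and resizes the patch-grid embeddings with bicubic interpolation whenever the token count changes between pre-training and fine-tuning. A standalone sketch of the same reshaping, using an explicit target size instead of a scale factor; the grid sizes are illustrative (e.g. 224/16 to 384/16 patches per side):

import torch
import torch.nn as nn

dim, old_grid, new_grid = 768, 14, 24
pos_embed = torch.randn(1, old_grid * old_grid + 1, dim)

class_emb, patch_pos = pos_embed[:, :1], pos_embed[:, 1:]
patch_pos = nn.functional.interpolate(
    patch_pos.reshape(1, old_grid, old_grid, dim).permute(0, 3, 1, 2),
    size=(new_grid, new_grid),
    mode="bicubic",
    align_corners=False,
)
patch_pos = patch_pos.permute(0, 2, 3, 1).reshape(1, -1, dim)
new_pos_embed = torch.cat((class_emb, patch_pos), dim=1)
assert new_pos_embed.shape == (1, new_grid * new_grid + 1, dim)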
+ """ + indices_to_remove_from_regularized = [] + indices_to_remove_from_new_unregularized = [] + # Iterate through new parameters to unregularize + for unreg_param_ind, new_unreg_param in enumerate(parameters_to_unregularize): + # Iterate through list of regularized parameters + for reg_param_ind, reg_param in enumerate(regularized_param_list): + # Note any matchess + if reg_param is new_unreg_param: + indices_to_remove_from_regularized.append(reg_param_ind) + if unregularized_param_list: + # Iterate through pre-existing list of unregularized parameters + for unreg_param in unregularized_param_list: + # Note any matches + if unreg_param is new_unreg_param: + indices_to_remove_from_new_unregularized.append(unreg_param_ind) + indices_to_remove_from_regularized.sort(reverse=True) + # Iterate through indices to remove from list regularized params and + # remove them + for i in indices_to_remove_from_regularized: + del regularized_param_list[i] + if unregularized_param_list: + indices_to_remove_from_new_unregularized.sort(reverse=True) + # Iterate through indices to remove from new list of unregularized + # parameters + for i in indices_to_remove_from_new_unregularized: + del parameters_to_unregularize[i] + return parameters_to_unregularize, regularized_param_list, unregularized_param_list + + def get_optimizer_param_groups( model, model_config, optimizer_config, optimizer_schedulers ): @@ -91,6 +138,7 @@ def get_optimizer_param_groups( head_regularized_params, head_unregularized_params = [], [] # for anything else regularized_params = [] + unregularized_params = [] for name, module in model.named_modules(): # head, Linear/Conv layer if "head" in name and ( @@ -140,6 +188,41 @@ def get_optimizer_param_groups( for params in module.parameters(recurse=False): regularized_params.append(params) + # Collect user-specified non-regularized params and remove them for the + # lists of regularized params, and check they're not already on the lists + # of unregularized params + if optimizer_config.non_regularized_parameters: + non_reg_param_names = optimizer_config.non_regularized_parameters + for name, param in model.named_parameters(): + hits = [p for p in non_reg_param_names if p in name] + if any(hits): + unregularized_params.append(param) + # Call for trunk params + ( + non_reg_params, + trunk_regularized_params, + trunk_unregularized_params, + ) = _assign_regularized_params( + parameters_to_unregularize=unregularized_params, + regularized_param_list=trunk_regularized_params, + unregularized_param_list=trunk_unregularized_params, + ) + # Call for head params + ( + non_reg_params, + head_regularized_params, + head_unregularized_params, + ) = _assign_regularized_params( + parameters_to_unregularize=unregularized_params, + regularized_param_list=head_regularized_params, + unregularized_param_list=head_unregularized_params, + ) + # Call for remaining params + non_reg_params, regularized_params, _ = _assign_regularized_params( + parameters_to_unregularize=unregularized_params, + regularized_param_list=regularized_params, + ) + # for non-trainable params, set the requires_grad to False non_trainable_params = [] for name, param in model.named_parameters(): @@ -160,7 +243,8 @@ def get_optimizer_param_groups( f"Trunk Unregularized Parameters {len(trunk_unregularized_params)}, \n" f"Head Regularized Parameters: {len(head_regularized_params)}, \n" f"Head Unregularized Parameters: {len(head_unregularized_params)} \n" - f"Remaining Regularized Parameters: {len(regularized_params)} " + f"Remaining Regularized 
Parameters: {len(regularized_params)} \n" + f"Remaining Unregularized Parameters: {len(unregularized_params)}" ) param_groups = [ @@ -189,5 +273,13 @@ def get_optimizer_param_groups( param_groups.append( {"params": regularized_params, "lr": optimizer_schedulers["lr"]} ) + if len(unregularized_params) > 0: + param_groups.append( + { + "params": unregularized_params, + "lr": optimizer_schedulers["lr"], + "weight_decay": 0.0, + } + ) return param_groups diff --git a/vissl/trainer/train_task.py b/vissl/trainer/train_task.py index c53a6f281..5a7eaad2a 100644 --- a/vissl/trainer/train_task.py +++ b/vissl/trainer/train_task.py @@ -4,10 +4,7 @@ import logging import torch -from classy_vision.generic.util import ( - copy_model_to_gpu, - load_and_broadcast_checkpoint, -) +from classy_vision.generic.util import copy_model_to_gpu, load_and_broadcast_checkpoint from classy_vision.losses import build_loss from classy_vision.meters import build_meter from classy_vision.optim import build_optimizer, build_optimizer_schedulers diff --git a/vissl/utils/activation_checkpointing.py b/vissl/utils/activation_checkpointing.py index 0ee636f95..d512418f5 100644 --- a/vissl/utils/activation_checkpointing.py +++ b/vissl/utils/activation_checkpointing.py @@ -131,12 +131,7 @@ def checkpoint_trunk( feature_blocks_bucketed = ( feature_blocks_bucketed[:i_max] - + [ - [ - f"activation_split_{split_times}", - biggest_block[1][:n_split_layers], - ] - ] + + [[f"activation_split_{split_times}", biggest_block[1][:n_split_layers]]] + [[biggest_block[0], biggest_block[1][n_split_layers:]]] + feature_blocks_bucketed[(i_max + 1) :] ) diff --git a/vissl/utils/checkpoint.py b/vissl/utils/checkpoint.py index f2f17ab1d..8427d1bc9 100644 --- a/vissl/utils/checkpoint.py +++ b/vissl/utils/checkpoint.py @@ -445,6 +445,13 @@ def init_model_from_weights( and config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_TRUNK_AND_HEAD ) ): + # Accommodate changing position embeddings. Fine-tuning at a + # different resolution than that which a model was pretrained + # at requires interpolating the learned position embeddings. + if "pos_embedding" in layername: + param = interpolate_position_embeddings( + model, all_layers[layername], param + ) assert all_layers[layername].shape == param.shape, ( f"{layername} have different shapes: " f"checkpoint: {param.shape}, model: {all_layers[layername].shape}" @@ -472,3 +479,21 @@ def init_model_from_weights( ####################### DEBUG ############################ # print_state_dict_shapes(model.state_dict()) return model + + +def interpolate_position_embeddings(model, layer, param): + """ + Fine-tuning at a different resolution than that which a model was + pretrained at requires interpolating the learned position embeddings. 
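Note: init_model_from_weights only rewrites a checkpoint's position embeddings when the trunk exposes a callable interpolate_position_embedding and the shapes actually disagree. A toy sketch of that guard; ToyTrunk and its linear resizing are stand-ins for the real trunk and its bicubic patch-grid interpolation:

import torch
import torch.nn as nn

class ToyTrunk(nn.Module):
    def __init__(self, num_tokens, dim=8):
        super().__init__()
        self.pos_embedding = nn.Parameter(torch.zeros(1, num_tokens, dim))

    def interpolate_position_embedding(self, param):
        # Stand-in for the real patch-grid bicubic interpolation.
        return nn.functional.interpolate(
            param.permute(0, 2, 1),
            size=self.pos_embedding.shape[1],
            mode="linear",
            align_corners=False,
        ).permute(0, 2, 1)

trunk = ToyTrunk(num_tokens=577)            # e.g. fine-tuning at 384x384
checkpoint_param = torch.zeros(1, 197, 8)   # e.g. pre-trained at 224x224

interp = getattr(trunk, "interpolate_position_embedding", None)
if callable(interp) and trunk.pos_embedding.shape != checkpoint_param.shape:
    checkpoint_param = interp(checkpoint_param)
assert checkpoint_param.shape == trunk.pos_embedding.shape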
+ """ + if ( + hasattr(model.trunk, "interpolate_position_embedding") + and layer.shape != param.shape + ): + interp = model.trunk.interpolate_position_embedding + if callable(interp): + try: + param = interp(param) + except BaseException: + raise RuntimeError("Unable to interpolate position embeddings") + return param diff --git a/vissl/utils/hydra_config.py b/vissl/utils/hydra_config.py index 06e3e1e52..743702603 100644 --- a/vissl/utils/hydra_config.py +++ b/vissl/utils/hydra_config.py @@ -381,6 +381,7 @@ def infer_losses_config(cfg): assert cfg.DATA.TRAIN.COLLATE_FUNCTION in [ "multicrop_collator", "multicrop_mixup_collator", + "cutmixup_collator", ], ( "for swav loss, use either a collator from " "[multicrop_collator, multicrop_mixup_collator]" @@ -480,10 +481,13 @@ def assert_hydra_conf(cfg): # in SSL, during pre-training we don't want to use annotated labels or during feature # extraction, we don't have annotated labels for some datasets. In such cases, we set - # the label type to be just the image index in the dataset. - if len(cfg.DATA.TRAIN.LABEL_SOURCES) == 0: + # the label type to be just the image index in the dataset, unless the + # user has specifically provided "zero" as the label type, which is + # necessary when the CutMixUp collator is being used for self-supervised + # training. + if len(cfg.DATA.TRAIN.LABEL_SOURCES) == 0 and cfg.DATA.TRAIN.LABEL_TYPE != "zero": cfg.DATA.TRAIN.LABEL_TYPE = "sample_index" - if len(cfg.DATA.TEST.LABEL_SOURCES) == 0: + if len(cfg.DATA.TEST.LABEL_SOURCES) == 0 and cfg.DATA.TEST.LABEL_TYPE != "zero": cfg.DATA.TEST.LABEL_TYPE = "sample_index" # if the user has specified the model initialization from a params_file, we check if