diff --git a/.circleci/config.yml b/.circleci/config.yml index 78e654716..b599cacb7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -94,7 +94,7 @@ install_vissl_dep: &install_vissl_dep name: Install Dependencies working_directory: ~/vissl command: | - pip install --progress-bar off torch==1.5.0 torchvision==0.6.0 opencv-python==3.4.2.17 + pip install --progress-bar off torch==1.7.1 torchvision==0.8.2 opencv-python==3.4.2.17 pip install --progress-bar off -r requirements.txt install_apex_gpu: &install_apex_gpu diff --git a/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k.yaml b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k.yaml new file mode 100644 index 000000000..324122a28 --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k.yaml @@ -0,0 +1,13 @@ +# @package _global_ +config: + DATA: + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + OPTIMIZER: + num_epochs: 30 diff --git a/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_10percent.yaml b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_10percent.yaml new file mode 100644 index 000000000..6b361377f --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_10percent.yaml @@ -0,0 +1,13 @@ +# @package _global_ +config: + DATA: + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [google-imagenet1k-per10] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [google-imagenet1k-per10] + OPTIMIZER: + num_epochs: 30 diff --git a/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_1percent.yaml b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_1percent.yaml new file mode 100644 index 000000000..0138c7b01 --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/datasets/imagenet_1k_1percent.yaml @@ -0,0 +1,13 @@ +# @package _global_ +config: + DATA: + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [google-imagenet1k-per01] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [google-imagenet1k-per01] + OPTIMIZER: + num_epochs: 60 diff --git a/configs/config/benchmark/imagenet1k_fulltune/eval_vit_8gpu_transfer_in1k_finetune.yaml b/configs/config/benchmark/imagenet1k_fulltune/eval_vit_8gpu_transfer_in1k_finetune.yaml new file mode 100644 index 000000000..188c4e289 --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/eval_vit_8gpu_transfer_in1k_finetune.yaml @@ -0,0 +1,101 @@ +# @package _global_ +config: + VERBOSE: False + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." 
+ AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + TRANSFORMS: + - name: RandomResizedCrop + size: 384 + - name: RandomHorizontalFlip + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + TRANSFORMS: + - name: Resize + size: 384 + - name: CenterCrop + size: 384 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRAINER: + TRAIN_STEP_NAME: standard_train_step + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: sgd + weight_decay: 0.000 + momentum: 0.9 + num_epochs: 30 + nesterov: True + regularize_bn: False + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.01 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.1 + - name: cosine + start_value: 0.1 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 8 + INIT_METHOD: tcp + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/benchmark/imagenet1k_fulltune/models/vit_b16.yaml b/configs/config/benchmark/imagenet1k_fulltune/models/vit_b16.yaml new file mode 100644 index 000000000..7af586fc1 --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/models/vit_b16.yaml @@ -0,0 +1,40 @@ +# @package _global_ +config: + DATA: + TRAIN: + BATCHSIZE_PER_REPLICA: 32 # Fits on 16gb GPU + TEST: + BATCHSIZE_PER_REPLICA: 32 + MODEL: + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 384 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SKIP_LAYERS: [ + 'heads.0.clf.0.weight', + 'heads.0.clf.0.bias', + 'num_batches_tracked' + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} diff --git a/configs/config/benchmark/imagenet1k_fulltune/models/vit_s16.yaml b/configs/config/benchmark/imagenet1k_fulltune/models/vit_s16.yaml new file mode 100644 index 000000000..e3f4b2c65 --- /dev/null +++ b/configs/config/benchmark/imagenet1k_fulltune/models/vit_s16.yaml @@ -0,0 +1,40 @@ +# @package _global_ +config: + DATA: + TRAIN: + BATCHSIZE_PER_REPLICA: 128 # Fits on 32gb GPU + TEST: + BATCHSIZE_PER_REPLICA: 128 + MODEL: + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 384 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 6 + HIDDEN_DIM: 384 + MLP_DIM: 1536 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [384, 
1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SKIP_LAYERS: [ + 'heads.0.clf.0.weight', + 'heads.0.clf.0.bias', + 'num_batches_tracked' + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/models/deit_s16.yaml b/configs/config/benchmark/linear_image_classification/imagenet1k/models/deit_s16.yaml new file mode 100644 index 000000000..2cc6adc40 --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/models/deit_s16.yaml @@ -0,0 +1,128 @@ +# @package _global_ +config: + VERBOSE: False + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 2048 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 2048 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: False + FEATURE_EVAL_SETTINGS: + EVAL_MODE_ON: True + FREEZE_TRUNK_ONLY: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 6 + HIDDEN_DIM: 384 + MLP_DIM: 1536 + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + DROP_PATH_RATE: 0.1 + HEAD: + PARAMS: [ + ["mlp", {"dims": [384, 1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: sgd + # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4 + weight_decay: 0 + momentum: 0.9 + num_epochs: 100 + nesterov: False + regularize_bn: True + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.1 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.1 + - name: cosine + start_value: 0.1 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b16.yaml 
b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b16.yaml new file mode 100644 index 000000000..6adb3d75b --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b16.yaml @@ -0,0 +1,126 @@ +# @package _global_ +config: + VERBOSE: False + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 2048 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 2048 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: False + FEATURE_EVAL_SETTINGS: + EVAL_MODE_ON: True + FREEZE_TRUNK_ONLY: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: sgd + # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4 + weight_decay: 0 + momentum: 0.9 + num_epochs: 100 + nesterov: False + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.1 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.1 + - name: cosine + start_value: 0.1 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b32.yaml b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b32.yaml new file mode 100644 index 000000000..caf5fa4c1 --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_b32.yaml @@ -0,0 +1,126 @@ +# @package _global_ +config: + VERBOSE: False + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + 
TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 4096 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 4096 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: False + FEATURE_EVAL_SETTINGS: + EVAL_MODE_ON: True + FREEZE_TRUNK_ONLY: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 32 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: sgd + # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4 + weight_decay: 0 + momentum: 0.9 + num_epochs: 100 + nesterov: False + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.1 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.1 + - name: cosine + start_value: 0.1 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_l16.yaml b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_l16.yaml new file mode 100644 index 000000000..0a806c381 --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/models/vit_l16.yaml @@ -0,0 +1,127 @@ +# @package _global_ +config: + VERBOSE: False + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." 
+ AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 1536 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + TEST: + DATA_SOURCES: [disk_folder] + DATA_PATHS: [""] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 1536 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: False + FEATURE_EVAL_SETTINGS: + EVAL_MODE_ON: True + FREEZE_TRUNK_ONLY: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 24 + NUM_HEADS: 16 + HIDDEN_DIM: 1024 + MLP_DIM: 4096 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [1024, 1000]}], + ] + WEIGHTS_INIT: + PARAMS_FILE: "specify the model weights" + STATE_DICT_KEY_NAME: classy_state_dict + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + # USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: sgd + # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4 + weight_decay: 0 + momentum: 0.9 + num_epochs: 100 + nesterov: False + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.1 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.1 + - name: cosine + start_value: 0.1 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/moco.yaml b/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/moco.yaml new file mode 100644 index 000000000..feef10df9 --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/moco.yaml @@ -0,0 +1,20 @@ +# @package _global_ +config: + OPTIMIZER: + name: sgd + weight_decay: 0.00 + momentum: 0.9 + num_epochs: 100 + nesterov: True + regularize_bn: True + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 30.0 + base_lr_batch_size: 256 + name: multistep + values: [30.0, 3.0, 0.3] + milestones: [60, 80] + update_interval: epoch diff --git a/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/vit_high_lr.yaml b/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/vit_high_lr.yaml new file mode 100644 index 000000000..828f7e3a2 --- /dev/null +++ b/configs/config/benchmark/linear_image_classification/imagenet1k/optimizers/vit_high_lr.yaml @@ -0,0 +1,28 @@ +# @package _global_ +config: + OPTIMIZER: + name: sgd + # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4 + 
weight_decay: 0.000001 + momentum: 0.9 + num_epochs: 100 + nesterov: False + regularize_bn: True + regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: true + base_value: 0.3 + base_lr_batch_size: 256 + name: composite + schedulers: + - name: linear + start_value: 0.1 + end_value: 0.3 + - name: cosine + start_value: 0.3 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, .9] diff --git a/configs/config/debugging/pretrain/supervised/supervised_1gpu_vision_transformer_debug_integration_cutmix.yaml b/configs/config/debugging/pretrain/supervised/supervised_1gpu_vision_transformer_debug_integration_cutmix.yaml new file mode 100644 index 000000000..bb0460d17 --- /dev/null +++ b/configs/config/debugging/pretrain/supervised/supervised_1gpu_vision_transformer_debug_integration_cutmix.yaml @@ -0,0 +1,113 @@ +# @package _global_ +config: + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 1 + HOOKS: + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: "." + FLUSH_EVERY_N_MIN: 20 + DATA: + NUM_DATALOADER_WORKERS: 0 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + BATCHSIZE_PER_REPLICA: 32 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: VisslRandAugment + magnitude: 5 + weight_choice: 0 + - name: ToTensor + - name: RandomErasing + p: 1 + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: cutmixup_collator + COLLATE_FUNCTION_PARAMS: { + 'cutmix_alpha': 1.0, + 'label_smoothing': 0.2 + } + TEST: + DATA_SOURCES: [disk_folder] + # DATA_PATHS: [""] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + BATCHSIZE_PER_REPLICA: 32 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 32 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["vision_transformer_head", {"in_plane": 768, "hidden_dim": 3072, + "num_classes": 1000}], + ] + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: adamw + weight_decay: 0.3 + num_epochs: 90 + param_schedulers: + lr: + name: composite + schedulers: + - name: linear + start_value: 0.00001 + end_value: 0.003 + - name: cosine + start_value: 0.001 + end_value: 0.000001 + interval_scaling: [rescaled, fixed] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 1 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu + VERBOSE: True + LOG_FREQUENCY: 100 + TEST_ONLY: False + TEST_EVERY_NUM_EPOCH: 1 + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: fork diff --git a/configs/config/debugging/pretrain/supervised/supervised_deit_b_integration_debug.yaml b/configs/config/debugging/pretrain/supervised/supervised_deit_b_integration_debug.yaml new file mode 100644 index 000000000..cb62f7055 --- /dev/null +++ 
b/configs/config/debugging/pretrain/supervised/supervised_deit_b_integration_debug.yaml @@ -0,0 +1,144 @@ +# @package _global_ +config: + HOOKS: + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 1 + DATA: + NUM_DATALOADER_WORKERS: 0 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + BATCHSIZE_PER_REPLICA: 32 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: VisslRandAugment + magnitude: 9 + magnitude_std: 0.5 + increasing_severity: True + - name: ColorJitter + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - name: ToTensor + - name: RandomErasing + p: 1 + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: cutmixup_collator + COLLATE_FUNCTION_PARAMS: { + "mixup_alpha": 1.0, # mixup alpha value, mixup is active if > 0. + "cutmix_alpha": 1.0, # cutmix alpha value, cutmix is active if > 0. + "prob": 1.0, # probability of applying mixup or cutmix per batch or element + "switch_prob": 0.5, # probability of switching to cutmix instead of mixup when both are active + "mode": "batch", # how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + "correct_lam": True, # apply lambda correction when cutmix bbox clipped by image borders + "label_smoothing": 0.1, # apply label smoothing to the mixed target tensor + "num_classes": 1000 # number of classes for target + } + + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + BATCHSIZE_PER_REPLICA: 64 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + TRUNK: + NAME: convit + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + CLASSIFIER: token + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + CONVIT: + DROP_PATH_RATE: 0.1 # stochastic depth dropout probability + QKV_BIAS: False # Bias for QKV in attention layers. + QK_SCALE: False # Scale + N_GPSA_LAYERS: 0 # Number of gated positional self-attention layers + CLASS_TOKEN_IN_LOCAL_LAYERS: False # Whether to add class token + # Determines how much the positional attention is focused on the + # patch of maximal attention. "Alpha" in the paper. Equivalent to + # the temperature of positional attention softmax. + LOCALITY_STRENGTH: 1. 
+ # Dimensionality of the relative positional embeddings * 1/3 + LOCALITY_DIM: 10 + # Whether to initialize the positional self-attention to be local + # (equivalent to a convolution) + USE_LOCAL_INIT: True + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 1000]}], + ] + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 100 + # We don't want to regularize the position embedding or classification token + non_regularized_parameters: [pos_embedding, class_token] + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.0005 + base_lr_batch_size: 512 + name: composite + schedulers: + - name: linear + start_value: 0.00001 + end_value: 0.0005 + - name: cosine + start_value: 0.0005 + end_value: 0.000001 + interval_scaling: [rescaled, fixed] + update_interval: step + lengths: [0.05, 0.95] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 1 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu + VERBOSE: True + LOG_FREQUENCY: 100 + TEST_ONLY: False + TEST_EVERY_NUM_EPOCH: 1 + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: fork diff --git a/configs/config/debugging/pretrain/swav/swav_integration_debug.yaml b/configs/config/debugging/pretrain/swav/swav_integration_debug.yaml new file mode 100644 index 000000000..fa24d75fa --- /dev/null +++ b/configs/config/debugging/pretrain/swav/swav_integration_debug.yaml @@ -0,0 +1,128 @@ +# @package _global_ +config: + VERBOSE: True + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: False + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + PERF_STAT_FREQUENCY: 10 + ROLLING_BTIME_FREQ: 313 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." 
+ AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 0 + TRAIN: + DATA_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + LABEL_TYPE: zero + BATCHSIZE_PER_REPLICA: 2 + TRANSFORMS: + - name: ImgPilToMultiCrop + total_num_crops: 4 + size_crops: [224, 224] + num_crops: [2, 2] + crop_scales: [[0.14, 1], [0.05, 0.14]] + - name: RandomHorizontalFlip + p: 0.5 + - name: ImgPilColorDistortion + strength: 1.0 + - name: ImgPilGaussianBlur + p: 0.5 + radius_min: 0.1 + radius_max: 2.0 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: cutmixup_collator + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + DROP_LAST: True + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 384 + MLP_DIM: 1536 + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + DROP_PATH_RATE: 0.1 + HYBRID: + HEAD: + PARAMS: [ + ["swav_head", {"dims": [384, 2048, 256], "use_bn": True, "num_clusters": + [3000]}], + ] + TEMP_FROZEN_PARAMS_ITER_MAP: [ + ['module.heads.0.prototypes0.weight', 313], + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: True + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: swav_loss + swav_loss: + temperature: 0.1 + use_double_precision: False + normalize_last_layer: True + num_iters: 3 + epsilon: 0.04 + crops_for_assign: [0, 1] + queue: + queue_length: 3072 + start_iter: 0 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 300 + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.001 + base_lr_batch_size: 4096 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.001 + - name: cosine + start_value: 0.001 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.03, 0.97] + METERS: + name: "" + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 1 + NUM_PROC_PER_NODE: 1 # 1 GPU + RUN_ID: auto + MACHINE: + DEVICE: gpu diff --git a/configs/config/pretrain/vision_transformer/moco/vit_b16.yaml b/configs/config/pretrain/vision_transformer/moco/vit_b16.yaml new file mode 100644 index 000000000..8c0269f8e --- /dev/null +++ b/configs/config/pretrain/vision_transformer/moco/vit_b16.yaml @@ -0,0 +1,118 @@ +# @package _global_ +config: + VERBOSE: True + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: False + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 128 + LABEL_TYPE: sample_index # just an implementation detail. 
Label isn't used + TRANSFORMS: + - name: ImgReplicatePil + num_times: 2 + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + p: 0.5 + - name: ImgPilColorDistortion + strength: 1.0 + - name: ImgPilGaussianBlur + p: 0.5 + radius_min: 0.1 + radius_max: 2.0 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: moco_collator + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + DROP_LAST: True + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 2048], "use_relu": True}], + ["mlp", {"dims": [2048, 128]}], + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + # USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: moco_loss + moco_loss: + embedding_dim: 128 + queue_size: 65536 + momentum: 0.999 + temperature: 0.2 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 300 + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.0005 + base_lr_batch_size: 4096 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.003 + - name: cosine + start_value: 0.003 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: "" + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 4 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60215" + MACHINE: + DEVICE: gpu diff --git a/configs/config/pretrain/vision_transformer/simclr/vit_b16.yaml b/configs/config/pretrain/vision_transformer/simclr/vit_b16.yaml new file mode 100644 index 000000000..ae89d8422 --- /dev/null +++ b/configs/config/pretrain/vision_transformer/simclr/vit_b16.yaml @@ -0,0 +1,117 @@ +# @package _global_ +config: + VERBOSE: True + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: False + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: "/checkpoint/ito/vision_transformer/simclr/b16" + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "/checkpoint/ito/vision_transformer/simclr/b16" + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 64 + LABEL_TYPE: sample_index # just an implementation detail. 
Label isn't used + TRANSFORMS: + - name: ImgReplicatePil + num_times: 2 + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + p: 0.5 + - name: ImgPilColorDistortion + strength: 1.0 + - name: ImgPilGaussianBlur + p: 0.5 + radius_min: 0.1 + radius_max: 2.0 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: simclr_collator + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + DROP_LAST: True + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 3072], "use_relu": True}], + ["mlp", {"dims": [3072, 128]}], + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + # USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: simclr_info_nce_loss + simclr_info_nce_loss: + temperature: 0.1 + buffer_params: + embedding_dim: 128 + OPTIMIZER: + name: adamw + weight_decay: 0.3 + num_epochs: 300 + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.003 + base_lr_batch_size: 4096 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.003 + - name: cosine + start_value: 0.003 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: "" + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 4 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60215" + MACHINE: + DEVICE: gpu diff --git a/configs/config/pretrain/vision_transformer/simclr/vit_l16.yaml b/configs/config/pretrain/vision_transformer/simclr/vit_l16.yaml new file mode 100644 index 000000000..c7599cb6d --- /dev/null +++ b/configs/config/pretrain/vision_transformer/simclr/vit_l16.yaml @@ -0,0 +1,120 @@ +# @package _global_ +config: + VERBOSE: True + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: False + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: "/checkpoint/ito/vision_transformer/simclr/l32" + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "/checkpoint/ito/vision_transformer/simclr/l32" + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 5 + TRAIN: + DATA_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 16 + LABEL_TYPE: sample_index # just an implementation detail. 
Label isn't used + TRANSFORMS: + - name: ImgReplicatePil + num_times: 2 + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + p: 0.5 + - name: ImgPilColorDistortion + strength: 1.0 + - name: ImgPilGaussianBlur + p: 0.5 + radius_min: 0.1 + radius_max: 2.0 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: simclr_collator + MMAP_MODE: True + COPY_TO_LOCAL_DISK: False + COPY_DESTINATION_DIR: /tmp/imagenet1k/ + DROP_LAST: True + MODEL: + GRAD_CLIP: + USE_GRAD_CLIP: True + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 24 + NUM_HEADS: 16 + HIDDEN_DIM: 1024 + MLP_DIM: 4096 + DROPOUT_RATE: 0.1 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + HEAD: + PARAMS: [ + ["mlp", {"dims": [1024, 4096], "use_relu": True}], + ["mlp", {"dims": [4096, 128]}], + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: False + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: simclr_info_nce_loss + simclr_info_nce_loss: + temperature: 0.1 + buffer_params: + embedding_dim: 128 + OPTIMIZER: + name: adamw + weight_decay: 0.3 + # momentum: 0.9 + num_epochs: 300 + # nesterov: True + # regularize_bn: False + # regularize_bias: True + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.003 + base_lr_batch_size: 4096 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.003 + - name: cosine + start_value: 0.003 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.1, 0.9] + METERS: + name: "" + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 4 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60215" + MACHINE: + DEVICE: gpu diff --git a/configs/config/pretrain/vision_transformer/supervised/supervised_16gpu_deit_b_example.yaml b/configs/config/pretrain/vision_transformer/supervised/supervised_16gpu_deit_b_example.yaml new file mode 100644 index 000000000..14dca695a --- /dev/null +++ b/configs/config/pretrain/vision_transformer/supervised/supervised_16gpu_deit_b_example.yaml @@ -0,0 +1,130 @@ +# @package _global_ +config: + HOOKS: + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 1 + DATA: + NUM_DATALOADER_WORKERS: 8 + TRAIN: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 32 + TRANSFORMS: + - name: RandomResizedCrop + size: 224 + - name: RandomHorizontalFlip + - name: VisslRandAugment + magnitude: 9 + magnitude_std: 0.5 + increasing_severity: True + - name: ColorJitter + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - name: ToTensor + - name: RandomErasing + p: 0.25 + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: cutmixup_collator + COLLATE_FUNCTION_PARAMS: { + "mixup_alpha": 0.8, # mixup alpha value, mixup is active if > 0. + "cutmix_alpha": 1.0, # cutmix alpha value, cutmix is active if > 0. 
+ "prob": 1.0, # probability of applying mixup or cutmix per batch or element + "switch_prob": 0.5, # probability of switching to cutmix instead of mixup when both are active + "mode": "batch", # how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + "correct_lam": True, # apply lambda correction when cutmix bbox clipped by image borders + "label_smoothing": 0.1, # apply label smoothing to the mixed target tensor + "num_classes": 1000 # number of classes for target + } + + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 64 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + CLASSIFIER: token + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + DROP_PATH_RATE: 0.1 # stochastic depth dropout probability + HEAD: + PARAMS: [ + ["mlp", {"dims": [768, 1000]}], + ] + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 100 + # We don't want to regularize the position embedding or classification token + non_regularized_parameters: [pos_embedding, class_token] + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.0005 + base_lr_batch_size: 512 + name: composite + schedulers: + - name: linear + start_value: 0.00001 + end_value: 0.0005 + - name: cosine + start_value: 0.0005 + end_value: 0.000001 + interval_scaling: [rescaled, fixed] + update_interval: step + lengths: [0.05, 0.95] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 2 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60521" + MACHINE: + DEVICE: gpu + VERBOSE: True + LOG_FREQUENCY: 100 + TEST_ONLY: False + TEST_EVERY_NUM_EPOCH: 1 + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: fork diff --git a/configs/config/pretrain/vision_transformer/supervised/supervised_1gpu_vit_example.yaml b/configs/config/pretrain/vision_transformer/supervised/supervised_1gpu_vit_example.yaml new file mode 100644 index 000000000..69f760ef9 --- /dev/null +++ b/configs/config/pretrain/vision_transformer/supervised/supervised_1gpu_vit_example.yaml @@ -0,0 +1,125 @@ +# @package _global_ +config: + VERBOSE: True + LOG_FREQUENCY: 10 + TEST_ONLY: False + TEST_MODEL: True + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: forkserver + HOOKS: + PERF_STATS: + MONITOR_PERF_STATS: True + ROLLING_BTIME_FREQ: 313 + PERF_STAT_FREQUENCY: 10 + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 10 + DATA: + NUM_DATALOADER_WORKERS: 6 + TRAIN: + DATA_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + LABEL_SOURCES: [disk_folder] + LABEL_TYPE: sample_index # just an implementation detail. 
Label isn't used + BATCHSIZE_PER_REPLICA: 128 + TRANSFORMS: + - name: ColorJitter + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - name: RandomResizedCrop + size: 224 + - name: VisslAutoAugment + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + - name: RandomErasing + p: 0.25 + TEST: + DATA_SOURCES: [disk_folder] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_folder] + BATCHSIZE_PER_REPLICA: 256 + TRANSFORMS: + - name: Resize + size: 256 + - name: CenterCrop + size: 224 + - name: ToTensor + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + MODEL: + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + CLASSIFIER: token + DROP_PATH_RATE: 0.1 + HEAD: + PARAMS: [ + ["vision_transformer_head", {"in_plane": 768, "hidden_dim": 3072, + "num_classes": 1000}], + ] + SYNC_BN_CONFIG: + CONVERT_BN_TO_SYNC_BN: True + SYNC_BN_TYPE: apex + GROUP_SIZE: 8 + AMP_PARAMS: + USE_AMP: True + # USE_AMP: True + AMP_ARGS: {"opt_level": "O1"} + LOSS: + name: cross_entropy_multiple_output_single_target + cross_entropy_multiple_output_single_target: + ignore_index: -1 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 300 + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.0005 + base_lr_batch_size: 1024 + name: composite + schedulers: + - name: linear + start_value: 0.0 + end_value: 0.0005 + - name: cosine + start_value: 0.0005 + end_value: 0 + interval_scaling: [rescaled, rescaled] + update_interval: step + lengths: [0.017, 0.983] + METERS: + name: accuracy_list_meter + accuracy_list_meter: + num_meters: 1 + topk_values: [1, 5] + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 4 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60215" + MACHINE: + DEVICE: gpu diff --git a/configs/config/pretrain/vision_transformer/swav/swav_deit_b_2nodes.yaml b/configs/config/pretrain/vision_transformer/swav/swav_deit_b_2nodes.yaml new file mode 100644 index 000000000..3c82d2dfb --- /dev/null +++ b/configs/config/pretrain/vision_transformer/swav/swav_deit_b_2nodes.yaml @@ -0,0 +1,129 @@ +# @package _global_ +config: + HOOKS: + TENSORBOARD_SETUP: + USE_TENSORBOARD: True + EXPERIMENT_LOG_DIR: "/checkpoint/ito/vision_transformer/1gpu_test" + FLUSH_EVERY_N_MIN: 20 + CHECKPOINT: + DIR: "." + AUTO_RESUME: True + CHECKPOINT_FREQUENCY: 1 + DATA: + NUM_DATALOADER_WORKERS: 8 + TRAIN: + DATA_SOURCES: [disk_folder] + # DATA_PATHS: [""] + LABEL_SOURCES: [disk_folder] + DATASET_NAMES: [imagenet1k_debug_folder] + LABEL_TYPE: "zero" + BATCHSIZE_PER_REPLICA: 16 + TRANSFORMS: + - name: ImgPilToMultiCrop + total_num_crops: 2 + size_crops: [224] + num_crops: [2] + crop_scales: [[0.14, 1]] + - name: RandomHorizontalFlip + - name: VisslRandAugment + magnitude: 9 + magnitude_std: 0.5 + increasing_severity: True + - name: ColorJitter + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - name: ToTensor + - name: RandomErasing + p: 1 + - name: Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + COLLATE_FUNCTION: cutmixup_collator + COLLATE_FUNCTION_PARAMS: { + "mixup_alpha": 1.0, # mixup alpha value, mixup is active if > 0. + "cutmix_alpha": 1.0, # cutmix alpha value, cutmix is active if > 0. 
+ "prob": 1.0, # probability of applying mixup or cutmix per batch or element + "switch_prob": 0.5, # probability of switching to cutmix instead of mixup when both are active + "mode": "batch", # how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + "correct_lam": True, # apply lambda correction when cutmix bbox clipped by image borders + "label_smoothing": 0.1, # apply label smoothing to the mixed target tensor + "num_classes": 1000 # number of classes for target + } + MODEL: + TRUNK: + NAME: vision_transformer + TRUNK_PARAMS: + VISION_TRANSFORMERS: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 16 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + CLASSIFIER: token + DROPOUT_RATE: 0 + ATTENTION_DROPOUT_RATE: 0 + DROP_PATH_RATE: 0.1 # stochastic depth dropout probability + HEAD: + PARAMS: [ + ["swav_head", {"dims": [768, 2048, 128], "use_bn": True, "num_clusters": + [3000]}], + ] + TEMP_FROZEN_PARAMS_ITER_MAP: [ + ['module.heads.0.prototypes0.weight', 313], + ] + LOSS: + name: swav_loss + swav_loss: + temperature: 0.1 + use_double_precision: False + normalize_last_layer: True + num_iters: 3 + epsilon: 0.05 + crops_for_assign: [0, 1] + queue: + queue_length: 0 + start_iter: 0 + OPTIMIZER: + name: adamw + weight_decay: 0.05 + num_epochs: 300 + # We don't want to regularize the position embedding or classification token + non_regularized_parameters: [pos_embedding, class_token] + param_schedulers: + lr: + auto_lr_scaling: + auto_scale: True + base_value: 0.0005 + base_lr_batch_size: 512 + name: composite + schedulers: + - name: linear + start_value: 0.0005 + end_value: 0.0005 + - name: cosine + start_value: 0.0005 + end_value: 0.000001 + interval_scaling: [rescaled, fixed] + update_interval: step + lengths: [0.05, 0.95] + METERS: + name: "" + TRAINER: + TRAIN_STEP_NAME: standard_train_step + DISTRIBUTED: + BACKEND: nccl + NUM_NODES: 2 + NUM_PROC_PER_NODE: 8 # 1 GPU + RUN_ID: "60521" + MACHINE: + DEVICE: gpu + VERBOSE: True + LOG_FREQUENCY: 100 + TEST_ONLY: False + TEST_EVERY_NUM_EPOCH: 1 + TEST_MODEL: False + SEED_VALUE: 0 + MULTI_PROCESSING_METHOD: fork diff --git a/extra_scripts/experiment_spreadsheet_from_logs.py b/extra_scripts/experiment_spreadsheet_from_logs.py new file mode 100644 index 000000000..735704607 --- /dev/null +++ b/extra_scripts/experiment_spreadsheet_from_logs.py @@ -0,0 +1,303 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import argparse +import ast +import collections +import os +import pathlib +import re +import typing + +import pandas as pd + + +def parse_log(log_path: str, args) -> dict: + config = parse_config_from_log(log_path) + # Check to make sure config not empty + if config: + config = flatten(config) + if args.parse_date_time: + date_time = None + try: + date_time = parse_date_time( + config[args.date_time_param], + args.date_time_pattern, + args.date_time_split_char, + ) + except BaseException: + pass + if not date_time: + print("Unable to parse date/time") + date_time = [None, None] + update_config_date_time(config, date_time) + return config + + +def parse_config_from_log(log_path: str) -> dict: + # String prepending beginning of config + config_start_split_on = r"hydra_config.py: \d*: " + # String at start of config + config_start = "{'CHECKPOINT': " + # String on final line of config + config_end = "'VERBOSE': " + config = "" + # Flag to indicate the config portion of the log has been read + config_finished = False + + # World size info from config is not reliable. 
Use the + # String prepending beginning of world size info + world_size_string = "WORLD_SIZE:" + world_size_btwn = ("WORLD_SIZE:\t", "\n") + world_size = None + + train_losses = [] + train_loss_str = "loss:" + loss_string_btwn = ("loss: ", ";") + + latest_epoch = 0 + epoch_string = "[ep: " + epoch_regex = r"(?<=\[ep: )\d{1,5}(?=\])" + + accuracies = { + "train": {"string": "train_accuracy_list_meter", "values": []}, + "test": {"string": "test_accuracy_list_meter", "values": []}, + } + + with open(log_path) as reader: + store_line = False + # # There are some logs in which the config is printed multiple times. + # # config_read_complete is used to avoid reading more than one config + # # printing. + # config_read_complete = False + for line in reader: + if not store_line: + if world_size_string in line: + world_size = line + if train_loss_str in line: + train_losses.append(line) + for partition in accuracies.keys(): + if accuracies[partition]["string"] in line: + accuracies[partition]["values"].append(line) + if not config_finished: + if config_start in line: + store_line = True + line = re.split(config_start_split_on, line)[1] + if store_line: + config += line + if config_end in line: + store_line = False + config_finished = True + if epoch_string in line: + epoch = re.search(epoch_regex, line) + if epoch: + latest_epoch = int(epoch.group(0)) + + if config: + # Parse into dict + try: + config = ast.literal_eval(config) + config = collections.OrderedDict(config) + except BaseException: + print("Unable to parse dictionary") + config = {} + # Add latest epoch to config + config["latest_epoch"] = latest_epoch + # Parse world size from string + try: + world_size = world_size.split(world_size_btwn[0])[1] + world_size = world_size.split(world_size_btwn[1])[0] + world_size = int(world_size) + # Add to dict + config["WORLD_SIZE"] = world_size + except BaseException: + print("Unable to parse world size") + try: + final_loss = train_losses[-1] + final_loss = final_loss.split(loss_string_btwn[0])[1] + final_loss = final_loss.split(loss_string_btwn[1])[0] + config["final_train_loss"] = final_loss + except BaseException: + print("Unable to parse final training loss") + for partition, partition_contents in accuracies.items(): + if partition_contents["values"]: + try: + final_accuracy_string = partition_contents["values"][-1] + for top_string in ["top_1", "top_5"]: + acc = final_accuracy_string.split("value")[1].split(top_string) + acc = acc[1].split("0: ")[1] + acc = acc.split("}")[0] + param_str = f"final_{partition}_accuracy_{top_string}" + config[param_str] = float(acc) + except BaseException: + print(f"Unable to parse final {partition} accuracy") + else: + print("No information parsed from log file") + config = {} + + return config + + +def flatten(d: collections.abc.MutableMapping, parent_key: str = "", sep: str = "."): + items = [] + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, collections.abc.MutableMapping): + items.extend(flatten(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return collections.OrderedDict(items) + + +def parse_date_time( + str_to_parse: str = None, pattern: str = None, split_char: str = None +): + instances = re.findall(pattern, str_to_parse) + if instances: + date_time = instances[0].split(split_char) + return date_time + + +def update_config_date_time( + config: collections.OrderedDict, date_time: typing.Union[list, tuple] +): + config["date"] = date_time[0] + config["time"] = date_time[1] + 
config.move_to_end("time", last=False) + config.move_to_end("date", last=False) + + +def get_latest_checkpoint(directory: pathlib.PosixPath, args: argparse.Namespace): + latest_checkpoint = None + checkpoint_files = list(directory.glob(f"*{args.checkpoint_id_pattern}*")) + if checkpoint_files: + latest_checkpoint = 0 + for checkpoint_file in checkpoint_files: + checkpoint_file = str(checkpoint_file).split("/")[-1] + checkpoint_epoch = re.findall( + args.checkpoint_extract_pattern, checkpoint_file + ) + checkpoint_epoch = int(checkpoint_epoch[0]) + if checkpoint_epoch > latest_checkpoint: + latest_checkpoint = checkpoint_epoch + pass + else: + print("Unable to parse latest checkpoint information") + return latest_checkpoint + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--root_directory", + nargs="*", + type=str, + help="Directory or directories containing experiment " + "run or subdirectories of runs", + ) + parser.add_argument( + "--output_directory", + type=str, + default=os.getcwd(), + help="Where to save output.", + ) + parser.add_argument( + "--output_name", type=str, default="experiments.txt", help="Output filename" + ) + parser.add_argument( + "--parse_date_time", + type=bool, + default=True, + help="Parse date and time from config", + ) + parser.add_argument( + "--date_time_param", + type=str, + default="CHECKPOINT.DIR", + help="config param from whose value the date and time will be parsed", + ) + parser.add_argument( + "--date_time_pattern", + type=str, + default="[0-9]{4}-[0-9][0-9]-[0-9][0-9]/[0-9][0-9]-[0-9][0-9]-[0-9][0-9]", + help="Regex pattern for date and time format", + ) + parser.add_argument( + "--date_time_split_char", + type=str, + default="/", + help="character to split date and time string into " "separate strings", + ) + parser.add_argument( + "--log_file_name_pattern", + type=str, + default="log.txt", + help="pattern to match for log " "file names", + ) + parser.add_argument( + "--parse_checkpoint", + type=bool, + default=True, + help="Parse # training epochs from checkpoint file", + ) + parser.add_argument( + "--checkpoint_id_pattern", + type=str, + default="_phase", + help="pattern to match for " "checkpoint file names", + ) + parser.add_argument( + "--checkpoint_extract_pattern", + type=str, + default=r"phase([0-9]{1,4})\.torch", + help="pattern to extract epoch # from checkpoint file name", + ) + + args = parser.parse_args() + + log_files = [] + for directory in args.root_directory: + log_files.extend( + list(pathlib.Path(directory).rglob(args.log_file_name_pattern)) + ) + + configs_to_concat = [] + for f in log_files: + did_not_add = True + print(f"\nParsing {f}") + config = parse_log(str(f), args) + if args.parse_checkpoint: + last_checkpoint = get_latest_checkpoint(f.parent, args) + config["last_checkpoint_phase"] = last_checkpoint + if args.parse_checkpoint and config["last_checkpoint_phase"]: + configs_to_concat.append(config) + did_not_add = False + elif not args.parse_checkpoint: + configs_to_concat.append(config) + did_not_add = False + if did_not_add: + print(f"Did not add\n{f}\nto file") + if not did_not_add: + print(f"Added \n{f}\nto file") + df = pd.DataFrame(configs_to_concat) + # Sort columns + df = df.reindex(sorted(df.columns), axis=1) + # Move specific columns to beginning. Columns are listed here in reverse + # order. The final item in the list will be the first column. 
+ prepend_columns = [ + "final_train_loss", + "final_test_loss", + "final_train_accuracy_top_1", + "final_train_accuracy_top_5", + "final_test_accuracy_top_1", + "final_test_accuracy_top_5", + "latest_epoch", + "last_checkpoint_phase", + "time", + "date", + ] + for prepend_column in prepend_columns: + if prepend_column in df.columns: + df.insert(0, prepend_column, df.pop(prepend_column)) + output_full_path = os.path.join(args.output_directory, args.output_name) + df.to_csv(output_full_path) + print(f"Saved {output_full_path}") diff --git a/tests/test_mlp.py b/tests/test_mlp.py index 05bd70e22..1e5a226dd 100644 --- a/tests/test_mlp.py +++ b/tests/test_mlp.py @@ -49,9 +49,7 @@ def test_mlp_catch_bad_shapes(self): def test_eval_mlp_shape(self): eval_mlp = LinearEvalMLP( - self.MODEL_CONFIG, - in_channels=2048, - dims=[2048 * 2 * 2, 1000], + self.MODEL_CONFIG, in_channels=2048, dims=[2048 * 2 * 2, 1000] ) resnet_feature_map = torch.randn(size=(4, 2048, 2, 2)) diff --git a/vissl/config/defaults.yaml b/vissl/config/defaults.yaml index 0e350fbe4..880b75f29 100644 --- a/vissl/config/defaults.yaml +++ b/vissl/config/defaults.yaml @@ -166,10 +166,12 @@ config: DATA_PATHS: [] LABEL_SOURCES: [] LABEL_PATHS: [] - # either standard | sample_index + # either standard | sample_index | zero # sample_index is a common practice in self-supervised learning and sample_index = id of the # sample in the data. # standard label type is used for supervised learning and user specifis the labels to use. + # zero sets all labels to 0, which is necessary when using necessary + # when cutmixup_collator is being used for self-supervised training. LABEL_TYPE: "standard" # whether to memory map the input data. MMAP_MODE: True @@ -180,11 +182,69 @@ config: # number of unique samples in minibatch per gpu (or per device) BATCHSIZE_PER_REPLICA: 256 # list of data transforms to apply on the data + # Example: using RandAugment (https://arxiv.org/abs/1909.13719) + # :param magnitude: integer magnitude of rand augment + # :param magnitude_std: standard deviation of magnitude. If > 0, + # introduces random variability in the augmentation magnitude. + # :param num_layers: integer number of transforms + # :param increasing_severity: boolean that indicates whether to use + # augmentations that increase severity w/ increasing magnitude. Some + # augmentations do this by default. + # :param choice_weights: Index of pre-determined probability distribution + # over augmentations. Currently only one such distribution available (i.e. + # no valid values other than 0 or None), unclear if beneficial. Default = + # None. + # TRANSFORMS: + # - name: VisslRandAugment + # magnitude: 9 + # magnitude_std: 0.5 + # num_layers: 2 + # increasing_severity: True + # + # + # Example: using AutoAugment (https://arxiv.org/abs/1805.09501). This + # autoaugment differs from the torchvision implementation by allowing + # variability in the augmentation intensity. + # ":param policy_name: String. One of 'v0', 'v0r', 'original', 'originalr'. + # One of a set of learned augmentation sequences. + # :param magnitude_std: standard deviation of magnitude. If > 0, introduces + # random variability in the augmentation magnitude. + # TRANSFORMS: + # - name: VisslAutoAugment + # policy_name: v0 + # magnitude_std: 0 TRANSFORMS: [] # collator to use: either pytorch default or user defined custom collator. 
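+ # Example (illustrative sketch): minimal collator settings for a
+ # self-supervised run with CutMix/MixUp; the values are placeholders. See
+ # the notes and the COLLATE_FUNCTION_PARAMS example below for full options.
+ # LABEL_TYPE: zero                      # required for SSL cutmix/mixup
+ # COLLATE_FUNCTION: "cutmixup_collator"
+ # COLLATE_FUNCTION_PARAMS: {"ssl_method": "simclr"}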
+ # Using the cutmixup_collator in a supervised setting requires the use + # of the cross_entropy_multiple_output_single_target loss (see LOSS + # section below in order to accomodate label-smoothing. Using the + # cutmixup_collator in a self-supervised setting requires setting + # DATA.{TRAIN/TEST}.LABEL_TYPE: zero COLLATE_FUNCTION: "default_collate" # parameters taken by the collator function (if any). COLLATE_FUNCTION_PARAMS: {} + # Example: params for cutmixup_collator to implement CutMix and MixUp + # COLLATE_FUNCTION: "cutmixup_collator" + # COLLATE_FUNCTION_PARAMS: { + # # Adjust collator output to accomodate SSL method. + # # Currently supports "moco" or "simclr". + # # No argument needed if using vissl or supervised. + # "ssl_method": "moco" + # "mixup_alpha": 1.0, # mixup alpha value, mixup is active if > 0. + # "cutmix_alpha": 0.0, # cutmix alpha value, cutmix is active if > 0. + # "cutmix_minmax": None, # cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None. + # "prob": 1.0, # probability of applying mixup or cutmix per batch or element + # "switch_prob": 0.5, # probability of switching to cutmix instead of mixup when both are active + # "mode": "batch", # how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + # "correct_lam": True, # apply lambda correction when cutmix bbox clipped by image borders + # "label_smoothing": 0.1, # apply label smoothing to the mixed target tensor + # "num_classes": 1000 # number of classes for target + # } + # Also note that using the CutMixUp collator in a supervised context + # requires using the cross_entropy_multiple_output_single_target to + # accomodate the smoothed labels. See + # LOSS.cross_entropy_multiple_output_single_target for more information. + # # limit the amount of data used in training. If set to -1, full dataset is used. DATA_LIMIT: -1 # whether the data specified (whether file list or directory) should be copied locally @@ -352,6 +412,14 @@ config: # ] LINEAR_EVAL_FEAT_POOL_OPS_MAP: [] # ----------------------------------------------------------------------------------- # + # GRADIENT CLIPPING. Used by Dosovitskiy et al. in their Vision + # Transformer paper. + # ----------------------------------------------------------------------------------- # + GRAD_CLIP: # See TORCH.NN.UTILS.CLIP_GRAD_NORM_ + USE_GRAD_CLIP: False + NORM_TYPE: 2 # Float, int, or 'inf' + MAX_NORM: 1 + # ----------------------------------------------------------------------------------- # # MODEL TRUNK # ----------------------------------------------------------------------------------- # TRUNK: @@ -364,7 +432,12 @@ config: RESNETS: DEPTH: 50 WIDTH_MULTIPLIER: 1 - NORM: BatchNorm # BatchNorm | LayerNorm + NORM: BatchNorm # BatchNorm | LayerNorm | GroupNorm + # If using GroupNorm, this sets number of groups. Recommend 32 as a + # naive suggestion. GroupNorm only available for ResNe(X)t. + GROUPNORM_GROUPS: 32 + # Use weight-standardized convolutions + STANDARDIZE_CONVOLUTIONS: False GROUPS: 1 ZERO_INIT_RESIDUAL: False WIDTH_PER_GROUP: 64 @@ -383,6 +456,35 @@ config: # RegNet params # ------------------------------------------------------------- # REGNET: {} + + # ------------------------------------------------------------- # + # Vision Transformer/DeiT params. Using a name will + # override/ignore all other VISION_TRANSFORMERS parameters. Named + # options include vit_b_32, vit_b_16, vit_l_32, vit_l_16, vit_h_14. 
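+ # Example (illustrative): select a named variant instead of filling in the
+ # individual fields below, e.g.
+ # VISION_TRANSFORMERS:
+ #   name: vit_b_16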
+ # Using + # ------------------------------------------------------------- # + VISION_TRANSFORMERS: + name: + IMAGE_SIZE: 224 + PATCH_SIZE: 16 + NUM_LAYERS: 12 + NUM_HEADS: 12 + HIDDEN_DIM: 768 + MLP_DIM: 3072 + # MLP and projection layer dropout rate + DROPOUT_RATE: 0 + # Attention dropout rate + ATTENTION_DROPOUT_RATE: 0 + # Use the token for classification. Currently no alternatives + # supported + CLASSIFIER: token + # Stochastic depth dropout rate. Turning on stochastic depth and + # using aggressive augmentation is essentially the difference + # between a DeiT and a ViT. + DROP_PATH_RATE: 0 + QKV_BIAS: False # Bias for QKV in attention layers. + QK_SCALE: False # Scale + # ----------------------------------------------------------------------------------- # # MODEL HEAD # ----------------------------------------------------------------------------------- # @@ -511,7 +613,16 @@ config: ignore_index: -1 # ----------------------------------------------------------------------------------- # - # Cross-Entropy Loss for multiple input and same target + # Cross-Entropy Loss for multiple outputs and same target. For a single + # output, this is equivalent to the cross-entropy loss. For multiple + # outputs, this computes the sum of the cross-entropy losses for each + # tensor in the list against the target. Can also accomodate target + # vectors in addition to single integer targets, for example when using + # label smoothing. Note that the internally, cross_entropy_multiple_output_single_target + # determines whether each sample is associated with a single target or + # whether each sample is associated with a target vector, and uses vanilla + # CrossEntropyLoss for the single-target case and a custom cross entropy + # function for the multi-target case. # ----------------------------------------------------------------------------------- # cross_entropy_multiple_output_single_target: weight: null @@ -663,10 +774,17 @@ config: nesterov: False # for how many epochs to do training. only counts training epochs. num_epochs: 90 + betas: [.9, .999] # for Adam/AdamW # whether to regularize batch norm. if set to False, weight decay of batch norm params is 0. regularize_bn: False # whether to regularize bias parameter. if set to False, weight decay of bias params is 0. regularize_bias: True + # Parameters to omit from regularization. Any named parameter whose name + # contains any of these strings will be omitted from regularization. + # For example, we don't want to regularize the class token or position + # embeddings in the vision transformer, so we pass: + # non_regularized_parameters: ['class_token', 'pos_embedding'] + non_regularized_parameters: [] # we support using a different LR and weight decay for head and trunk. # one needs to set the flag "use_different_values: True" in order to enable # this functionality. We use the same type of param scheduler for the trunk and head @@ -735,6 +853,7 @@ config: end_value: 0.0 # =====constant learning rate specific ======= value: 0.1 + # ----------------------------------------------------------------------------------- # # CLUSTERFIT APPROACH (https://arxiv.org/abs/1912.03330) # ----------------------------------------------------------------------------------- # diff --git a/vissl/data/collators/cutmixup_collator.py b/vissl/data/collators/cutmixup_collator.py new file mode 100644 index 000000000..4f5bfb394 --- /dev/null +++ b/vissl/data/collators/cutmixup_collator.py @@ -0,0 +1,502 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +""" +This implementation is based on +https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/mixup.py, +published under an Apache License 2.0, with modifications by Matthew Leavitt +(ito@fb.com; matthew.l.leavitt@gmail.com). Modifications are described here and +notated where present in the code. + +Modifications: +- _mix_batch.__call__() now checks device of data its passed, and passes +device argument accordingly. Previous behavior allowed called functions to +default to using cuda, which caused an error when using CPU-based data. + +COMMENT FROM ORIGINAL: +Mixup and Cutmix +Papers: +mixup: Beyond Empirical Risk Minimization (https://arxiv.org/abs/1710.09412) +CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features (https://arxiv.org/abs/1905.04899) # NOQA +Code Reference: +CutMix: https://github.com/clovaai/CutMix-PyTorch +Hacked together by / Copyright 2020 Ross Wightman +""" + +import collections.abc as abc +from typing import Any, Dict, Optional + +import numpy as np +import torch +from classy_vision.generic.util import convert_to_one_hot +from torch.distributions.beta import Beta +from vissl.data.collators import register_collator + +from .moco_collator import moco_collator +from .simclr_collator import simclr_collator + + +# TODO: Uncomment in future update when calling via ClassyVision +# from classy_vision.dataset.transforms import mixup as classy_cutmixup + + +# Modification/addition +@register_collator("cutmixup_collator") +def cutmixup_collator(batch, **kwargs): + """ + This collator implements CutMix (https://arxiv.org/abs/1905.04899) and/or + MixUp (https://arxiv.org/abs/1710.09412) via ClassyVision's + implementation (link when publicly available). + + kwargs: + :mixup_alpha (float): mixup alpha value, mixup is active if > 0. + :cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. + :cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active + and uses this vs alpha if not None. + :prob (float): probability of applying mixup or cutmix per batch or element + :switch_prob (float): probability of switching to cutmix instead of mixup + when both are active + :mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of + elements), 'elem' (element) + :correct_lam (bool): apply lambda correction when cutmix bbox clipped by + image borders + :label_smoothing (float): apply label smoothing to the mixed target tensor + :num_classes (int): number of classes for target + + + The collators collates the batch for the following input (assuming k-copies of image): + + Input: + batch: Example + batch = [ + {"data" : [img1_0, ..., img1_k], ..}, + {"data" : [img2_0, ..., img2_k], ...}, + ... + ] + + Returns: Example output: + output = { + "data": torch.tensor([img1_0, ..., imgN_0], + [img1_k, ..., imgN_k]) .. + } + """ + assert "data" in batch[0], "data not found in sample" + assert "label" in batch[0], "label not found in sample" + + data = [x["data"] for x in batch] + labels = [torch.tensor(x["label"]) for x in batch] + data_valid = [torch.tensor(x["data_valid"]) for x in batch] + data_idx = [torch.tensor(x["data_idx"]) for x in batch] + num_duplicates, num_images = len(data[0]), len(data) + + # Determine ssl method and adjust collator output accordingly + ssl_method = None + if "ssl_method" in kwargs.keys(): + ssl_method = kwargs.pop("ssl_method") + + # Instantiate CutMix + Mixup (CutMixUp!) 
object + cutmixup_transform_obj = Mixup(**kwargs) + # TODO: Uncomment in future update when calling via ClassyVision + # cutmixup_transform_obj = classy_cutmixup.Mixup(**kwargs) + + output_data, output_label, output_data_valid, output_data_idx = [], [], [], [] + for pos in range(num_duplicates): + cutmixup_data, cutmixup_labels = [], [] + for idx in range(num_images): + cutmixup_data.append(data[idx][pos]) + cutmixup_labels.append(labels[idx][pos]) + output_data_valid.append(data_valid[idx][pos]) + output_data_idx.append(data_idx[idx][pos]) + # Get data and labels into format accepted by Mixup + cutmixup_data = torch.stack(cutmixup_data) + cutmixup_labels = torch.tensor(cutmixup_labels) + cutmixup_output = cutmixup_transform_obj( + {"input": cutmixup_data, "target": cutmixup_labels} + ) + output_data.append(cutmixup_output["input"]) + output_label.append(cutmixup_output["target"]) + + # If using moco or simclr, first restructure the data back into the form + # in which it was originally input, then call the collator for that ssl + # method + if ssl_method == "moco" or ssl_method == "simclr": + output_batch = data_back_to_input_form( + output_data, output_label, output_data_valid, output_data_idx + ) + if ssl_method == "moco": + return moco_collator(output_batch) + elif ssl_method == "simclr": + return simclr_collator(output_batch) + output_batch = { + "data": [output_data], + "label": [torch.cat(output_label)], + "data_valid": [torch.stack(output_data_valid)], + "data_idx": [torch.stack(output_data_idx)], + } + return output_batch + + +# Modification/addition +def data_back_to_input_form(data, labels, data_valid, data_idx): + """ + "De"-collates data back into their form when originally passed. + """ + assert len(data) == len(labels) + assert len(data_idx) == len(data_valid) + data_input_form = [] + num_duplicates, num_images = len(data), len(data[0]) + for sample_i in range(num_images): + sample_input_form = {"data": [], "data_valid": [], "data_idx": [], "label": []} + for duplicate_i in range(num_duplicates): + sample_input_form["data"].append(data[duplicate_i][sample_i]) + sample_input_form["label"].append(labels[duplicate_i][sample_i]) + valid_and_idx_i = sample_i + (num_duplicates * duplicate_i) + sample_input_form["data_idx"].append(data_idx[valid_and_idx_i]) + sample_input_form["data_valid"].append(data_valid[valid_and_idx_i]) + data_input_form.append(sample_input_form) + return data_input_form + + +# TODO: Delete everything from here down in future update when calling via +# ClassyVision +# Everything from here down is copied directly from +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py +# unless otherwise noted. +def _recursive_mixup(sample: Any, permuted_indices: torch.Tensor, coeff: float): + if isinstance(sample, (tuple, list)): + mixed_sample = [] + for s in sample: + mixed_sample.append(_recursive_mixup(s, permuted_indices, coeff)) + + return mixed_sample if isinstance(sample, list) else tuple(mixed_sample) + elif isinstance(sample, abc.Mapping): + mixed_sample = {} + for key, val in sample.items(): + mixed_sample[key] = _recursive_mixup(val, permuted_indices, coeff) + + return mixed_sample + else: + assert torch.is_tensor(sample), "sample is expected to be a pytorch tensor" + # Assume training data is at least 3D tensor (i.e. 1D data). We only + # mixup content data tensor (e.g. video clip, audio spectrogram), and skip + # other tensors, such as frame_idx and timestamp in video clip samples. 
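+ # Content tensors (e.g. a batch of images or spectrograms) are at least
+ # 3-D including the batch dimension; they are replaced by the mixup convex
+ # combination coeff * x + (1 - coeff) * x[permuted_indices], while
+ # lower-dimensional tensors are returned unchanged.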
+ if sample.ndim >= 3: + sample = coeff * sample + (1.0 - coeff) * sample[permuted_indices, :] + + return sample + + +class MixupTransform: + """ + This implements the mixup data augmentation in the paper + "mixup: Beyond Empirical Risk Minimization" (https://arxiv.org/abs/1710.09412) + """ + + def __init__(self, alpha: float, num_classes: Optional[int] = None): + """ + Args: + alpha: the hyperparameter of Beta distribution used to sample mixup + coefficient. + num_classes: number of classes in the dataset. + """ + self.alpha = alpha + self.num_classes = num_classes + + def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]: + """ + Args: + sample: the batch data. + """ + if sample["target"].ndim == 1: + assert self.num_classes is not None, "num_classes is expected for 1D target" + sample["target"] = convert_to_one_hot( + sample["target"].view(-1, 1), self.num_classes + ) + else: + assert sample["target"].ndim == 2, "target tensor shape must be 1D or 2D" + + c = Beta(self.alpha, self.alpha).sample().to(device=sample["target"].device) + permuted_indices = torch.randperm(sample["target"].shape[0]) + + sample["target"] = ( + c * sample["target"] + (1.0 - c) * sample["target"][permuted_indices, :] + ) + sample["input"] = _recursive_mixup(sample["input"], permuted_indices, c) + + return sample + + +def one_hot(x, num_classes, on_value=1.0, off_value=0.0, device="cuda"): + x = x.long().view(-1, 1) + return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_( + 1, x, on_value + ) + + +def mixup_target(target, num_classes, lam=1.0, smoothing=0.0, device="cuda"): + off_value = smoothing / num_classes + on_value = 1.0 - smoothing + off_value + y1 = one_hot( + target, num_classes, on_value=on_value, off_value=off_value, device=device + ) + y2 = one_hot( + target.flip(0), + num_classes, + on_value=on_value, + off_value=off_value, + device=device, + ) + return y1 * lam + y2 * (1.0 - lam) + + +def rand_bbox(img_shape, lam, margin=0.0, count=None): + """Standard CutMix bounding-box + Generates a random square bbox based on lambda value. This impl includes + support for enforcing a border margin as percent of bbox dimensions. + Args: + img_shape (tuple): Image shape as tuple + lam (float): Cutmix lambda value + margin (float): Percentage of bbox dimension to enforce as margin + (reduce amount of box outside image) + count (int): Number of bbox to generate + """ + ratio = np.sqrt(1 - lam) + img_h, img_w = img_shape[-2:] + cut_h, cut_w = int(img_h * ratio), int(img_w * ratio) + margin_y, margin_x = int(margin * cut_h), int(margin * cut_w) + cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count) + cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count) + yl = np.clip(cy - cut_h // 2, 0, img_h) + yh = np.clip(cy + cut_h // 2, 0, img_h) + xl = np.clip(cx - cut_w // 2, 0, img_w) + xh = np.clip(cx + cut_w // 2, 0, img_w) + return yl, yh, xl, xh + + +def rand_bbox_minmax(img_shape, minmax, count=None): + """Min-Max CutMix bounding-box + Inspired by Darknet cutmix impl, generates a random rectangular bbox + based on min/max percent values applied to each dimension of the input image. + Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 + range for max. 
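+ For example (illustrative), with minmax=(0.2, 0.8) on a 224x224 image the
+ cut height and width are each drawn uniformly from
+ [int(0.2 * 224), int(0.8 * 224)) = [44, 179) pixels.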
+ Args: + img_shape (tuple): Image shape as tuple + minmax (tuple or list): Min and max bbox ratios (as percent of image + size) + count (int): Number of bbox to generate + """ + assert len(minmax) == 2 + img_h, img_w = img_shape[-2:] + cut_h = np.random.randint( + int(img_h * minmax[0]), int(img_h * minmax[1]), size=count + ) + cut_w = np.random.randint( + int(img_w * minmax[0]), int(img_w * minmax[1]), size=count + ) + yl = np.random.randint(0, img_h - cut_h, size=count) + xl = np.random.randint(0, img_w - cut_w, size=count) + yu = yl + cut_h + xu = xl + cut_w + return yl, yu, xl, xu + + +def cutmix_bbox_and_lam( + img_shape, lam, ratio_minmax=None, correct_lam=True, count=None +): + """Generate bbox and apply lambda correction.""" + if ratio_minmax is not None: + yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count) + else: + yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count) + if correct_lam or ratio_minmax is not None: + bbox_area = (yu - yl) * (xu - xl) + lam = 1.0 - bbox_area / float(img_shape[-2] * img_shape[-1]) + return (yl, yu, xl, xu), lam + + +class Mixup: + """Mixup/Cutmix that applies different params to each element or whole batch + Args: + mixup_alpha (float): mixup alpha value, mixup is active if > 0. + cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. + cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is + active and uses this vs alpha if not None. + prob (float): probability of applying mixup or cutmix per batch or + element + switch_prob (float): probability of switching to cutmix instead of + mixup when both are active + mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair + of elements), 'elem' (element) + correct_lam (bool): apply lambda correction when cutmix bbox clipped by + image borders + label_smoothing (float): apply label smoothing to the mixed target + tensor + num_classes (int): number of classes for target + """ + + def __init__( + self, + mixup_alpha=1.0, + cutmix_alpha=0.0, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode="batch", + correct_lam=True, + label_smoothing=0.1, + num_classes=1000, + ): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if self.cutmix_minmax is not None: + assert len(self.cutmix_minmax) == 2 + # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = ( + correct_lam # correct lambda based on clipped area for cutmix + ) + self.mixup_enabled = ( + True # set to false to disable mixing (intended tp be set by train loop) + ) + + def _params_per_elem(self, batch_size): + lam = np.ones(batch_size, dtype=np.float32) + use_cutmix = np.zeros(batch_size, dtype=np.bool) + if self.mixup_enabled: + if self.mixup_alpha > 0.0 and self.cutmix_alpha > 0.0: + use_cutmix = np.random.rand(batch_size) < self.switch_prob + lam_mix = np.where( + use_cutmix, + np.random.beta( + self.cutmix_alpha, self.cutmix_alpha, size=batch_size + ), + np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size), + ) + elif self.mixup_alpha > 0.0: + lam_mix = np.random.beta( + self.mixup_alpha, self.mixup_alpha, size=batch_size + ) + elif self.cutmix_alpha > 0.0: + use_cutmix = np.ones(batch_size, dtype=np.bool) + lam_mix = np.random.beta( + self.cutmix_alpha, self.cutmix_alpha, size=batch_size + ) + else: + 
assert AssertionError, ( + "One of mixup_alpha > 0., cutmix_alpha > 0.," + "cutmix_minmax not None should be true." + ) + lam = np.where( + np.random.rand(batch_size) < self.mix_prob, + lam_mix.astype(np.float32), + lam, + ) + return lam, use_cutmix + + def _params_per_batch(self): + lam = 1.0 + use_cutmix = False + if self.mixup_enabled and np.random.rand() < self.mix_prob: + if self.mixup_alpha > 0.0 and self.cutmix_alpha > 0.0: + use_cutmix = np.random.rand() < self.switch_prob + lam_mix = ( + np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + if use_cutmix + else np.random.beta(self.mixup_alpha, self.mixup_alpha) + ) + elif self.mixup_alpha > 0.0: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.cutmix_alpha > 0.0: + use_cutmix = True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + else: + assert AssertionError, ( + "One of mixup_alpha > 0., cutmix_alpha > 0.," + "cutmix_minmax not None should be true." + ) + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_elem(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.0: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam, + ) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_pair(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.0: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam, + ) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + x[j] = x[j] * lam + x_orig[i] * (1 - lam) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_batch(self, x): + lam, use_cutmix = self._params_per_batch() + if lam == 1.0: + return 1.0 + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x.shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam, + ) + x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh] + else: + x_flipped = x.flip(0).mul_(1.0 - lam) + x.mul_(lam).add_(x_flipped) + return lam + + def __call__(self, sample): + x = sample["input"] + target = sample["target"] + assert len(x) % 2 == 0, "Batch size should be even when using this" + if self.mode == "elem": + lam = self._mix_elem(x) + elif self.mode == "pair": + lam = self._mix_pair(x) + else: + lam = self._mix_batch(x) + # Modified to pass device argument based on target.device to prevent + # failure on CPU-based data. 
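+ # mixup_target one-hot encodes the integer targets, applies label
+ # smoothing, and blends each target with that of the sample it was mixed
+ # with, using the same lambda(s) applied to the inputs above.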
+ target = mixup_target( + target, self.num_classes, lam, self.label_smoothing, device=target.device + ) + return {"input": x, "target": target} diff --git a/vissl/data/ssl_dataset.py b/vissl/data/ssl_dataset.py index b35a631c5..2e966b9db 100644 --- a/vissl/data/ssl_dataset.py +++ b/vissl/data/ssl_dataset.py @@ -235,6 +235,15 @@ def __getitem__(self, idx): item["data_idx"].append(idx) item["data_valid"].append(1 if valid else -1) + # There are three types of label_type (data labels): "standard", + # "sample_index", and "zero". "standard" uses the labels associated + # with a data set (e.g. directory names). "sample_index" assigns each + # sample a label that corresponds to that sample's index in the + # dataset (first sample will have label == 0, etc.), and is used for + # SSL tasks in which the label is arbitrary. "zero" assigns + # each sample the label == 0, which is necessary when using the + # CutMixUp collator because of the label smoothing that is built in + # to its functionality. if (len(self.label_objs) > 0) or self.label_type == "standard": item["label"] = [] for source in self.label_objs: @@ -247,6 +256,10 @@ def __getitem__(self, idx): item["label"] = [] for _ in range(len(self.data_objs)): item["label"].append(idx) + elif self.label_type == "zero": + item["label"] = [] + for _ in range(len(self.data_objs)): + item["label"].append(0) else: raise ValueError(f"Unknown label type: {self.label_type}") diff --git a/vissl/data/ssl_transforms/rand_auto_aug.py b/vissl/data/ssl_transforms/rand_auto_aug.py new file mode 100644 index 000000000..00b49f3f4 --- /dev/null +++ b/vissl/data/ssl_transforms/rand_auto_aug.py @@ -0,0 +1,721 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +""" +This implementation is based on +https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py, +pulished under an Apache License 2.0, with modifications by Matthew Leavitt ( +ito@fb.com; matthew.l.leavitt@gmail.com). Modifications are described here and +notated where present in the code. + +Modifications: +-Removed AugMix functionality. +-Replaced AutoAugment and RandAugment classes, which are no longer passed a +single parameter string that needs to be parsed, but instead individual, +named parameters. + +COMMENT FROM ORIGINAL: +AutoAugment, RandAugment, and AugMix for PyTorch +This code implements the searched ImageNet policies with various tweaks and +improvements and does not include any of the search code. AA and RA +Implementation adapted from: + https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py +AugMix adapted from: + https://github.com/google-research/augmix +Papers: + AutoAugment: Learning Augmentation Policies from Data + https://arxiv.org/abs/1805.09501 + Learning Data Augmentation Strategies for Object Detection + https://arxiv.org/abs/1906.11172 + RandAugment: Practical automated data augmentation... 
+ https://arxiv.org/abs/1909.13719 + AugMix: A Simple Data Processing Method to Improve Robustness and + Uncertainty https://arxiv.org/abs/1912.02781 + +Hacked together by / Copyright 2020 Ross Wightman +""" +import math +import random +import re + +import numpy as np +import PIL +from classy_vision.dataset.transforms import register_transform +from classy_vision.dataset.transforms.classy_transform import ClassyTransform +from PIL import Image, ImageEnhance, ImageOps + + +# TODO: Uncomment in future update when calling via ClassyVision +# from classy_vision.dataset.transforms.timm_autoaugment import \ +# _RAND_TRANSFORMS, _RAND_INCREASING_TRANSFORMS, rand_augment_ops, \ +# _HPARAMS_DEFAULT, _select_rand_weights, auto_augment_policy + + +# TODO: Delete in future update when calling via ClassyVision +_PIL_VER = tuple(int(x) for x in PIL.__version__.split(".")[:2]) + +# TODO: Delete in future update when calling via ClassyVision +_FILL = (128, 128, 128) + +# TODO: Delete in future update when calling via ClassyVision +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10.0 + +# TODO: Delete in future update when calling via ClassyVision +_HPARAMS_DEFAULT = {"translate_const": 250, "img_mean": _FILL} + +# TODO: Delete in future update when calling via ClassyVision +_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + + +# Modification/Addition +@register_transform("VisslRandAugment") +class RandAugment(ClassyTransform): + """ + Create a RandAugment transform. + :param magnitude: integer magnitude of rand augment + :param magnitude_std: standard deviation of magnitude. If > 0, introduces + random variability in the augmentation magnitude. + :param num_layers: integer number of transforms + :param increasing_severity: boolean that indicates whether to use + augmentations that increase severity w/ increasing magnitude. Some + augmentations do this by default. + :param weight_choice: Index of pre-determined probability distribution + over augmentations. Currently only one such distribution available (i.e. + no valid values other than 0 or None), unclear if beneficial. Default = + None. + """ + + def __init__( + self, + magnitude=10, + magnitude_std=0, + num_layers=2, + increasing_severity=False, + weight_choice=None, + **kwargs + ): + hparams = kwargs + hparams.update(_HPARAMS_DEFAULT) + hparams["magnitude_std"] = magnitude_std + if increasing_severity: + transforms = _RAND_INCREASING_TRANSFORMS + else: + transforms = _RAND_TRANSFORMS + self.num_layers = num_layers + self.choice_weights = ( + None if weight_choice is None else _select_rand_weights(weight_choice) + ) + self.ops = rand_augment_ops( + magnitude=magnitude, hparams=hparams, transforms=transforms + ) + + def __call__(self, img): + # no replacement when using weighted choice + ops = np.random.choice( + self.ops, + self.num_layers, + replace=self.choice_weights is None, + p=self.choice_weights, + ) + for op in ops: + img = op(img) + return img + + +# Modification/Addition +@register_transform("VisslAutoAugment") +class AutoAugment(ClassyTransform): + """ + Create a AutoAugment transform. This autoaugment differs from the + torchvision implementation by allowing variability in the augmentation + intensity. + ":param policy_name: String. One of 'v0', 'v0r', 'original', 'originalr'. + One of a set of learned augmentation sequences. + :param magnitude_std: standard deviation of magnitude. If > 0, introduces + random variability in the augmentation magnitude. 
+ :kwargs: Other params for the AutoAugmentation scheme. See RandAugment + class above, or AugmentOp class in ClassyVision. Probability and + intensity are overwritten because they're determined by the learned + AutoAugment policy. + """ + + def __init__(self, policy_name="v0", magnitude_std=0, **kwargs): + hparams = kwargs + hparams.update(_HPARAMS_DEFAULT) + hparams["magnitude_std"] = magnitude_std + self.policy = auto_augment_policy(policy_name, hparams=hparams) + + def __call__(self, img): + sub_policy = random.choice(self.policy) + for op in sub_policy: + img = op(img) + return img + + +# TODO: Delete everything from here down in future update when calling via +# ClassyVision +# Everything from here down is copied directly from +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py +def _interpolation(kwargs): + interpolation = kwargs.pop("resample", Image.BILINEAR) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + + +def _check_args_tf(kwargs): + if "fillcolor" in kwargs and _PIL_VER < (5, 0): + kwargs.pop("fillcolor") + kwargs["resample"] = _interpolation(kwargs) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), **kwargs) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), **kwargs) + + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform( + -rotn_center[0] - post_trans[0], -rotn_center[1] - post_trans[1], matrix + ) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs["resample"]) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: 
+ return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, negate the value""" + return -v if random.random() > 0.5 else v + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = (level / _MAX_LEVEL) * 30.0 + level = _randomly_negate(level) + return (level,) + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return ((level / _MAX_LEVEL) * 1.8 + 0.1,) + + +def _enhance_increasing_level_to_arg(level, _hparams): + # the 'no change' level is 1.0, moving away from that towards 0. or 2.0 + # increases the enhancement blend range [0.1, 1.9] + level = (level / _MAX_LEVEL) * 0.9 + level = 1.0 + _randomly_negate(level) + return (level,) + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _MAX_LEVEL) * 0.3 + level = _randomly_negate(level) + return (level,) + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams["translate_const"] + level = (level / _MAX_LEVEL) * float(translate_const) + level = _randomly_negate(level) + return (level,) + + +def _translate_rel_level_to_arg(level, hparams): + # default range [-0.45, 0.45] + translate_pct = hparams.get("translate_pct", 0.45) + level = (level / _MAX_LEVEL) * translate_pct + level = _randomly_negate(level) + return (level,) + + +def _posterize_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + # intensity/severity of augmentation decreases with level + return (int((level / _MAX_LEVEL) * 4),) + + +def _posterize_increasing_level_to_arg(level, hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image', + # intensity/severity of augmentation increases with level + return (4 - _posterize_level_to_arg(level, hparams)[0],) + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + # intensity/severity of augmentation decreases with level + return (int((level / _MAX_LEVEL) * 4) + 4,) + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation decreases with level + return (int((level / _MAX_LEVEL) * 256),) + + +def _solarize_increasing_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation increases with level + return (256 - _solarize_level_to_arg(level, _hparams)[0],) + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return (int((level / _MAX_LEVEL) * 110),) + + +LEVEL_TO_ARG = { + "AutoContrast": None, + "Equalize": None, + "Invert": None, + "Rotate": _rotate_level_to_arg, + # There are several variations of the posterize level scaling in various + # Tensorflow/Google repositories/papers + "Posterize": _posterize_level_to_arg, + "PosterizeIncreasing": _posterize_increasing_level_to_arg, + "PosterizeOriginal": _posterize_original_level_to_arg, + "Solarize": _solarize_level_to_arg, + "SolarizeIncreasing": 
_solarize_increasing_level_to_arg, + "SolarizeAdd": _solarize_add_level_to_arg, + "Color": _enhance_level_to_arg, + "ColorIncreasing": _enhance_increasing_level_to_arg, + "Contrast": _enhance_level_to_arg, + "ContrastIncreasing": _enhance_increasing_level_to_arg, + "Brightness": _enhance_level_to_arg, + "BrightnessIncreasing": _enhance_increasing_level_to_arg, + "Sharpness": _enhance_level_to_arg, + "SharpnessIncreasing": _enhance_increasing_level_to_arg, + "ShearX": _shear_level_to_arg, + "ShearY": _shear_level_to_arg, + "TranslateX": _translate_abs_level_to_arg, + "TranslateY": _translate_abs_level_to_arg, + "TranslateXRel": _translate_rel_level_to_arg, + "TranslateYRel": _translate_rel_level_to_arg, +} + + +NAME_TO_OP = { + "AutoContrast": auto_contrast, + "Equalize": equalize, + "Invert": invert, + "Rotate": rotate, + "Posterize": posterize, + "PosterizeIncreasing": posterize, + "PosterizeOriginal": posterize, + "Solarize": solarize, + "SolarizeIncreasing": solarize, + "SolarizeAdd": solarize_add, + "Color": color, + "ColorIncreasing": color, + "Contrast": contrast, + "ContrastIncreasing": contrast, + "Brightness": brightness, + "BrightnessIncreasing": brightness, + "Sharpness": sharpness, + "SharpnessIncreasing": sharpness, + "ShearX": shear_x, + "ShearY": shear_y, + "TranslateX": translate_x_abs, + "TranslateY": translate_y_abs, + "TranslateXRel": translate_x_rel, + "TranslateYRel": translate_y_rel, +} + + +class AugmentOp: + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + self.aug_fn = NAME_TO_OP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = { + "fillcolor": hparams["img_mean"] if "img_mean" in hparams else _FILL, + "resample": hparams["interpolation"] + if "interpolation" in hparams + else _RANDOM_INTERPOLATION, + } + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + self.magnitude_std = self.hparams.get("magnitude_std", 0) + + def __call__(self, img): + if self.prob < 1.0 and random.random() > self.prob: + return img + magnitude = self.magnitude + if self.magnitude_std and self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range + level_args = ( + self.level_fn(magnitude, self.hparams) if self.level_fn is not None else () + ) + return self.aug_fn(img, *level_args, **self.kwargs) + + +def auto_augment_policy_v0(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference. 
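+ # Each sub-policy below is a list of (op_name, probability, magnitude)
+ # tuples on the 0-10 magnitude scale; AutoAugment picks one sub-policy at
+ # random per image and applies its ops in order via AugmentOp.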
+ policy = [ + [("Equalize", 0.8, 1), ("ShearY", 0.8, 4)], + [("Color", 0.4, 9), ("Equalize", 0.6, 3)], + [("Color", 0.4, 1), ("Rotate", 0.6, 8)], + [("Solarize", 0.8, 3), ("Equalize", 0.4, 7)], + [("Solarize", 0.4, 2), ("Solarize", 0.6, 2)], + [("Color", 0.2, 0), ("Equalize", 0.8, 8)], + [("Equalize", 0.4, 8), ("SolarizeAdd", 0.8, 3)], + [("ShearX", 0.2, 9), ("Rotate", 0.6, 8)], + [("Color", 0.6, 1), ("Equalize", 1.0, 2)], + [("Invert", 0.4, 9), ("Rotate", 0.6, 0)], + [("Equalize", 1.0, 9), ("ShearY", 0.6, 3)], + [("Color", 0.4, 7), ("Equalize", 0.6, 0)], + [("Posterize", 0.4, 6), ("AutoContrast", 0.4, 7)], + [("Solarize", 0.6, 8), ("Color", 0.6, 9)], + [("Solarize", 0.2, 4), ("Rotate", 0.8, 9)], + [("Rotate", 1.0, 7), ("TranslateYRel", 0.8, 9)], + [("ShearX", 0.0, 0), ("Solarize", 0.8, 4)], + [("ShearY", 0.8, 0), ("Color", 0.6, 4)], + [("Color", 1.0, 0), ("Rotate", 0.6, 2)], + [("Equalize", 0.8, 4), ("Equalize", 0.0, 8)], + [("Equalize", 1.0, 4), ("AutoContrast", 0.6, 2)], + [("ShearY", 0.4, 7), ("SolarizeAdd", 0.6, 7)], + [ + ("Posterize", 0.8, 2), + ("Solarize", 0.6, 10), + ], # This results in black image with Tpu posterize + [("Solarize", 0.6, 8), ("Equalize", 0.6, 1)], + [("Color", 0.8, 6), ("Rotate", 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_v0r(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, with variation of Posterize used + # in Google research implementation (number of bits discarded increases with magnitude) + policy = [ + [("Equalize", 0.8, 1), ("ShearY", 0.8, 4)], + [("Color", 0.4, 9), ("Equalize", 0.6, 3)], + [("Color", 0.4, 1), ("Rotate", 0.6, 8)], + [("Solarize", 0.8, 3), ("Equalize", 0.4, 7)], + [("Solarize", 0.4, 2), ("Solarize", 0.6, 2)], + [("Color", 0.2, 0), ("Equalize", 0.8, 8)], + [("Equalize", 0.4, 8), ("SolarizeAdd", 0.8, 3)], + [("ShearX", 0.2, 9), ("Rotate", 0.6, 8)], + [("Color", 0.6, 1), ("Equalize", 1.0, 2)], + [("Invert", 0.4, 9), ("Rotate", 0.6, 0)], + [("Equalize", 1.0, 9), ("ShearY", 0.6, 3)], + [("Color", 0.4, 7), ("Equalize", 0.6, 0)], + [("PosterizeIncreasing", 0.4, 6), ("AutoContrast", 0.4, 7)], + [("Solarize", 0.6, 8), ("Color", 0.6, 9)], + [("Solarize", 0.2, 4), ("Rotate", 0.8, 9)], + [("Rotate", 1.0, 7), ("TranslateYRel", 0.8, 9)], + [("ShearX", 0.0, 0), ("Solarize", 0.8, 4)], + [("ShearY", 0.8, 0), ("Color", 0.6, 4)], + [("Color", 1.0, 0), ("Rotate", 0.6, 2)], + [("Equalize", 0.8, 4), ("Equalize", 0.0, 8)], + [("Equalize", 1.0, 4), ("AutoContrast", 0.6, 2)], + [("ShearY", 0.4, 7), ("SolarizeAdd", 0.6, 7)], + [("PosterizeIncreasing", 0.8, 2), ("Solarize", 0.6, 10)], + [("Solarize", 0.6, 8), ("Equalize", 0.6, 1)], + [("Color", 0.8, 6), ("Rotate", 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_original(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 + policy = [ + [("PosterizeOriginal", 0.4, 8), ("Rotate", 0.6, 9)], + [("Solarize", 0.6, 5), ("AutoContrast", 0.6, 5)], + [("Equalize", 0.8, 8), ("Equalize", 0.6, 3)], + [("PosterizeOriginal", 0.6, 7), ("PosterizeOriginal", 0.6, 6)], + [("Equalize", 0.4, 7), ("Solarize", 0.2, 4)], + [("Equalize", 0.4, 4), ("Rotate", 0.8, 8)], + [("Solarize", 0.6, 3), ("Equalize", 0.6, 7)], + [("PosterizeOriginal", 0.8, 5), ("Equalize", 1.0, 2)], + [("Rotate", 0.2, 3), ("Solarize", 0.6, 8)], + [("Equalize", 0.6, 8), ("PosterizeOriginal", 0.4, 6)], + [("Rotate", 0.8, 8), ("Color", 0.4, 0)], + [("Rotate", 0.4, 9), ("Equalize", 0.6, 
2)], + [("Equalize", 0.0, 7), ("Equalize", 0.8, 8)], + [("Invert", 0.6, 4), ("Equalize", 1.0, 8)], + [("Color", 0.6, 4), ("Contrast", 1.0, 8)], + [("Rotate", 0.8, 8), ("Color", 1.0, 2)], + [("Color", 0.8, 8), ("Solarize", 0.8, 7)], + [("Sharpness", 0.4, 7), ("Invert", 0.6, 8)], + [("ShearX", 0.6, 5), ("Equalize", 1.0, 9)], + [("Color", 0.4, 0), ("Equalize", 0.6, 3)], + [("Equalize", 0.4, 7), ("Solarize", 0.2, 4)], + [("Solarize", 0.6, 5), ("AutoContrast", 0.6, 5)], + [("Invert", 0.6, 4), ("Equalize", 1.0, 8)], + [("Color", 0.6, 4), ("Contrast", 1.0, 8)], + [("Equalize", 0.8, 8), ("Equalize", 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_originalr(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation + policy = [ + [("PosterizeIncreasing", 0.4, 8), ("Rotate", 0.6, 9)], + [("Solarize", 0.6, 5), ("AutoContrast", 0.6, 5)], + [("Equalize", 0.8, 8), ("Equalize", 0.6, 3)], + [("PosterizeIncreasing", 0.6, 7), ("PosterizeIncreasing", 0.6, 6)], + [("Equalize", 0.4, 7), ("Solarize", 0.2, 4)], + [("Equalize", 0.4, 4), ("Rotate", 0.8, 8)], + [("Solarize", 0.6, 3), ("Equalize", 0.6, 7)], + [("PosterizeIncreasing", 0.8, 5), ("Equalize", 1.0, 2)], + [("Rotate", 0.2, 3), ("Solarize", 0.6, 8)], + [("Equalize", 0.6, 8), ("PosterizeIncreasing", 0.4, 6)], + [("Rotate", 0.8, 8), ("Color", 0.4, 0)], + [("Rotate", 0.4, 9), ("Equalize", 0.6, 2)], + [("Equalize", 0.0, 7), ("Equalize", 0.8, 8)], + [("Invert", 0.6, 4), ("Equalize", 1.0, 8)], + [("Color", 0.6, 4), ("Contrast", 1.0, 8)], + [("Rotate", 0.8, 8), ("Color", 1.0, 2)], + [("Color", 0.8, 8), ("Solarize", 0.8, 7)], + [("Sharpness", 0.4, 7), ("Invert", 0.6, 8)], + [("ShearX", 0.6, 5), ("Equalize", 1.0, 9)], + [("Color", 0.4, 0), ("Equalize", 0.6, 3)], + [("Equalize", 0.4, 7), ("Solarize", 0.2, 4)], + [("Solarize", 0.6, 5), ("AutoContrast", 0.6, 5)], + [("Invert", 0.6, 4), ("Equalize", 1.0, 8)], + [("Color", 0.6, 4), ("Contrast", 1.0, 8)], + [("Equalize", 0.8, 8), ("Equalize", 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy(name="v0", hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + if name == "original": + return auto_augment_policy_original(hparams) + elif name == "originalr": + return auto_augment_policy_originalr(hparams) + elif name == "v0": + return auto_augment_policy_v0(hparams) + elif name == "v0r": + return auto_augment_policy_v0r(hparams) + else: + assert AssertionError, "Unknown AA policy (%s)" % name + + +def auto_augment_transform(config_str, hparams): + """ + Create a AutoAugment transform + :param config_str: String defining configuration of auto augmentation. + Consists of multiple sections separated by dashes ('-'). The first + section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', + 'originalr'). 
+ The remaining sections, not order sepecific determine + 'mstd' - float std deviation of magnitude noise applied + Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5 + :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme + :return: A PyTorch compatible Transform + """ + config = config_str.split("-") + policy_name = config[0] + config = config[1:] + for c in config: + cs = re.split(r"(\d.*)", c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == "mstd": + # noise param injected via hparams for now + hparams.setdefault("magnitude_std", float(val)) + else: + assert AssertionError, "Unknown AutoAugment config section" + aa_policy = auto_augment_policy(policy_name, hparams=hparams) + return AutoAugment(aa_policy) + + +_RAND_TRANSFORMS = [ + "AutoContrast", + "Equalize", + "Invert", + "Rotate", + "Posterize", + "Solarize", + "SolarizeAdd", + "Color", + "Contrast", + "Brightness", + "Sharpness", + "ShearX", + "ShearY", + "TranslateXRel", + "TranslateYRel", +] + + +_RAND_INCREASING_TRANSFORMS = [ + "AutoContrast", + "Equalize", + "Invert", + "Rotate", + "PosterizeIncreasing", + "SolarizeIncreasing", + "SolarizeAdd", + "ColorIncreasing", + "ContrastIncreasing", + "BrightnessIncreasing", + "SharpnessIncreasing", + "ShearX", + "ShearY", + "TranslateXRel", + "TranslateYRel", +] + + +# These experimental weights are based loosely on the relative improvements mentioned in paper. +# They may not result in increased performance, but could likely be tuned to so. +_RAND_CHOICE_WEIGHTS_0 = { + "Rotate": 0.3, + "ShearX": 0.2, + "ShearY": 0.2, + "TranslateXRel": 0.1, + "TranslateYRel": 0.1, + "Color": 0.025, + "Sharpness": 0.025, + "AutoContrast": 0.025, + "Solarize": 0.005, + "SolarizeAdd": 0.005, + "Contrast": 0.005, + "Brightness": 0.005, + "Equalize": 0.005, + "Posterize": 0, + "Invert": 0, +} + + +def _select_rand_weights(weight_idx=0, transforms=None): + transforms = transforms or _RAND_TRANSFORMS + assert weight_idx == 0 # only one set of weights currently + rand_weights = _RAND_CHOICE_WEIGHTS_0 + probs = [rand_weights[k] for k in transforms] + probs /= np.sum(probs) + return probs + + +def rand_augment_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _RAND_TRANSFORMS + return [ + AugmentOp(name, prob=0.5, magnitude=magnitude, hparams=hparams) + for name in transforms + ] diff --git a/vissl/hooks/__init__.py b/vissl/hooks/__init__.py index be02d7c28..132f8f4ea 100644 --- a/vissl/hooks/__init__.py +++ b/vissl/hooks/__init__.py @@ -5,9 +5,10 @@ from classy_vision.hooks.classy_hook import ClassyHook from vissl.hooks.deepclusterv2_hooks import ClusterMemoryHook, InitMemoryHook # noqa +from vissl.hooks.grad_clip_hooks import GradClipHook # noqa from vissl.hooks.log_hooks import ( # noqa - LogGpuStatsHook, LogGpuMemoryHook, + LogGpuStatsHook, LogLossLrEtaHook, LogLossMetricsCheckpointHook, LogPerfTimeMetricsHook, @@ -109,6 +110,15 @@ def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]: assert is_tensorboard_available(), "Tensorboard must be installed to use it." 
tb_hook = get_tensorboard_hook(cfg) hooks.extend([tb_hook]) + if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP: + hooks.extend( + [ + GradClipHook( + norm_type=cfg.MODEL.GRAD_CLIP.NORM_TYPE, + max_norm=cfg.MODEL.GRAD_CLIP.MAX_NORM, + ) + ] + ) # hooks that are used irrespective of workflow type rolling_btime_freq = ( diff --git a/vissl/hooks/grad_clip_hooks.py b/vissl/hooks/grad_clip_hooks.py new file mode 100644 index 000000000..7ad5926e8 --- /dev/null +++ b/vissl/hooks/grad_clip_hooks.py @@ -0,0 +1,37 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from typing import Union + +import torch.nn.utils as utils +from classy_vision import tasks +from classy_vision.hooks.classy_hook import ClassyHook + + +class GradClipHook(ClassyHook): + """ + Hook executed on a backward pass that clips gradients such that their + norm does not exceed a specific value. Dosovitskiy et al. found it + to be critical for training vision transformers + (https://arxiv.org/abs/2010.11929), but subsequent studies have been less + clear about its importance. Gradient clipping configuration is set in + config.MODEL.GRAD_CLIP + """ + + on_start = ClassyHook._noop + on_phase_start = ClassyHook._noop + on_forward = ClassyHook._noop + on_loss_and_meter = ClassyHook._noop + on_update = ClassyHook._noop + on_step = ClassyHook._noop + on_phase_end = ClassyHook._noop + on_end = ClassyHook._noop + + def __init__(self, norm_type: Union[int, float, str], max_norm: Union[int, float]): + super().__init__() + self.norm_type = norm_type + self.max_norm = max_norm + + def on_backward(self, task: tasks.ClassyTask) -> None: + utils.clip_grad_norm_( + task.model.parameters(), max_norm=self.max_norm, norm_type=self.norm_type + ) diff --git a/vissl/hooks/log_hooks.py b/vissl/hooks/log_hooks.py index 835019539..8ac63adbd 100644 --- a/vissl/hooks/log_hooks.py +++ b/vissl/hooks/log_hooks.py @@ -34,10 +34,7 @@ class LogGpuMemoryHook(ClassyHook): on_phase_end = ClassyHook._noop on_end = ClassyHook._noop - def __init__( - self, - log_iteration_num: int = 1, - ) -> None: + def __init__(self, log_iteration_num: int = 1) -> None: super().__init__() self.log_iteration_num = log_iteration_num diff --git a/vissl/hooks/tensorboard_hook.py b/vissl/hooks/tensorboard_hook.py index f9dc7a5f1..825c60395 100644 --- a/vissl/hooks/tensorboard_hook.py +++ b/vissl/hooks/tensorboard_hook.py @@ -112,6 +112,19 @@ def on_phase_end(self, task: "tasks.ClassyTask") -> None: Log model parameters and/or parameter gradients as set by user in the tensorboard configuration. Also resents the CUDA memory counter. """ + # Log train/test accuracy + if is_primary(): + phase_type = "Training" if task.train else "Testing" + for meter in task.meters: + if "accuracy" in meter.name: + for top_n, accuracies in meter.value.items(): + for i, acc in accuracies.items(): + tag_name = f"{phase_type}/Accuracy_" f" {top_n}_Output_{i}" + self.tb_writer.add_scalar( + tag=tag_name, + scalar_value=round(acc, 5), + global_step=task.train_phase_idx, + ) if not (self.log_params or self.log_params_gradients): return diff --git a/vissl/losses/cross_entropy_multiple_output_single_target.py b/vissl/losses/cross_entropy_multiple_output_single_target.py index 0e931a18e..ef4607f75 100644 --- a/vissl/losses/cross_entropy_multiple_output_single_target.py +++ b/vissl/losses/cross_entropy_multiple_output_single_target.py @@ -1,21 +1,40 @@ # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging from typing import List, Union import torch +import torch.nn.functional as F from classy_vision.generic.util import is_on_gpu from classy_vision.losses import ClassyLoss, register_loss -from torch import nn +from torch import Tensor, nn from vissl.utils.hydra_config import AttrDict +class SmoothCrossEntropy(torch.nn.modules.CrossEntropyLoss): + """ + Cross entropy loss that can accommodate smoothed labels + """ + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + if len(target.shape) > 1: + log_probs = F.log_softmax(input, 1) + # TODO: Implement weight and ignore_index + return -torch.mean(torch.sum(log_probs * target, dim=1)) + else: + return F.cross_entropy( + input, target, weight=self.weight, ignore_index=self.ignore_index + ) + + @register_loss("cross_entropy_multiple_output_single_target") class CrossEntropyMultipleOutputSingleTargetLoss(ClassyLoss): """ Intializer for the sum cross-entropy loss. For a single tensor, this is equivalent to the cross-entropy loss. For a list of tensors, this computes the sum of the cross-entropy - losses for each tensor in the list against the target. + losses for each tensor in the list against the target. Can accommodate + target vectors, e.g. smoothed labels. Config params: weight: weight of sample, optional @@ -53,10 +72,15 @@ def from_config(cls, loss_config: AttrDict): def _create_loss_function(self): copy_to_gpu = is_on_gpu(self._losses) + logging.info( + "Instantiating " + "CrossEntropyMultipleOutputSingleTargetLoss, which" + "internally uses SmoothCrossEntropy loss to accommodate" + "label smoothing, but defaults to vanilla cross-entropy " + "if provided single-target labels." + ) self._losses.append( - torch.nn.modules.CrossEntropyLoss( - weight=self._weight, ignore_index=self._ignore_index - ) + SmoothCrossEntropy(weight=self._weight, ignore_index=self._ignore_index) ) if copy_to_gpu: self._losses.cuda() diff --git a/vissl/losses/moco_loss.py b/vissl/losses/moco_loss.py index 5a438b163..758e32cf5 100644 --- a/vissl/losses/moco_loss.py +++ b/vissl/losses/moco_loss.py @@ -88,9 +88,11 @@ def _dequeue_and_enqueue(self, key: torch.Tensor): # for simplicity, removes the case where the batch overlaps with the end # of the queue - assert ( - self.loss_config.queue_size % batch_size == 0 - ), "The queue size needs to be a multiple of the batch size" + assert self.loss_config.queue_size % batch_size == 0, ( + f"The queue size needs to be a multiple of the batch size. " + f"Effective batch size: {batch_size}. Queue size:" + f" {self.loss_config.queue_size}." 
+ ) # replace the keys at ptr (dequeue and enqueue) ptr = int(self.queue_ptr) diff --git a/vissl/meters/mean_ap_list_meter.py b/vissl/meters/mean_ap_list_meter.py index 624535a6b..3b87aeac7 100644 --- a/vissl/meters/mean_ap_list_meter.py +++ b/vissl/meters/mean_ap_list_meter.py @@ -60,10 +60,7 @@ def value(self): for ind, meter in enumerate(self._meters): meter_val = meter.value sample_count = meter._scores.shape[0] - val_dict[ind] = { - "val": meter_val, - "sample_count": sample_count, - } + val_dict[ind] = {"val": meter_val, "sample_count": sample_count} output_dict = {} output_dict["mAP"] = {} output_dict["AP"] = {} @@ -98,9 +95,7 @@ def get_classy_state(self): meter_states = {} for ind, meter in enumerate(self._meters): state = meter.get_classy_state() - meter_states[ind] = { - "state": state, - } + meter_states[ind] = {"state": state} return meter_states def set_classy_state(self, state): diff --git a/vissl/models/heads/vision_transformer_head.py b/vissl/models/heads/vision_transformer_head.py new file mode 100644 index 000000000..c27e26215 --- /dev/null +++ b/vissl/models/heads/vision_transformer_head.py @@ -0,0 +1,71 @@ +# (c) Facebook, Inc. and its affiliates. Confidential and proprietary. + +""" +Code modified from https://github.com/google-research/vision_transformer +as per https://arxiv.org/abs/2010.11929 +""" + +import copy +from collections import OrderedDict + +import torch.nn as nn +from vissl.models.heads import register_model_head +from vissl.models.model_helpers import lecun_normal_init, trunc_normal_ +from vissl.utils.hydra_config import AttrDict + + +@register_model_head("vision_transformer_head") +class VisionTransformerHead(nn.Module): + """ + Code modified from https://github.com/google-research/vision_transformer + and https://www.internalfb.com/D24714842, as per https://arxiv.org/abs/2010.11929 + + Authors use a 2-layer MLP for pretraining and a single linear layer for + fine-tuning. Thus a pre-training head would be called with something like + ["vision_transformer_head", {"in_plane": D, "hidden_dim": D, + "num_classes": K}], where D = hidden dimensionality and K = number of + classes. A fine-tuning head would be called ["vision_transformer_head", + {"in_plane", D, "num_classes": K]. Not passing "hidden_dim" will result + in a single linear layer. + + """ + + def __init__(self, model_config: AttrDict, in_plane, num_classes, hidden_dim=None): + super().__init__() + if hidden_dim is None: + layers = [("head", nn.Linear(in_plane, num_classes))] + else: + layers = [ + ("pre_logits", nn.Linear(in_plane, hidden_dim)), + ("act", nn.Tanh()), + ("head", nn.Linear(hidden_dim, num_classes)), + ] + self.layers = nn.Sequential(OrderedDict(layers)) + self.init_weights() + + def init_weights(self): + if hasattr(self.layers, "pre_logits"): + lecun_normal_init( + self.layers.pre_logits.weight, fan_in=self.layers.pre_logits.in_features + ) + nn.init.zeros_(self.layers.pre_logits.bias) + trunc_normal_(self.layers.head.weight, std=0.02) + nn.init.zeros_(self.layers.head.bias) + + @classmethod + def from_config(cls, config): + """ + config is config.MODEL.HEAD.PARAMS, which is a list of the form: + [ + ["vision_transformer_head", {"in_plane": _, "hidden_dim": _, "num_classes": _}] + ] + Where in_plane is the input dimensionality to the head, hidden_dim is + the hidden layer width (omit if no hidden layer is desired), + and num_classes is the output dimensionality. 
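Note: the head builds one of two layouts depending on whether hidden_dim is given: a single linear classifier for fine-tuning, or pre_logits + Tanh + head for pre-training. A standalone sketch of the same construction, without the VISSL registry plumbing; the dimensions are illustrative:

from collections import OrderedDict
import torch
import torch.nn as nn

def build_head(in_plane, num_classes, hidden_dim=None):
    # Mirrors VisionTransformerHead.__init__.
    if hidden_dim is None:  # fine-tuning: single linear classifier
        layers = [("head", nn.Linear(in_plane, num_classes))]
    else:  # pre-training: pre_logits -> tanh -> head
        layers = [
            ("pre_logits", nn.Linear(in_plane, hidden_dim)),
            ("act", nn.Tanh()),
            ("head", nn.Linear(hidden_dim, num_classes)),
        ]
    return nn.Sequential(OrderedDict(layers))

finetune_head = build_head(in_plane=768, num_classes=1000)
pretrain_head = build_head(in_plane=768, num_classes=1000, hidden_dim=768)
assert finetune_head(torch.randn(2, 768)).shape == (2, 1000)
assert pretrain_head(torch.randn(2, 768)).shape == (2, 1000)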
+ """ + config = copy.deepcopy(config) + config.pop("unique_id") + return cls(**config) + + def forward(self, x): + return self.layers(x) diff --git a/vissl/models/model_helpers.py b/vissl/models/model_helpers.py index 265ecadd5..bb4445a79 100644 --- a/vissl/models/model_helpers.py +++ b/vissl/models/model_helpers.py @@ -1,11 +1,15 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved import logging +import math +import warnings from enum import Enum from typing import Dict, List, Tuple import torch import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.utils import _ntuple from torch.utils.checkpoint import checkpoint from vissl.utils.activation_checkpointing import checkpoint_trunk from vissl.utils.misc import is_apex_available @@ -244,15 +248,23 @@ class RESNET_NORM_LAYER(str, Enum): BatchNorm = "BatchNorm" LayerNorm = "LayerNorm" + GroupNorm = "GroupNorm" -def _get_norm(layer_name): +def _get_norm(trunk_config): """ return the normalization layer to use in the model based on the layer name """ + layer_name = trunk_config.NORM + n_groups = trunk_config.GROUPNORM_GROUPS + + def group_norm(num_channels): + return nn.GroupNorm(num_groups=n_groups, num_channels=num_channels) + return { RESNET_NORM_LAYER.BatchNorm: nn.BatchNorm2d, RESNET_NORM_LAYER.LayerNorm: LayerNorm2d, + RESNET_NORM_LAYER.GroupNorm: group_norm, }[layer_name] @@ -398,3 +410,154 @@ def get_trunk_forward_outputs( output_feats.append(unique_out_feats[key_name]) return output_feats + + +def lecun_normal_init(tensor, fan_in): + trunc_normal_(tensor, std=math.sqrt(1 / fan_in)) + + +# Contains code from https://github.com/rwightman/pytorch-image-models +# and https://github.com/facebookresearch/deit/blob/main/models.py, modified by +# Matthew # Leavitt (ito@fb.com, matthew.l.leavitt@gmail.com) and Vedanuj +# Goswami (vedanuj@fb.com). +# trunc_normal_ and _no_grad_trunc_normal_ from: +# https://github.com/rwightman/pytorch-image-models/blob/678ba4e0a2c0b52c5e7b2ec0ba689399840282ee/timm/models/layers/weight_init.py # NOQA +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + r"""Supposedly should be available in PyTorch soon. Replace when available. + Fills the input Tensor with values drawn + from a truncated normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. 
" + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +# Contains code from https://github.com/rwightman/pytorch-image-models +# and https://github.com/facebookresearch/deit/blob/main/models.py, modified by +# Matthew # Leavitt (ito@fb.com, matthew.l.leavitt@gmail.com) and Vedanuj +# Goswami (vedanuj@fb.com). +# Standardized convolution (Conv2d with Weight Standardization), as used in +# the paper, Big Transfer (BiT): General Visual Representation Learning - +# https://arxiv.org/abs/1912.11370 +class StandardizedConv2d(nn.Conv2d): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + ): + super(StandardizedConv2d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + ) + + def forward(self, x): + weight = self.weight + weight_mean = ( + weight.mean(dim=1, keepdim=True) + .mean(dim=2, keepdim=True) + .mean(dim=3, keepdim=True) + ) + weight = weight - weight_mean + std = weight.view(weight.size(0), -1).std(dim=1).view(-1, 1, 1, 1) + 1e-5 + weight = weight / std.expand_as(weight) + return F.conv2d( + x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups + ) + + +# drop_path and DropPath modified from +# https://github.com/facebookresearch/deit/blob/main/models.py +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). 
+ """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + # work with diff dim tensors, not just 2D ConvNets + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path + of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple diff --git a/vissl/models/trunks/alexnet_colorization.py b/vissl/models/trunks/alexnet_colorization.py index 0738e33fe..40376a83b 100644 --- a/vissl/models/trunks/alexnet_colorization.py +++ b/vissl/models/trunks/alexnet_colorization.py @@ -2,10 +2,7 @@ import torch import torch.nn as nn -from vissl.models.model_helpers import ( - Flatten, - get_trunk_forward_outputs_module_list, -) +from vissl.models.model_helpers import Flatten, get_trunk_forward_outputs_module_list from vissl.models.trunks import register_model_trunk from vissl.utils.hydra_config import AttrDict @@ -87,9 +84,6 @@ def forward(self, x, out_feat_keys=None): # along the channel dimension into [L, AB] and keep only L channel. feat = torch.split(feat, [1, 2], dim=1)[0] out_feats = get_trunk_forward_outputs_module_list( - feat, - out_feat_keys, - self._feature_blocks, - self.all_feat_names, + feat, out_feat_keys, self._feature_blocks, self.all_feat_names ) return out_feats diff --git a/vissl/models/trunks/alexnet_deepcluster.py b/vissl/models/trunks/alexnet_deepcluster.py index b73e9b723..dff97ad18 100644 --- a/vissl/models/trunks/alexnet_deepcluster.py +++ b/vissl/models/trunks/alexnet_deepcluster.py @@ -101,9 +101,6 @@ def forward(self, x, out_feat_keys=None): # we first apply sobel filter feat = self.sobel(feat) out_feats = get_trunk_forward_outputs_module_list( - feat, - out_feat_keys, - self._feature_blocks, - self.all_feat_names, + feat, out_feat_keys, self._feature_blocks, self.all_feat_names ) return out_feats diff --git a/vissl/models/trunks/alexnet_jigsaw.py b/vissl/models/trunks/alexnet_jigsaw.py index 7c07fc614..ff329628b 100644 --- a/vissl/models/trunks/alexnet_jigsaw.py +++ b/vissl/models/trunks/alexnet_jigsaw.py @@ -80,9 +80,6 @@ def __init__(self, model_config: AttrDict, model_name: str): def forward(self, x, out_feat_keys=None): feat = x out_feats = get_trunk_forward_outputs_module_list( - feat, - out_feat_keys, - self._feature_blocks, - self.all_feat_names, + feat, out_feat_keys, self._feature_blocks, self.all_feat_names ) return out_feats diff --git a/vissl/models/trunks/alexnet_rotnet.py b/vissl/models/trunks/alexnet_rotnet.py index 48b644cf1..62a9daf3d 100644 --- a/vissl/models/trunks/alexnet_rotnet.py +++ b/vissl/models/trunks/alexnet_rotnet.py @@ -63,9 +63,6 @@ def __init__(self, model_config: AttrDict, model_name: str): def forward(self, x, out_feat_keys=None): feat = x out_feats = get_trunk_forward_outputs_module_list( - feat, - out_feat_keys, - self._feature_blocks, - self.all_feat_names, + feat, out_feat_keys, self._feature_blocks, self.all_feat_names ) return out_feats diff --git a/vissl/models/trunks/resnext.py b/vissl/models/trunks/resnext.py index 60e0ccb73..1f260c4ee 100644 --- a/vissl/models/trunks/resnext.py +++ 
b/vissl/models/trunks/resnext.py @@ -67,7 +67,7 @@ def __init__(self, model_config: AttrDict, model_name: str): self.trunk_config = self.model_config.TRUNK.TRUNK_PARAMS.RESNETS self.depth = SUPPORTED_DEPTHS(self.trunk_config.DEPTH) self.width_multiplier = self.trunk_config.WIDTH_MULTIPLIER - self._norm_layer = _get_norm(self.trunk_config.NORM) + self._norm_layer = _get_norm(self.trunk_config) self.groups = self.trunk_config.GROUPS self.zero_init_residual = self.trunk_config.ZERO_INIT_RESIDUAL self.width_per_group = self.trunk_config.WIDTH_PER_GROUP diff --git a/vissl/models/trunks/vision_transformer.py b/vissl/models/trunks/vision_transformer.py new file mode 100644 index 000000000..d869ddaba --- /dev/null +++ b/vissl/models/trunks/vision_transformer.py @@ -0,0 +1,306 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +""" +Code modified from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py # NOQA +and https://github.com/facebookresearch/deit/blob/main/models.py by Matthew +Leavitt (ito@fb.com, matthew.l.leavitt@gmail.com) and Vedanuj Goswami +(vedanuj@fb.com). +""" + +import copy +import logging +import math +from typing import List + +import torch +import torch.nn as nn +from vissl.models.model_helpers import DropPath, to_2tuple, trunc_normal_ +from vissl.models.trunks import register_model_trunk +from vissl.utils.hydra_config import AttrDict + + +class Mlp(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, + # can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = ( + qkv[0], + qkv[1], + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + if drop_path > 0.0: + self.drop_path = DropPath(drop_path) + else: + self.drop_path = nn.Identity() + 
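Note: a shape walk-through of the Attention module above: qkv comes from one linear layer, is reshaped to (3, B, heads, N, head_dim), and scaled dot-product attention runs per head. The sizes below correspond to a ViT-B/16 at 224x224 (196 patch tokens plus one class token) and are illustrative:

import torch
import torch.nn as nn

B, N, C, num_heads = 2, 197, 768, 12
head_dim = C // num_heads
scale = head_dim ** -0.5

qkv_layer = nn.Linear(C, C * 3, bias=False)
x = torch.randn(B, N, C)

qkv = qkv_layer(x).reshape(B, N, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]          # each (B, num_heads, N, head_dim)
attn = ((q @ k.transpose(-2, -1)) * scale).softmax(dim=-1)  # (B, num_heads, N, N)
out = (attn @ v).transpose(1, 2).reshape(B, N, C)
assert out.shape == (B, N, C)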
self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding""" + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) + + def forward(self, x): + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +@register_model_trunk("vision_transformer") +class VisionTransformer(nn.Module): + """ + Vision transformer. Adding stochastic depth makes it a DeiT. + """ + + def __init__(self, model_config: AttrDict, model_name: str): + super().__init__() + + assert model_config.INPUT_TYPE in ["rgb", "bgr"], "Input type not supported" + trunk_config = copy.deepcopy( + model_config.TRUNK.TRUNK_PARAMS.VISION_TRANSFORMERS + ) + + logging.info("Building model: Vision Transformer from yaml config") + # Hacky workaround + trunk_config = AttrDict({k.lower(): v for k, v in trunk_config.items()}) + + img_size = trunk_config.image_size + patch_size = trunk_config.patch_size + in_chans = 3 + embed_dim = trunk_config.hidden_dim + depth = trunk_config.num_layers + num_heads = trunk_config.num_heads + mlp_ratio = 4.0 + qkv_bias = trunk_config.qkv_bias + qk_scale = trunk_config.qk_scale + drop_rate = trunk_config.dropout_rate + attn_drop_rate = trunk_config.attention_dropout_rate + drop_path_rate = trunk_config.drop_path_rate + hybrid_backbone_string = None + # TODO Implement hybrid backbones + if "HYBRID" in trunk_config.keys(): + hybrid_backbone_string = trunk_config.HYBRID + norm_layer = nn.LayerNorm + + self.num_features = ( + self.embed_dim + ) = embed_dim # num_features for consistency with other models + + # TODO : Enable Hybrid Backbones + if hybrid_backbone_string: + self.patch_embed = globals()[hybrid_backbone_string]( + out_dim=embed_dim, img_size=img_size + ) + # if hybrid_backbone is not None: + # self.patch_embed = HybridEmbed( + # hybrid_backbone, + # img_size=img_size, + # in_chans=in_chans, + # embed_dim=embed_dim, + # ) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + num_patches = self.patch_embed.num_patches + + self.class_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embedding = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.ModuleList( + [ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + self.norm = norm_layer(embed_dim) + + # NOTE as per official impl, we could have a pre-logits + # representation dense layer + tanh here + # self.repr = nn.Linear(embed_dim, representation_size) + # self.repr_act = nn.Tanh() + + 
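Note: bookkeeping for the trunk constructor above, using a ViT-B/16 geometry at 384x384 as an example: PatchEmbed yields a 24x24 grid of tokens, the position embedding reserves one extra slot for the class token, and the stochastic depth rates grow linearly with block index. The drop_path_rate value is illustrative:

import torch

image_size, patch_size, hidden_dim, depth = 384, 16, 768, 12
num_patches = (image_size // patch_size) ** 2
assert num_patches == 576

pos_embedding = torch.zeros(1, num_patches + 1, hidden_dim)
assert pos_embedding.shape == (1, 577, 768)  # 576 patch tokens + 1 class token

drop_path_rate = 0.1  # illustrative value
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
assert len(dpr) == depth and dpr[0] == 0.0 and abs(dpr[-1] - drop_path_rate) < 1e-6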
trunc_normal_(self.pos_embedding, std=0.02) + trunc_normal_(self.class_token, std=0.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {"pos_embedding", "class_token"} + + def forward_features(self, x): + B = x.shape[0] + x = self.patch_embed(x) + + class_tokens = self.class_token.expand( + B, -1, -1 + ) # stole class_tokens impl from Phil Wang, thanks + x = torch.cat((class_tokens, x), dim=1) + pos_embed = self.interpolate_pos_encoding(x, self.pos_embedding) + x = x + pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0] + + def forward( + self, x: torch.Tensor, out_feat_keys: List[str] = None + ) -> List[torch.Tensor]: + x = self.forward_features(x) + x = x.unsqueeze(0) + return x + + def interpolate_pos_encoding(self, x, pos_embed): + npatch = x.shape[1] - 1 + N = pos_embed.shape[1] - 1 + if npatch == N: + return pos_embed + class_emb = pos_embed[:, 0] + pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + pos_embed = nn.functional.interpolate( + pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute( + 0, 3, 1, 2 + ), + scale_factor=math.sqrt(npatch / N), + mode="bicubic", + ) + pos_embed = pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_emb.unsqueeze(0), pos_embed), dim=1) diff --git a/vissl/optimizers/optimizer_helper.py b/vissl/optimizers/optimizer_helper.py index 3d8aa708e..732d375e4 100644 --- a/vissl/optimizers/optimizer_helper.py +++ b/vissl/optimizers/optimizer_helper.py @@ -52,6 +52,53 @@ def _filter_trainable(param_list: List[Any]) -> List[Any]: return list(filter(lambda x: x.requires_grad, param_list)) +def _assign_regularized_params( + regularized_param_list=None, + unregularized_param_list=None, + parameters_to_unregularize=None, +): + """ + Takes a list parameters_to_unregularize (a list of parameters to ensure are + not regularized) and compares it to regularized_param_list, a list of + regularized parameters. Any parameters in parameters_to_unregularize that + are present in regularized_param_list are removed from + regularized_param_list. Will also check against an optional + unregularized_param_list (pre-existing list of parameters not to regularize) + and remove any items from parameters_to_unregularize that are in + unregularized_param_list. Used for when we have parameters that we don't + want to regularize (e.g. the class token and position embeddings for the + vision transformer). See config.OPTIMIZER.non_regularized_params. Needs + to be called separately for head, trunk, and remaining params. 
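Note: interpolate_pos_encoding keeps the class-token slot and resizes the patch-grid embeddings with bicubic interpolation whenever the token count changes between pre-training and fine-tuning. A standalone sketch of the same reshaping, using an explicit target size instead of a scale factor; the grid sizes are illustrative (e.g. 224/16 to 384/16 patches per side):

import torch
import torch.nn as nn

dim, old_grid, new_grid = 768, 14, 24
pos_embed = torch.randn(1, old_grid * old_grid + 1, dim)

class_emb, patch_pos = pos_embed[:, :1], pos_embed[:, 1:]
patch_pos = nn.functional.interpolate(
    patch_pos.reshape(1, old_grid, old_grid, dim).permute(0, 3, 1, 2),
    size=(new_grid, new_grid),
    mode="bicubic",
    align_corners=False,
)
patch_pos = patch_pos.permute(0, 2, 3, 1).reshape(1, -1, dim)
new_pos_embed = torch.cat((class_emb, patch_pos), dim=1)
assert new_pos_embed.shape == (1, new_grid * new_grid + 1, dim)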
+ """ + indices_to_remove_from_regularized = [] + indices_to_remove_from_new_unregularized = [] + # Iterate through new parameters to unregularize + for unreg_param_ind, new_unreg_param in enumerate(parameters_to_unregularize): + # Iterate through list of regularized parameters + for reg_param_ind, reg_param in enumerate(regularized_param_list): + # Note any matchess + if reg_param is new_unreg_param: + indices_to_remove_from_regularized.append(reg_param_ind) + if unregularized_param_list: + # Iterate through pre-existing list of unregularized parameters + for unreg_param in unregularized_param_list: + # Note any matches + if unreg_param is new_unreg_param: + indices_to_remove_from_new_unregularized.append(unreg_param_ind) + indices_to_remove_from_regularized.sort(reverse=True) + # Iterate through indices to remove from list regularized params and + # remove them + for i in indices_to_remove_from_regularized: + del regularized_param_list[i] + if unregularized_param_list: + indices_to_remove_from_new_unregularized.sort(reverse=True) + # Iterate through indices to remove from new list of unregularized + # parameters + for i in indices_to_remove_from_new_unregularized: + del parameters_to_unregularize[i] + return parameters_to_unregularize, regularized_param_list, unregularized_param_list + + def get_optimizer_param_groups( model, model_config, optimizer_config, optimizer_schedulers ): @@ -91,6 +138,7 @@ def get_optimizer_param_groups( head_regularized_params, head_unregularized_params = [], [] # for anything else regularized_params = [] + unregularized_params = [] for name, module in model.named_modules(): # head, Linear/Conv layer if "head" in name and ( @@ -140,6 +188,41 @@ def get_optimizer_param_groups( for params in module.parameters(recurse=False): regularized_params.append(params) + # Collect user-specified non-regularized params and remove them for the + # lists of regularized params, and check they're not already on the lists + # of unregularized params + if optimizer_config.non_regularized_parameters: + non_reg_param_names = optimizer_config.non_regularized_parameters + for name, param in model.named_parameters(): + hits = [p for p in non_reg_param_names if p in name] + if any(hits): + unregularized_params.append(param) + # Call for trunk params + ( + non_reg_params, + trunk_regularized_params, + trunk_unregularized_params, + ) = _assign_regularized_params( + parameters_to_unregularize=unregularized_params, + regularized_param_list=trunk_regularized_params, + unregularized_param_list=trunk_unregularized_params, + ) + # Call for head params + ( + non_reg_params, + head_regularized_params, + head_unregularized_params, + ) = _assign_regularized_params( + parameters_to_unregularize=unregularized_params, + regularized_param_list=head_regularized_params, + unregularized_param_list=head_unregularized_params, + ) + # Call for remaining params + non_reg_params, regularized_params, _ = _assign_regularized_params( + parameters_to_unregularize=unregularized_params, + regularized_param_list=regularized_params, + ) + # for non-trainable params, set the requires_grad to False non_trainable_params = [] for name, param in model.named_parameters(): @@ -160,7 +243,8 @@ def get_optimizer_param_groups( f"Trunk Unregularized Parameters {len(trunk_unregularized_params)}, \n" f"Head Regularized Parameters: {len(head_regularized_params)}, \n" f"Head Unregularized Parameters: {len(head_unregularized_params)} \n" - f"Remaining Regularized Parameters: {len(regularized_params)} " + f"Remaining Regularized 
Parameters: {len(regularized_params)} \n" + f"Remaining Unregularized Parameters: {len(unregularized_params)}" ) param_groups = [ @@ -189,5 +273,13 @@ def get_optimizer_param_groups( param_groups.append( {"params": regularized_params, "lr": optimizer_schedulers["lr"]} ) + if len(unregularized_params) > 0: + param_groups.append( + { + "params": unregularized_params, + "lr": optimizer_schedulers["lr"], + "weight_decay": 0.0, + } + ) return param_groups diff --git a/vissl/trainer/train_task.py b/vissl/trainer/train_task.py index c53a6f281..5a7eaad2a 100644 --- a/vissl/trainer/train_task.py +++ b/vissl/trainer/train_task.py @@ -4,10 +4,7 @@ import logging import torch -from classy_vision.generic.util import ( - copy_model_to_gpu, - load_and_broadcast_checkpoint, -) +from classy_vision.generic.util import copy_model_to_gpu, load_and_broadcast_checkpoint from classy_vision.losses import build_loss from classy_vision.meters import build_meter from classy_vision.optim import build_optimizer, build_optimizer_schedulers diff --git a/vissl/utils/activation_checkpointing.py b/vissl/utils/activation_checkpointing.py index 0ee636f95..d512418f5 100644 --- a/vissl/utils/activation_checkpointing.py +++ b/vissl/utils/activation_checkpointing.py @@ -131,12 +131,7 @@ def checkpoint_trunk( feature_blocks_bucketed = ( feature_blocks_bucketed[:i_max] - + [ - [ - f"activation_split_{split_times}", - biggest_block[1][:n_split_layers], - ] - ] + + [[f"activation_split_{split_times}", biggest_block[1][:n_split_layers]]] + [[biggest_block[0], biggest_block[1][n_split_layers:]]] + feature_blocks_bucketed[(i_max + 1) :] ) diff --git a/vissl/utils/checkpoint.py b/vissl/utils/checkpoint.py index f2f17ab1d..8427d1bc9 100644 --- a/vissl/utils/checkpoint.py +++ b/vissl/utils/checkpoint.py @@ -445,6 +445,13 @@ def init_model_from_weights( and config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_TRUNK_AND_HEAD ) ): + # Accommodate changing position embeddings. Fine-tuning at a + # different resolution than that which a model was pretrained + # at requires interpolating the learned position embeddings. + if "pos_embedding" in layername: + param = interpolate_position_embeddings( + model, all_layers[layername], param + ) assert all_layers[layername].shape == param.shape, ( f"{layername} have different shapes: " f"checkpoint: {param.shape}, model: {all_layers[layername].shape}" @@ -472,3 +479,21 @@ def init_model_from_weights( ####################### DEBUG ############################ # print_state_dict_shapes(model.state_dict()) return model + + +def interpolate_position_embeddings(model, layer, param): + """ + Fine-tuning at a different resolution than that which a model was + pretrained at requires interpolating the learned position embeddings. 
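Note: init_model_from_weights only rewrites a checkpoint's position embeddings when the trunk exposes a callable interpolate_position_embedding and the shapes actually disagree. A toy sketch of that guard; ToyTrunk and its linear resizing are stand-ins for the real trunk and its bicubic patch-grid interpolation:

import torch
import torch.nn as nn

class ToyTrunk(nn.Module):
    def __init__(self, num_tokens, dim=8):
        super().__init__()
        self.pos_embedding = nn.Parameter(torch.zeros(1, num_tokens, dim))

    def interpolate_position_embedding(self, param):
        # Stand-in for the real patch-grid bicubic interpolation.
        return nn.functional.interpolate(
            param.permute(0, 2, 1),
            size=self.pos_embedding.shape[1],
            mode="linear",
            align_corners=False,
        ).permute(0, 2, 1)

trunk = ToyTrunk(num_tokens=577)            # e.g. fine-tuning at 384x384
checkpoint_param = torch.zeros(1, 197, 8)   # e.g. pre-trained at 224x224

interp = getattr(trunk, "interpolate_position_embedding", None)
if callable(interp) and trunk.pos_embedding.shape != checkpoint_param.shape:
    checkpoint_param = interp(checkpoint_param)
assert checkpoint_param.shape == trunk.pos_embedding.shape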
+ """ + if ( + hasattr(model.trunk, "interpolate_position_embedding") + and layer.shape != param.shape + ): + interp = model.trunk.interpolate_position_embedding + if callable(interp): + try: + param = interp(param) + except BaseException: + raise RuntimeError("Unable to interpolate position embeddings") + return param diff --git a/vissl/utils/hydra_config.py b/vissl/utils/hydra_config.py index 06e3e1e52..743702603 100644 --- a/vissl/utils/hydra_config.py +++ b/vissl/utils/hydra_config.py @@ -381,6 +381,7 @@ def infer_losses_config(cfg): assert cfg.DATA.TRAIN.COLLATE_FUNCTION in [ "multicrop_collator", "multicrop_mixup_collator", + "cutmixup_collator", ], ( "for swav loss, use either a collator from " "[multicrop_collator, multicrop_mixup_collator]" @@ -480,10 +481,13 @@ def assert_hydra_conf(cfg): # in SSL, during pre-training we don't want to use annotated labels or during feature # extraction, we don't have annotated labels for some datasets. In such cases, we set - # the label type to be just the image index in the dataset. - if len(cfg.DATA.TRAIN.LABEL_SOURCES) == 0: + # the label type to be just the image index in the dataset, unless the + # user has specifically provided "zero" as the label type, which is + # necessary when the CutMixUp collator is being used for self-supervised + # training. + if len(cfg.DATA.TRAIN.LABEL_SOURCES) == 0 and cfg.DATA.TRAIN.LABEL_TYPE != "zero": cfg.DATA.TRAIN.LABEL_TYPE = "sample_index" - if len(cfg.DATA.TEST.LABEL_SOURCES) == 0: + if len(cfg.DATA.TEST.LABEL_SOURCES) == 0 and cfg.DATA.TEST.LABEL_TYPE != "zero": cfg.DATA.TEST.LABEL_TYPE = "sample_index" # if the user has specified the model initialization from a params_file, we check if