diff --git a/bash_files/pretrain/cifar/dino.sh b/bash_files/pretrain/cifar/dino.sh
index 3f8668c..47f40cf 100644
--- a/bash_files/pretrain/cifar/dino.sh
+++ b/bash_files/pretrain/cifar/dino.sh
@@ -1,4 +1,4 @@
-python3 ../../main_pretrain.py \
+python3 ../../../main_pretrain.py \
     --dataset $1 \
     --backbone resnet18 \
     --data_dir ./datasets \
@@ -37,4 +37,5 @@ python3 ../../main_pretrain.py \
     --num_prototypes 4096 \
    --base_tau_momentum 0.9995 \
    --final_tau_momentum 1.0 \
-    --momentum_classifier
+    --momentum_classifier \
+    $2
\ No newline at end of file
diff --git a/bash_files/pretrain/cifar/exe.py b/bash_files/pretrain/cifar/exe.py
index f67b6d7..1bc1697 100644
--- a/bash_files/pretrain/cifar/exe.py
+++ b/bash_files/pretrain/cifar/exe.py
@@ -1,3 +1,10 @@
+import os
+import time
+
+rate = '0.60'
+dataset = 'cifar10'
+# poison_method = 'zoo-simclr'
+poison_method = 'clb'
 
 def sweep_poison_rate(args):
     i = 0
@@ -18,24 +25,78 @@ def sweep_poison_rate(args):
         if i >= len(args.gpus):
             return
 
-def sweep_trigger(args):
+def sweep_pretrain_method(args):
+    i = 0
+    # for dataset in ['cifar10', 'cifar100']:
+    # for method in 'dino'.split(' '):
+    for method in ['sup', 'supcon', 'simclr', 'mocov2plus', 'byol', 'simsiam', 'swav', 'dino', 'barlow']:
+        # for rate in '0.10 0.20 0.30 0.40 0.50 0.60 0.70 0.80 0.90 1.00'.split(' '):
+        gpu = args.gpus[i]
+        print(rate)
+
+        os.system(f"""
+        for file in /data/yfwang/solo-learn/poison_datasets/{dataset}/{poison_method}/gaussian_noise/{dataset}_{poison_method}_rate_{rate}_*.pt
+        do
+            # echo ${{file}}, {method}
+            # CUDA_VISIBLE_DEVICES={gpu} sh {method}.sh {dataset} " --poison_data ${{file}} --use_poison --checkpoint_dir /data/yfwang/solo-learn/pretrain/{dataset} " &
+        done
+        """
+        )
+        i += 1
+        if i >= len(args.gpus):
+            return
+        time.sleep(1)
+
+
+def sweep_eval(args):
     i = 0
     for dataset in ['cifar10', 'cifar100']:
-        for rate in '0.10 0.20 0.30 0.40 0.50 0.60 0.70 0.80 0.90 1.00'.split(' '):
-            gpu = args.gpus[i]
-            print(rate)
-            os.system(f"""
-            for file in /data/yfwang/solo-learn/poison_datasets/{dataset}/zoo-simclr/gaussian_noise/{dataset}_zoo-simclr_rate_{rate}_*.pt
-            do
-                # echo ${{file}}
-                CUDA_VISIBLE_DEVICES={gpu} sh simclr.sh {dataset} " --poison_data ${{file}} --use_poison --checkpoint_dir /data/yfwang/solo-learn/pretrain/{dataset} " &
-            done
-            """
-            )
-            i += 1
-            if i >= len(args.gpus):
-                return
+        # for method in 'dino'.split(' '):
+        for method in ['sup', 'simclr']:
+            for poison_method in ['zoo-simclr', 'clb']:
+                # for apply_method in ['use_poison', 'eval_poison']:
+                for apply_method in ['eval_poison']:
+                    # for rate in '0.10 0.20 0.30 0.40 0.50 0.60 0.70 0.80 0.90 1.00'.split(' '):
+                    gpu = args.gpus[i]
+                    print(rate)
+
+                    os.system(f"""
+                    for file in /data/yfwang/solo-learn/poison_datasets/{dataset}/{poison_method}/gaussian_noise/{dataset}_{poison_method}_rate_{rate}_*.pt
+                    do
+                        # echo ${{file}}, {method}
+                        CUDA_VISIBLE_DEVICES={gpu} sh {method}.sh {dataset} " --poison_data ${{file}} --{apply_method} --checkpoint_dir /data/yfwang/solo-learn/pretrain/{dataset} " &
+                    done
+                    """
+                    )
+                    i += 1
+                    if i >= len(args.gpus):
+                        return
+                    time.sleep(1)
+
+def sweep_cifar100(args):
+    i = 0
+    for dataset in ['cifar100']:
+        # for method in 'dino'.split(' '):
+        for method in ['sup', 'simclr']:
+            for poison_method in ['zoo-simclr', 'clb']:
+                # for apply_method in ['use_poison', 'eval_poison']:
+                for apply_method in ['use_poison']:
+                    # for rate in '0.10 0.20 0.30 0.40 0.50 0.60 0.70 0.80 0.90 1.00'.split(' '):
+                    gpu = args.gpus[i]
+                    print(rate)
+                    os.system(f"""
+                    for file in /data/yfwang/solo-learn/poison_datasets/{dataset}/{poison_method}/gaussian_noise/{dataset}_{poison_method}_rate_{rate}_*.pt
+                    do
+                        # echo ${{file}}, {method}
+                        CUDA_VISIBLE_DEVICES={gpu} sh {method}.sh {dataset} " --poison_data ${{file}} --{apply_method} --checkpoint_dir /data/yfwang/solo-learn/pretrain/{dataset} " &
+                    done
+                    """
+                    )
+                    i += 1
+                    if i >= len(args.gpus):
+                        return
+                    time.sleep(1)
 
 if __name__ == "__main__":
     import argparse
@@ -46,4 +107,6 @@ def sweep_trigger(args):
     parser.add_argument('gpus', type=int, nargs="+", help="")
     args = parser.parse_args()
 
-    sweep_poison_rate(args)
\ No newline at end of file
+    # sweep_pretrain_method(args)
+    # sweep_eval(args)
+    sweep_cifar100(args)
\ No newline at end of file
diff --git a/bash_files/pretrain/cifar/sup.sh b/bash_files/pretrain/cifar/sup.sh
new file mode 100644
index 0000000..242d19d
--- /dev/null
+++ b/bash_files/pretrain/cifar/sup.sh
@@ -0,0 +1,30 @@
+python3 ../../../main_pretrain.py \
+    --dataset $1 \
+    --backbone resnet18 \
+    --data_dir ./datasets \
+    --max_epochs 1000 \
+    --gpus 0 \
+    --accelerator gpu \
+    --precision 16 \
+    --optimizer sgd \
+    --scheduler warmup_cosine \
+    --lr 0.1 \
+    --classifier_lr 0.1 \
+    --weight_decay 1e-5 \
+    --batch_size 256 \
+    --num_workers 4 \
+    --crop_size 32 \
+    --brightness 0.0 \
+    --contrast 0.0 \
+    --saturation 0.0 \
+    --hue 0.0 \
+    --gaussian_prob 0.0 0.0 \
+    --num_crops_per_aug 1 1 \
+    --zero_init_residual \
+    --name sup-$1 \
+    --project solo-learn \
+    --entity doxawang \
+    --save_checkpoint \
+    --method sup \
+    $2 \
+    --wandb
\ No newline at end of file
diff --git a/main_poison.py b/main_poison.py
index c6f0180..3ea3194 100644
--- a/main_poison.py
+++ b/main_poison.py
@@ -42,8 +42,7 @@ from solo.utils.classification_dataloader import prepare_data_no_aug
 from poisoning_utils import *
 
 
-def main():
-    args = parse_args_linear()
+def main_lfb(args):
     assert args.backbone in BaseMethod._SUPPORTED_BACKBONES
 
     backbone_model = {
@@ -119,14 +118,16 @@
     # subset_indices = np.random.choice(len(train_features), 100*args.num_classes, replace=False)
     # plot_tsne(train_features.cpu()[subset_indices], train_labels[subset_indices], args.num_classes)
 
-    # step 1: get anchor
     num_poisons = int(args.poison_rate * len(train_features) / args.num_classes)
+
+    # step 1: get anchor
     if args.target_class is None:
         anchor_idx = untargeted_anchor_selection(train_features, num_poisons)
     else:
-        all_index = torch.arange(len(train_features))
-        anchor_idx = all_index[train_labels == args.target_class][args.target_index]
+        anchor_idx = targeted_anchor_selection(train_features, train_labels, args.target_class, num_poisons)
+        # all_index = torch.arange(len(train_features))
+        # anchor_idx = all_index[train_labels == args.target_class][args.target_index]
 
     anchor_feature = train_features[anchor_idx]
     anchor_label = train_labels[anchor_idx]
 
@@ -135,6 +136,7 @@
     # step 2: get poisoning subset by selecting KNN (including anchor itself)
     poisoning_index = get_poisoning_indices(anchor_feature, train_features, num_poisons)
     poisoning_index = poisoning_index.cpu()
+
     # step 3: injecting triggers to the subset
     pattern, mask = generate_trigger(trigger_type=args.trigger_type)
     poison_images = add_trigger(train_images, pattern, mask, poisoning_index, args.trigger_alpha)
@@ -146,18 +148,6 @@
     print('ratio of same-class (class {%d}) samples: %.4f ' % (
         anchor_label, acc))
 
-
-    args.poison_data_name = "%s_%s_rate_%.2f_target_%s_trigger_%s_alpha_%.2f_class_%d_acc_%.4f" % (
-        args.dataset,
-        args.pretrain_method,
-        args.poison_rate,
-        args.target_class,
-        args.trigger_type,
-        args.trigger_alpha,
-        anchor_label,
-        acc)
-
-
     poisoning_data = {
         'clean_data': train_images,
         'poison_data': poison_images,
         'targets': train_labels,
@@ -167,18 +157,58 @@
         'anchor_label': anchor_label,
         'pattern': pattern,
         'mask': mask,
-        'args': args,
         'acc': acc,
     }
 
-    args.save_dir = os.path.join(args.save_dir, args.dataset, args.pretrain_method, args.trigger_type)
+    return poisoning_data
 
-    os.makedirs(args.save_dir, exist_ok=True)
-    file_name = os.path.join(args.save_dir, args.poison_data_name + '.pt')
-    print('saving to %s' % file_name)
 
+def main_clb(args):
+
+    train_loader, _, train_dataset, _ = prepare_data_no_aug(
+        args.dataset,
+        data_dir=args.data_dir,
+        train_dir=args.train_dir,
+        val_dir=args.val_dir,
+        batch_size=args.batch_size,
+        num_workers=args.num_workers,
+    )
+
+    train_images, train_labels = train_dataset.data, np.array(train_dataset.targets)
+    num_poisons = int(args.poison_rate * len(train_images) / args.num_classes)
+
+    assert args.target_class is not None
+    poisoning_index = torch.arange(len(train_images))[train_labels == args.target_class]
+    shuffle_idx = torch.randperm(len(poisoning_index))
+    poisoning_index = poisoning_index[shuffle_idx]
+    poisoning_index = poisoning_index[:num_poisons].cpu()
+
+    anchor_label = args.target_class
+
+    # step 3: injecting triggers to the subset
+    pattern, mask = generate_trigger(trigger_type=args.trigger_type)
+    poison_images = add_trigger(train_images, pattern, mask, poisoning_index, args.trigger_alpha)
+
+    poisoning_labels = np.array(train_labels)[poisoning_index]
 
-    torch.save(poisoning_data, file_name)
+    acc = (poisoning_labels == anchor_label).astype(float).mean()
+
+    print('ratio of same-class (class {%d}) samples: %.4f ' % (
+        anchor_label, acc))
+
+    poisoning_data = {
+        'clean_data': train_images,
+        'poison_data': poison_images,
+        'targets': train_labels,
+        'poisoning_index': poisoning_index,
+        'anchor_data': None,
+        'anchor_label': anchor_label,
+        'pattern': pattern,
+        'mask': mask,
+        'acc': acc,
+    }
+
+    return poisoning_data
 
 
 def test(model, data_loader):
@@ -197,5 +227,31 @@ def test(model, data_loader):
     acc = float(total_correct) / len(data_loader.dataset)
     return loss, acc
 
+
 if __name__ == "__main__":
-    main()
+    args = parse_args_linear()
+
+    if args.pretrain_method == 'clb':
+        poison_data = main_clb(args)
+    else:
+        poison_data = main_lfb(args)
+
+    args.poison_data_name = "%s_%s_rate_%.2f_target_%s_trigger_%s_alpha_%.2f_class_%d_acc_%.4f" % (
+        args.dataset,
+        args.pretrain_method,
+        args.poison_rate,
+        args.target_class,
+        args.trigger_type,
+        args.trigger_alpha,
+        poison_data['anchor_label'],
+        poison_data['acc'])
+
+    args.save_dir = os.path.join(args.save_dir, args.dataset, args.pretrain_method, args.trigger_type)
+
+    os.makedirs(args.save_dir, exist_ok=True)
+    file_name = os.path.join(args.save_dir, args.poison_data_name + '.pt')
+    print('saving to %s' % file_name)
+
+    poison_data['args'] = args
+
+    torch.save(poison_data, file_name)
\ No newline at end of file
diff --git a/main_pretrain.py b/main_pretrain.py
index 31201ef..46394fa 100644
--- a/main_pretrain.py
+++ b/main_pretrain.py
@@ -75,7 +75,6 @@ def main():
     else:
         poison_data = None
         poison_suffix = ''
-
     if args.num_large_crops != 2:
         assert args.method == "wmse"
 
diff --git a/poisoning_utils.py b/poisoning_utils.py
index b04519c..5e08c95 100644
--- a/poisoning_utils.py
+++ b/poisoning_utils.py
@@ -28,20 +28,36 @@
     idx = torch.argmax(mean_top_sim)
     return idx
 
+
+def targeted_anchor_selection(train_features, train_labels, target_class, num_poisons):
+    similarity = train_features @ train_features.T
+    mean_top_sim = torch.topk(similarity, num_poisons, dim=1)[0].mean(dim=1)
+    # for target_class in range(10):
+    # restrict the anchor search to samples of the target class
+    tgt_sim = mean_top_sim.clone()
+
+    tgt_sim[train_labels != target_class] = -1
+    idx = torch.argmax(tgt_sim)
+    val = tgt_sim[idx]
+    print(target_class, (mean_top_sim > val).float().sum())  # debug: samples (any class) more central than the anchor
+    # import pdb; pdb.set_trace()
+    return idx
+
+
+# def targeted_anchor_selection(train_features, train_labels, target_class, num_poisons, selection='first', budget=-1):
+#     all_index = torch.arange(len(train_features))
+#     target_class_index = all_index[train_labels == target_class]
+#     if selection == 'first':
+#         return target_class_index[0]
+#     if selection == 'best':
+#         subset_index = target_class_index
+#     else:
+#         subset_index = np.random.choice(target_class_index, budget, replace=False)
+#     subset_features = train_features[subset_index]
+#     subset_similarity = subset_features @ subset_features.T
+#     mean_top_sim = torch.topk(subset_similarity, num_poisons, dim=1)[0].mean(dim=1)
+#     idx = torch.argmax(mean_top_sim)
+#     return subset_index[idx]
-def targeted_anchor_selection(train_features, train_labels, target_class, num_poisons, selection='first', budget=-1):
-    all_index = torch.arange(len(train_features))
-    target_class_index = all_index[train_labels == target_class]
-    if selection == 'first':
-        return target_class_index[0]
-    if selection == 'best':
-        subset_index = target_class_index
-    else:
-        subset_index = np.random.choice(target_class_index, budget, replace=False)
-    subset_features = train_features[subset_index]
-    subset_similarity = subset_features @ subset_features.T
-    mean_top_sim = torch.topk(subset_similarity, num_poisons, dim=1)[0].mean(dim=1)
-    idx = torch.argmax(mean_top_sim)
-    return subset_index[idx]
 
 
 def get_poisoning_indices(anchor_feature, train_features, num_poisons):
diff --git a/solo/args/dataset.py b/solo/args/dataset.py
index f167b53..d087568 100644
--- a/solo/args/dataset.py
+++ b/solo/args/dataset.py
@@ -58,8 +58,9 @@ def dataset_args(parser: ArgumentParser):
     parser.add_argument("--target_class", type=int, default=None)
     parser.add_argument("--save_dir", default=Path("datasets"), type=Path)
     parser.add_argument("--poison_data", default=None, type=Path)
-    parser.add_argument("--pretrain_method", default=None, type=Path)
+    parser.add_argument("--pretrain_method", default=None, type=str)
     parser.add_argument("--target_index", default=0, type=int)
+    parser.add_argument("--clb", action="store_true")
 
 
 def augmentations_args(parser: ArgumentParser):
diff --git a/solo/methods/__init__.py b/solo/methods/__init__.py
index 9725496..6abeb8d 100644
--- a/solo/methods/__init__.py
+++ b/solo/methods/__init__.py
@@ -35,6 +35,7 @@ from solo.methods.vibcreg import VIbCReg
 from solo.methods.vicreg import VICReg
 from solo.methods.wmse import WMSE
+from solo.methods.sup import Sup
 
 METHODS = {
     # base classes
@@ -57,6 +58,7 @@
     "vibcreg": VIbCReg,
     "vicreg": VICReg,
     "wmse": WMSE,
+    "sup": Sup,
 }
 __all__ = [
     "BarlowTwins",
@@ -77,6 +79,7 @@
     "VIbCReg",
     "VICReg",
     "WMSE",
+    "Sup",
 ]
 
 try:
diff --git a/solo/methods/base.py b/solo/methods/base.py
index 312fa34..fce9b0d 100644
--- a/solo/methods/base.py
+++ b/solo/methods/base.py
@@ -445,7 +445,9 @@ def _base_shared_step(self, X: torch.Tensor, targets: torch.Tensor) -> Dict:
         fp_target, fp_all = false_positive(logits, targets, self.target_class)
 
+        # import pdb; pdb.set_trace()
         num_attack_total, num_attack_success = attack_success_rate(logits, targets, self.target_class)
+
         return {**out,
                 "loss": loss,
                 "acc1": acc1,
                 "acc5": acc5,
diff --git a/solo/methods/sup.py b/solo/methods/sup.py
new file mode 100644
index 0000000..4d4e229
--- /dev/null
+++ b/solo/methods/sup.py
@@ -0,0 +1,139 @@
+# Copyright 2021 solo-learn development team.
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
+# Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all copies
+# or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
+# FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+import argparse
+from typing import Any, Dict, List, Sequence
+
+import torch
+import torch.nn as nn
+from solo.methods.base import BaseMethod
+
+
+class Sup(BaseMethod):
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        """Implements a supervised baseline: the backbone and the online classifier
+        are trained with the classification loss from BaseMethod only.
+
+        The projector/predictor of SimSiam (https://arxiv.org/abs/2011.10566), from
+        which this class was adapted, are kept below as commented-out code.
+        """
+
+        super().__init__(**kwargs)
+
+        # projector
+        # self.projector = nn.Sequential(
+        #     nn.Linear(self.features_dim, proj_hidden_dim, bias=False),
+        #     nn.BatchNorm1d(proj_hidden_dim),
+        #     nn.ReLU(),
+        #     nn.Linear(proj_hidden_dim, proj_hidden_dim, bias=False),
+        #     nn.BatchNorm1d(proj_hidden_dim),
+        #     nn.ReLU(),
+        #     nn.Linear(proj_hidden_dim, proj_output_dim),
+        #     nn.BatchNorm1d(proj_output_dim, affine=False),
+        # )
+        # self.projector[6].bias.requires_grad = False  # hack: not use bias as it is followed by BN
+
+        # predictor
+        # self.predictor = nn.Sequential(
+        #     nn.Linear(proj_output_dim, pred_hidden_dim, bias=False),
+        #     nn.BatchNorm1d(pred_hidden_dim),
+        #     nn.ReLU(),
+        #     nn.Linear(pred_hidden_dim, proj_output_dim),
+        # )
+
+    @staticmethod
+    def add_model_specific_args(parent_parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+        parent_parser = super(Sup, Sup).add_model_specific_args(parent_parser)
+        # parser = parent_parser.add_argument_group("simsiam")
+
+        # # projector
+        # parser.add_argument("--proj_output_dim", type=int, default=128)
+        # parser.add_argument("--proj_hidden_dim", type=int, default=2048)
+
+        # # predictor
+        # parser.add_argument("--pred_hidden_dim", type=int, default=512)
+        return parent_parser
+
+    @property
+    def learnable_params(self) -> List[dict]:
+        """Returns the parent's learnable parameters; no extra heads are added.
+
+        Returns:
+            List[dict]: list of learnable parameters.
+        """
+
+        # extra_learnable_params: List[dict] = [
+        #     {"params": self.projector.parameters()},
+        #     {"params": self.predictor.parameters(), "static_lr": True},
+        # ]
+        return super().learnable_params
+
+    def forward(self, X: torch.Tensor, *args, **kwargs) -> Dict[str, Any]:
+        """Performs the forward pass of the backbone and the classifier.
+
+        Args:
+            X (torch.Tensor): a batch of images in the tensor format.
+
+        Returns:
+            Dict[str, Any]: a dict containing the logits and features.
+        """
+        return self.base_forward(X)
+
+    def base_forward(self, X: torch.Tensor) -> Dict:
+        """Basic forward that allows children classes to override forward().
+
+        Args:
+            X (torch.Tensor): batch of images in tensor format.
+
+        Returns:
+            Dict: dict of logits and features.
+        """
+
+        feats = self.backbone(X)
+        logits = self.classifier(feats)
+        return {
+            "logits": logits,
+            "feats": feats,
+        }
+
+    def training_step(self, batch: Sequence[Any], batch_idx: int) -> torch.Tensor:
+        """Training step that reuses BaseMethod's training step and returns only
+        the classification loss.
+
+        Args:
+            batch (Sequence[Any]): a batch of data in the format of [img_indexes, [X], Y], where
+                [X] is a list of size num_crops containing batches of images.
+            batch_idx (int): index of the batch.
+
+        Returns:
+            torch.Tensor: classification (cross-entropy) loss.
+        """
+
+        out = super().training_step(batch, batch_idx)
+        class_loss = out["loss"]
+
+        return class_loss
diff --git a/sweep_poison.sh b/sweep_poison.sh
index 2a7a988..5500eca 100644
--- a/sweep_poison.sh
+++ b/sweep_poison.sh
@@ -1,5 +1,8 @@
-meg_out=/home/b17611136518/solo-learn-data
-lin_out=/data/yfwang/solo-learn
+# out=/home/b17611136518/solo-learn-data
+out_dir=/data/yfwang/solo-learn
+dataset=cifar10
+poison_model=simclr
+rate=0.6
 
 # done
 # sweep poisoning rate
@@ -15,17 +18,15 @@
 # done
 # done
 
-# poison_model=simclr
-# poison_data=
-# for method in swav supcon mocov2plus simsiam byol barlow dino do
-#   for file in zoo/trained_models/$dataset/simclr/*.ckpt
-#   do
-#     for rate in 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
-#     do
-#       python main_poison.py --dataset $dataset --backbone resnet18 --data_dir bash_files/pretrain/cifar/datasets --optimizer sgd --save_dir /data/yfwang/solo-learn/poison_datasets --pretrained_feature_extractor ${file} --poison_rate $rate --pretrain_method zoo-simclr --trigger_type gaussian_noise --trigger_alpha 0.2
-#     done
-#   done
-# done
+for method in supcon mocov2plus simsiam byol barlow dino; do
+  for file in zoo/trained_models/$dataset/$method/*.ckpt
+  do
+    for rate in 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
+    do
+      python main_poison.py --dataset $dataset --backbone resnet18 --data_dir bash_files/pretrain/cifar/datasets --optimizer sgd --save_dir $out_dir/poison_datasets --pretrained_feature_extractor ${file} --poison_rate $rate --pretrain_method zoo-simclr --trigger_type gaussian_noise --trigger_alpha 0.2
+    done
+  done
+done
 
 # for model in swav supcon mocov2plus simsiam byol barlow dino do
 #   for file in zoo/trained_models/$dataset/${model}/*.ckpt do
 # ...
@@ -34,13 +35,13 @@
 # done
 
-dataset=cifar10
+# dataset=cifar10
 
-for trigger_type in checkerboard_1corner checkerboard_4corner checkerboard_center checkerboard_full gaussian_noise
-do
-  for file in zoo/trained_models/$dataset/simclr/*.ckpt
-  do
-    python main_poison.py --dataset ${dataset} --backbone resnet18 --data_dir bash_files/pretrain/cifar/datasets --optimizer sgd --save_dir ${meg_out}/poison_datasets --pretrained_feature_extractor ${file} --poison_rate 0.6 --pretrain_method zoo-simclr --trigger_type $trigger_type --trigger_alpha 0.2
-  done
-done
+# for trigger_type in checkerboard_1corner checkerboard_4corner checkerboard_center checkerboard_full gaussian_noise
+# do
+#   for file in zoo/trained_models/$dataset/simclr/*.ckpt
+#   do
+#     python main_poison.py --dataset ${dataset} --backbone resnet18 --data_dir bash_files/pretrain/cifar/datasets --optimizer sgd --save_dir ${meg_out}/poison_datasets --pretrained_feature_extractor ${file} --poison_rate 0.6 --pretrain_method zoo-simclr --trigger_type $trigger_type --trigger_alpha 0.2
+#   done
+# done
 
 # for trigger_type
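
Note on the new `targeted_anchor_selection` in poisoning_utils.py: it picks, among samples of the target class, the one whose mean similarity to its `num_poisons` nearest neighbours in feature space is highest, i.e. the most "central" target-class sample. Below is a minimal, self-contained sketch of that rule on synthetic data; the function name `pick_anchor` and the cosine normalization are illustrative, not repo code.

```python
# Sketch of the anchor-selection rule: highest mean top-k similarity within the target class.
import torch
import torch.nn.functional as F

def pick_anchor(features: torch.Tensor, labels: torch.Tensor,
                target_class: int, num_poisons: int) -> int:
    features = F.normalize(features, dim=1)   # cosine similarity
    similarity = features @ features.T        # (N, N)
    # mean similarity of each sample to its num_poisons nearest neighbours
    mean_top_sim = torch.topk(similarity, num_poisons, dim=1)[0].mean(dim=1)
    masked = mean_top_sim.clone()
    masked[labels != target_class] = float("-inf")  # restrict to the target class
    return int(torch.argmax(masked))

features = torch.randn(500, 128)              # synthetic features
labels = torch.randint(0, 10, (500,))
print(pick_anchor(features, labels, target_class=3, num_poisons=25))
```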
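Note on `main_clb`: the clean-label path skips feature extraction entirely; it samples a random subset of the target class and blends a trigger into those images, so the "same-class ratio" it prints is 1.0 by construction. The sketch below is a hypothetical re-implementation of that flow on synthetic data; `generate_trigger`/`add_trigger` live in poisoning_utils.py and their exact blending formula may differ from the alpha-blend assumed here.

```python
# Hypothetical sketch of the clean-label (clb) poison construction.
import numpy as np

def blend_trigger(images: np.ndarray, pattern: np.ndarray, mask: np.ndarray,
                  index: np.ndarray, alpha: float) -> np.ndarray:
    poisoned = images.copy().astype(np.float32)
    # blend the trigger pattern into the selected images where the mask is active
    poisoned[index] = (1 - alpha * mask) * poisoned[index] + alpha * mask * pattern
    return poisoned.clip(0, 255).astype(images.dtype)

rng = np.random.default_rng(0)
images = rng.integers(0, 256, (100, 32, 32, 3), dtype=np.uint8)
labels = rng.integers(0, 10, 100)
target, rate, n_cls = 3, 0.6, 10
# e.g. on CIFAR-10 with rate 0.6 this is 0.6 * 50000 / 10 = 3000 poisons
num_poisons = int(rate * len(images) / n_cls)
candidates = np.flatnonzero(labels == target)          # target class only (clean-label)
index = rng.permutation(candidates)[:num_poisons]
pattern = rng.normal(127, 30, (32, 32, 3))             # e.g. a gaussian_noise trigger
mask = np.ones((32, 32, 1))
poisoned = blend_trigger(images, pattern, mask, index, alpha=0.2)
print(poisoned.shape, len(index))
```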
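Note on the saved poison file: main_poison.py now serializes one dict per run (fields as in `poisoning_data` above, plus `args` added in `__main__`). A sketch of a consumer is below, assuming only that schema; this is illustrative, not the actual loading code in main_pretrain.py.

```python
# Illustrative loader for a .pt file written by main_poison.py.
import numpy as np
import torch

def load_poison(path: str):
    """Return the arrays a poisoned dataloader would need."""
    blob = torch.load(path, map_location="cpu")
    images = blob["poison_data"]      # full train set with triggers injected
    targets = blob["targets"]         # labels unchanged (clean-label attack)
    index = blob["poisoning_index"]   # which samples actually carry the trigger
    print("poisoned %d/%d samples, same-class ratio %.4f"
          % (len(index), len(images), blob["acc"]))
    return images, np.asarray(targets), index
```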