Classifier training code was added
1 parent 789e138 · commit 31c5133
Showing 13 changed files with 1,978 additions and 5 deletions.
@@ -0,0 +1,37 @@
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '5'
from torch import nn
import torch.nn.functional as F
from transformers import Trainer

N_LABELS = 28
KL_FACTOR = 0.95


class KLTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        input_ids = inputs.get("input_ids")
        attention_mask = inputs.get("attention_mask")
        counterfactual_input_ids = inputs.get("counterfactual_input_ids")
        counterfactual_attention_mask = inputs.get("counterfactual_attention_mask")
        labels = inputs.get("labels")
        ce_loss_fct = nn.CrossEntropyLoss()

        # Cross-entropy on the original ("hard") text.
        hard_text_outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        hard_text_logits = hard_text_outputs.get('logits')
        hard_text_cel = ce_loss_fct(hard_text_logits.view(-1, N_LABELS), labels.view(-1))
        if counterfactual_input_ids is None:
            return (hard_text_cel, hard_text_outputs) if return_outputs else hard_text_cel

        # Cross-entropy on the counterfactual text, against the same labels.
        counterfactual_outputs = model(input_ids=counterfactual_input_ids, attention_mask=counterfactual_attention_mask)
        counterfactual_logits = counterfactual_outputs.get('logits')
        counterfactual_cel = ce_loss_fct(counterfactual_logits.view(-1, N_LABELS), labels.view(-1))

        # Symmetrized KL between the two predictive distributions.
        kl_loss_fct = nn.KLDivLoss(reduction='batchmean')
        kl_loss_left = kl_loss_fct(F.log_softmax(hard_text_logits, dim=1), F.softmax(counterfactual_logits, dim=1))
        kl_loss_right = kl_loss_fct(F.log_softmax(counterfactual_logits, dim=1), F.softmax(hard_text_logits, dim=1))
        # loss = (1 - KL_FACTOR) * (hard_text_cel + counterfactual_cel) + KL_FACTOR * (kl_loss_left + kl_loss_right)
        loss = (1 - KL_FACTOR) * hard_text_cel + KL_FACTOR * (kl_loss_left + kl_loss_right)
        return (loss, hard_text_outputs) if return_outputs else loss
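For reference, the objective this trainer computes can be written compactly. With λ = KL_FACTOR, p the softmax distribution over the original text's logits, q the distribution over the counterfactual's logits, and y the gold label, the active loss is (note that counterfactual_cel is computed but, per the commented-out line, not used):

\mathcal{L} = (1 - \lambda)\,\mathrm{CE}(p, y) + \lambda\,\big[\mathrm{KL}(q \,\|\, p) + \mathrm{KL}(p \,\|\, q)\big]

Since nn.KLDivLoss(input, target) computes KL(target ‖ input), kl_loss_left is KL(q ‖ p) and kl_loss_right is KL(p ‖ q); their sum is symmetric in the two distributions.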
@@ -0,0 +1,37 @@
import os

from transformers.modeling_outputs import SequenceClassifierOutput

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaForSequenceClassification, RobertaConfig
import wandb

KL_FACTOR = 0.75


class RobertaWithCounterfactualKL(RobertaForSequenceClassification):
    def __init__(self, config):
        wandb.log({'KL_FACTOR': KL_FACTOR})
        wandb.log({'NO counterfactual CEL': True})
        super().__init__(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        counterfactual_input_ids=None,
        counterfactual_attention_mask=None,
        labels=None,
        **kwargs
    ):
        # The counterfactual arguments are accepted but currently unused:
        # this version delegates to the stock sequence-classification forward pass.
        hard_text_output = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return hard_text_output
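A minimal usage sketch for this class. Assumptions: the wandb project name, the 28-label setup (matching N_LABELS in the trainer file), and the `batch` variable are illustrative; wandb.init must run before construction because __init__ calls wandb.log.

import wandb
from transformers import RobertaConfig

wandb.init(project="counterfactual-kl")  # hypothetical project name
config = RobertaConfig.from_pretrained("FacebookAI/roberta-base", num_labels=28)
model = RobertaWithCounterfactualKL.from_pretrained("FacebookAI/roberta-base", config=config)

# The counterfactual kwargs are accepted but ignored in this version of forward().
outputs = model(
    input_ids=batch["input_ids"],  # `batch` is a hypothetical tokenized batch
    attention_mask=batch["attention_mask"],
    counterfactual_input_ids=batch.get("counterfactual_input_ids"),
    labels=batch["labels"],
)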
@@ -0,0 +1,215 @@
import os
import pickle

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

from datasets import load_metric
from transformers import AutoTokenizer, Trainer, TrainingArguments, EvalPrediction, RobertaForSequenceClassification, \
    RobertaConfig
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
# import spacy
import datasets
from datetime import datetime
import wandb
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

EXP_NAME = 'decoded_no_inter'
MAX_LENGTH = 64
SEED = 0
IS_FIRST = True

file_name = Path(__file__).name
wandb.init(project=file_name)


def log_print(str_to_print):
    # Evaluate the passed expression string and print it alongside its value.
    eval_str_to_print = eval(str_to_print)
    if type(eval_str_to_print) == list:
        eval_str_to_print = '\n'.join(eval_str_to_print)
    print(f"{str_to_print}: {eval_str_to_print}")
# Load data
with open("/home/nlp/matan_avitan/git/vec2text_inter/bios_data/bios_dev.pickle", "rb") as f:
    validation_df = pd.DataFrame(pickle.load(f))

ot_females_to_males_path = 'bios_extracted_data/ot_females_to_males.csv'
ot_males_to_females_path = 'bios_extracted_data/ot_males_to_females.csv'

ot_females_to_males_df = pd.read_csv(ot_females_to_males_path)
ot_males_to_females_df = pd.read_csv(ot_males_to_females_path)

ot_stacked = pd.concat([ot_females_to_males_df, ot_males_to_females_df]).reset_index(drop=True)

labels = list(ot_stacked['p'].unique())
id2label = {i: l for i, l in enumerate(labels)}
label2id = {l: i for i, l in enumerate(labels)}

formatted_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Directory where the output/results will be saved
output_dir = f'/home/nlp/matan_avitan/tmp/vec2text_inter/trained_models/{EXP_NAME}_classifier_{formatted_datetime}'
# The specific pre-trained model to be used
model_name = "FacebookAI/roberta-base"
# Init the config with the label mappings
config = RobertaConfig.from_pretrained(model_name, label2id=label2id, id2label=id2label, num_labels=len(labels))
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Clear the CUDA cache to free up GPU memory
torch.cuda.empty_cache()
# Passing the label dicts is equivalent to setting num_labels, and therefore initializes a classifier head.
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config)

num_labels = len(label2id)  # Number of classes in the classification task


# Create/Load data
def create_input_sequence(sample):
    features = sample['transformed_hard_text']
    labels = sample['labels']
    # Encode the sequence using the tokenizer
    encoded_sequence = tokenizer(features, truncation=True, padding='max_length', max_length=MAX_LENGTH,
                                 return_tensors='pt')
    # Decode the input_ids back to text for inspection
    encoded_sequence['input_sentence'] = tokenizer.batch_decode(encoded_sequence.input_ids)
    # Attach the labels to the encoded sequence
    encoded_sequence['labels'] = labels
    return encoded_sequence


label_encoder = LabelEncoder()
ot_stacked['labels'] = label_encoder.fit_transform(ot_stacked['p'])
validation_df['labels'] = label_encoder.transform(validation_df['p'])
validation_df = validation_df.rename({'hard_text': 'transformed_hard_text'}, axis=1)

# Define id2label and label2id mappings
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in enumerate(label_encoder.classes_)}
wandb.log({'labels': list(id2label.values())})

# ot_stacked['labels'] = ot_stacked['p']
train_df = ot_stacked
train_ds = datasets.Dataset.from_pandas(train_df)
validation_ds = datasets.Dataset.from_pandas(validation_df)
len(train_ds), len(validation_ds), train_ds  # notebook leftover; has no effect in a script

if IS_FIRST:
    features = ['transformed_hard_text']
    labels = ['labels']
    columns_to_preserve = features + labels + ['g']
    columns_to_remove = list(set(train_df.columns) - set(columns_to_preserve))

    train_ds = train_ds.map(create_input_sequence, batched=True, batch_size=1, remove_columns=columns_to_remove,
                            num_proc=16)
    train_ds.save_to_disk(f'{EXP_NAME}_train_dataset')
    columns_to_remove = list(set(validation_df.columns) - set(columns_to_preserve))
    validation_ds = validation_ds.map(create_input_sequence, batched=True, batch_size=1,
                                      remove_columns=columns_to_remove,
                                      num_proc=16)
    validation_ds.save_to_disk(f'{EXP_NAME}_validation_dataset')
else:
    train_ds = datasets.load_from_disk(f'{EXP_NAME}_train_dataset')
    validation_ds = datasets.load_from_disk(f'{EXP_NAME}_validation_dataset')
def calculate_tpr(y_pred, y_true, z_true):
    # Root-mean-square of the per-class TPR gap between the two protected groups (z == 1 vs. z == 0).
    rms_tpr_gap = 0.0
    for y in set(y_pred):
        tpr_1 = (((y_pred[y_true == y])[z_true[y_true == y] == 1]) == y).mean()
        tpr_0 = (((y_pred[y_true == y])[z_true[y_true == y] == 0]) == y).mean()
        rms_tpr_gap += (tpr_1 - tpr_0) ** 2

    rms_tpr_gap = np.sqrt(rms_tpr_gap / len(set(y_pred)))
    return rms_tpr_gap


def compute_metrics(p: EvalPrediction):
    # Extract predictions from the EvalPrediction object
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    # Obtain the predicted classes
    preds = np.argmax(preds, axis=1)

    # Ratio of predictions equal to 0 (an arbitrary label), as a quick collapse check
    ratio = np.mean(preds == 0)

    # Dictionary to store computed metrics
    result = {}

    # Load evaluation metrics
    metric_f1 = load_metric("f1")
    metric_precision = load_metric("precision")
    metric_recall = load_metric("recall")
    metric_acc = load_metric("accuracy")

    # Compute the metrics
    result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average='macro')[
        'precision']
    result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average='macro')["recall"]
    result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average='macro')["f1"]
    result["ratio"] = ratio

    return result
training_args = TrainingArguments(
    output_dir=output_dir,  # Output directory
    logging_dir=output_dir + "/logs",  # Output directory for logging
    num_train_epochs=30,  # Total number of training epochs
    per_device_train_batch_size=512,  # Batch size per device during training
    per_device_eval_batch_size=2048,  # Batch size for evaluation
    weight_decay=0.01,  # Strength of weight decay
    gradient_accumulation_steps=1,  # Number of steps over which gradients are accumulated
    learning_rate=2e-5,  # Magnitude of updates to the model weights
    warmup_ratio=0.06,  # Proportion of training steps used for learning-rate warmup
    label_smoothing_factor=0.05,  # Regularization that keeps the model from becoming overconfident
    evaluation_strategy='steps',  # Evaluate every eval_steps
    logging_strategy='steps',  # Log every logging_steps
    logging_steps=50,
    eval_steps=50,
    logging_first_step=True,
    fp16=True,
    do_eval=True,
    report_to="wandb",
    save_total_limit=5,
    load_best_model_at_end=True,
    save_steps=50,
    # eval_accumulation_steps=16
)

trainer = Trainer(
    model=model,  # The instantiated model to be trained
    args=training_args,  # Training arguments, defined above
    compute_metrics=compute_metrics,  # A function to compute the metrics
    train_dataset=train_ds,  # Training dataset
    eval_dataset=validation_ds,  # Evaluation dataset
)

print(trainer.args.device)

# Baseline evaluation before training
trainer.evaluate()

torch.cuda.empty_cache()

trainer.train()

# Final evaluation after training
trainer.evaluate()

model.eval()

Path(output_dir + "/modeldir_new").mkdir()
# Save the state_dict of the model
torch.save(model.state_dict(), output_dir + "/modeldir_new/model.pth")

# Save the tokenizer associated with the model
tokenizer.save_pretrained(output_dir + "/modeldir_new/tokenizer")

trainer.save_model(output_dir + "/modeldir_new/trainer_model")
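A sketch of how the saved artifacts could be reloaded for inference. The paths mirror the save calls above; reconstructing model_name and config as earlier in this script is an assumption.

import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Option 1: trainer.save_model wrote a standard Hugging Face checkpoint directory.
model = RobertaForSequenceClassification.from_pretrained(output_dir + "/modeldir_new/trainer_model")
tokenizer = AutoTokenizer.from_pretrained(output_dir + "/modeldir_new/tokenizer")

# Option 2: restore from the raw state_dict; requires the same config as at training time.
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config)
model.load_state_dict(torch.load(output_dir + "/modeldir_new/model.pth"))
model.eval()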