Commit e425569: properly do cv
csinva committed Jan 24, 2023 (1 parent: 1fa3854)
Showing 7 changed files with 116 additions and 110 deletions.

33 changes: 18 additions & 15 deletions experiments/01_train_model.py
@@ -8,21 +8,19 @@
 import torch
 import pickle as pkl

-from project_name.model import DecisionTreeClassifier
+import project_name.model
 import cache_save_utils
 import data


-def fit_model(model, X_train, y_train, X_test, y_test, feature_names, r):
+def fit_model(model, X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names, r):
     model.fit(X_train, y_train)

+    r['acc_cv'] = model.score(X_cv, y_cv)
     evaluate_model(model, X_test, y_test, r)

     return r


 def evaluate_model(model, X_test, y_test, r):
-    r['test_acc'] = model.score(X_test, y_test)
+    r['acc_test'] = model.score(X_test, y_test)
     return r
@@ -42,12 +40,16 @@ def add_main_args(parser):
     # training misc args
     parser.add_argument('--seed', type=int, default=1,
                         help='random seed')
-    parser.add_argument('--save_dir', type=str, default='tmp',
+    parser.add_argument('--save_dir', type=str, default='results',
                         help='directory for saving')

     # model args
     parser.add_argument('--model_name', type=str, choices=['decision_tree', 'ridge'],
                         default='decision_tree', help='name of model')
+    parser.add_argument('--alpha', type=float, default=1,
+                        help='regularization strength')
+    parser.add_argument('--max_depth', type=int,
+                        default=2, help='max depth of tree')
     return parser

 def add_computational_args(parser):
@@ -67,31 +69,32 @@ def add_computational_args(parser):
     # set up logging
     logger = logging.getLogger()
     logging.basicConfig(level=logging.INFO)
-    for k in sorted(vars(args)):
-        logger.info('\t' + k + ' ' + str(vars(args)[k]))

     # set up saving directory + check for cache
    already_cached, save_dir = cache_save_utils.get_save_dir_unique(
        parser, parser_without_computational_args, args, args.save_dir)
-    logging.info(f'\n\nsaving to ' + save_dir)

     if args.use_cache and already_cached:
         logging.info(
-            f'cached version exists!\nsuccessfully skipping :)\n\n\n')
+            f'cached version exists! Successfully skipping :)\n\n\n')
         exit(0)
+    for k in sorted(vars(args)):
+        logger.info('\t' + k + ' ' + str(vars(args)[k]))
+    logging.info(f'\n\nsaving to ' + save_dir)

     # set seed
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
     random.seed(args.seed)

     # load data
-    dset, dataset_key_text = data.load_dataset(
+    dset, dataset_key_text = data.load_huggingface_dataset(
         dataset_name=args.dataset_name, subsample_frac=args.subsample_frac)
-    X_train, y_train, X_test, y_test, feature_names = data.convert_text_data_to_counts_array(
+    X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names = data.convert_text_data_to_counts_array(
         dset, dataset_key_text)

     # load model
-    model = DecisionTreeClassifier()
+    model = project_name.model.get_model(args)

     # set up saving dictionary + save params file
     r = defaultdict(list)
@@ -100,7 +103,7 @@ def add_computational_args(parser):
         args=args, save_dir=save_dir, fname='params.json', r=r)

     # fit
-    r = fit_model(model, X_train, y_train, X_test, y_test, feature_names, r)
+    r = fit_model(model, X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names, r)

     # save results
     pkl.dump(r, open(join(save_dir, 'results.pkl'), 'wb'))
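cache_save_utils is not part of this commit, but the flow above implies deriving a deterministic save directory from the non-default argparse values and skipping runs whose results already exist. A minimal sketch of that idea; the helper name and hashing scheme here are assumptions, not the repo's actual cache_save_utils implementation:

import hashlib
import json
import os


def get_save_dir_unique_sketch(parser, args, save_dir_base):
    # keep only args whose value differs from the parser default,
    # so the cache key depends only on non-default settings (assumed scheme)
    non_default = {k: v for k, v in vars(args).items()
                   if parser.get_default(k) != v}
    key = hashlib.md5(json.dumps(non_default, sort_keys=True).encode()).hexdigest()
    save_dir = os.path.join(save_dir_base, key)
    # a run counts as cached if its results file already exists
    already_cached = os.path.exists(os.path.join(save_dir, 'results.pkl'))
    return already_cached, save_dir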
13 changes: 5 additions & 8 deletions experiments/data.py
@@ -8,8 +8,8 @@
 from sklearn.model_selection import train_test_split
 from sklearn.feature_extraction.text import CountVectorizer

-def load_dataset(dataset_name, test_size=0.33, subsample_frac=1.0):
-    """Load dataset + return the relevant dataset key
+def load_huggingface_dataset(dataset_name, subsample_frac=1.0):
+    """Load text dataset from huggingface (with train/validation splits) + return the relevant dataset key
     """
     # load dset
     if dataset_name == 'tweet_eval':
@@ -18,7 +18,7 @@ def load_dataset(dataset_name, test_size=0.33, subsample_frac=1.0):
         train = datasets.load_dataset('financial_phrasebank', 'sentences_75agree',
                                       revision='main', split='train')
         idxs_train, idxs_val = train_test_split(
-            np.arange(len(train)), test_size=test_size, random_state=13)
+            np.arange(len(train)), test_size=0.33, random_state=13)
         dset = datasets.DatasetDict()
         dset['train'] = train.select(idxs_train)
         dset['validation'] = train.select(idxs_val)
@@ -35,10 +35,6 @@ def load_dataset(dataset_name, test_size=0.33, subsample_frac=1.0):
         del dset['unsupervised']
         dset['validation'] = dset['test']

-    # delete test dset
-    if 'test' in dset:
-        del dset['test']
-
     # subsample data
     if subsample_frac > 0:
         dset['train'] = dset['train'].select(range(int(subsample_frac * len(dset['train']))))
@@ -51,4 +47,5 @@ def convert_text_data_to_counts_array(dset, dataset_key_text):
     X_test = v.transform(dset['validation'][dataset_key_text])
     y_test = dset['validation']['label']
     feature_names = v.get_feature_names_out().tolist()
-    return X_train, y_train, X_test, y_test, feature_names
+    X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33, random_state=13)
+    return X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names
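The new return signature above bundles a three-way split: the CV fold is carved out of the vectorized training counts. A self-contained sketch of the same CountVectorizer + train_test_split pattern on toy data (the example strings and shapes are illustrative only):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

texts = ['good movie', 'bad movie', 'great plot', 'terrible plot']
labels = [1, 0, 1, 0]

v = CountVectorizer()
X = v.fit_transform(texts)  # sparse counts matrix, one row per document
X_train, X_cv, y_train, y_cv = train_test_split(
    X, labels, test_size=0.33, random_state=13)
print(X_train.shape, X_cv.shape)  # e.g. (2, 6) and (2, 6)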
12 changes: 12 additions & 0 deletions notebooks/matplotlibrc
@@ -0,0 +1,12 @@
+figure.autolayout : True
+
+axes.titlesize : 20
+axes.labelsize : 15 ## fontsize of the x and y labels
+
+font.size : 15
+
+xtick.labelsize : 15 ## fontsize of the tick labels
+ytick.labelsize : 15 ## fontsize of the tick labels
+
+axes.spines.top : False
+axes.spines.right : False
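Matplotlib reads a matplotlibrc from the current working directory before falling back to the user config, so notebooks run from the notebooks/ directory pick up these defaults automatically. To confirm which file is in effect:

import matplotlib
print(matplotlib.matplotlib_fname())  # path of the rc file matplotlib loaded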
12 changes: 10 additions & 2 deletions project_name/model.py
@@ -1,5 +1,13 @@
 from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import RidgeClassifier
+from sklearn.linear_model import LogisticRegression

 # This is where we would implement our custom model

+def get_model(args):
+    if args.model_name == 'decision_tree':
+        model = DecisionTreeClassifier(max_depth=args.max_depth)
+    elif args.model_name == 'ridge':
+        model = RidgeClassifier(alpha=args.alpha)
+    else:
+        raise ValueError('Invalid model_name: {}'.format(args.model_name))
+    return model
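A quick usage sketch for the new factory; building the namespace by hand here just mimics what argparse would pass in:

import argparse

from project_name.model import get_model

args = argparse.Namespace(model_name='ridge', alpha=1.0, max_depth=2)
model = get_model(args)  # returns RidgeClassifier(alpha=1.0)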
23 changes: 12 additions & 11 deletions readme.md
@@ -1,32 +1,33 @@
-This is an evolving repo optimized for machine-learning projects aimed at designing a new algorithm. They require sweeping over different hyperparameters, comparing to baselines, and iteratively refining an algorithm.
+This is an evolving repo optimized for machine-learning projects aimed at designing a new algorithm. They require sweeping over different hyperparameters, comparing to baselines, and iteratively refining an algorithm. Based on [cookiecutter-data-science](https://github.com/drivendata/cookiecutter-data-science).

 # Organization
-- `project_name`: to be renamed, contains main code for modeling (e.g. model architecture)
-- `experiments`: contains code for runnning experiments (e.g. loading data, training models, evaluating models)
-- `scripts`: contains scripts for running experiments (e.g. python scripts that launch jobs in `experiments` folder with different hyperparams)
-- `notebooks`: contains jupyter notebooks for analyzing results, errors, and making figures
+- `project_name`: should be renamed, contains main code for modeling (e.g. model architecture)
+- `experiments`: code for running experiments (e.g. loading data, training models, evaluating models)
+- `scripts`: scripts for running experiments (e.g. python scripts that launch jobs in `experiments` folder with different hyperparams)
+- `notebooks`: jupyter notebooks for analyzing results and making figures

 # Setup
-- first, rename `project_name` to your project name and modify `setup.py` accordingly
 - clone and run `pip install -e .`, resulting in a package named `project_name` that can be imported
+- first, rename `project_name` to your project name and modify `setup.py` accordingly
-- example run: run `python scripts/01_train_models.py` then load the results in `notebooks/01_model_results.ipynb`
+- example run: run `python scripts/01_train_models.py` (which calls `experiments/01_train_model.py`) then view the results in `notebooks/01_model_results.ipynb`

 # Features
 - scripts sweep over hyperparameters using easy-to-specify python code
 - experiments automatically cache runs that have already completed
   - caching uses the (**non-default**) arguments in the argparse namespace
 - notebooks can easily evaluate results aggregated over multiple experiments using pandas (see the sketch after this file's diff)
+- binary arguments should start with the word "use" (e.g. `--use_caching`) and take values 0 or 1

 # Guidelines
-- Huggingface whenever possible, then pytorch
 - See some useful packages [here](https://csinva.io/blog/misc/ml_coding_tips)
 - Avoid notebooks whenever possible (ideally, only for analyzing results, making figures)
 - Paths should be specified relative to a file's location (e.g. `os.path.join(os.path.dirname(__file__), 'data')`)
 - Naming variables: use the main thing first followed by the modifiers (e.g. `X_train`, `acc_test`)
-- binary arguments should start with the word "use" (e.g. `--use_caching`) and take values 0 or 1
 - Use logging instead of print
 - Use argparse and sweep over hyperparams using python scripts (or [amulet](https://amulet-docs.azurewebsites.net/main/index.html))
   - Note, arguments get passed as strings so shouldn't pass args that aren't primitives or a list of primitives (more complex structures should be handled in the experiments code)
 - Each run should save a single pickle file of its results
-- Everything should run end-to-end with one script (caching things along the way)
+- All experiments that depend on each other should run end-to-end with one script (caching things along the way)
 - Keep an updated requirements.txt (required for amulet)
-- Follow sklearn apis whenever possible
+- Follow sklearn apis whenever possible
+- Use Huggingface whenever possible, then pytorch
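For the pandas bullet in Features above, a minimal sketch of aggregating the per-run pickles, assuming each run directory under results/ holds the results.pkl written by the training script:

import glob
import os
import pickle as pkl

import pandas as pd

rows = [pkl.load(open(f, 'rb'))
        for f in glob.glob(os.path.join('results', '*', 'results.pkl'))]
df = pd.DataFrame(rows)
# choose hyperparameters on the cv split; report test accuracy only for the winner
best = df.sort_values('acc_cv', ascending=False).iloc[0]
print(best['acc_test'])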
36 changes: 19 additions & 17 deletions scripts/01_train_linear_models.py
@@ -1,10 +1,16 @@
 import submit_utils
+from os.path import dirname
+import os.path
+repo_dir = dirname(dirname(os.path.abspath(__file__)))

+# Showcasing different ways to sweep over arguments
+# Can pass any empty dict for any of these to avoid sweeping

+# List of values to sweep over (sweeps over all combinations of these)
 params_shared_dict = {
     'seed': [1, 2],
-    'save_dir': ['tmp'],
-    'lr': [0.1, 0.01],
+    'save_dir': ['results'],
+    'use_cache': [0],  # pass binary values with 0/1 instead of the ambiguous strings True/False
 }

 # List of tuples to sweep over (these values are coupled, and swept over together)
@@ -15,22 +21,18 @@
     ],
     ('model_name', 'max_depth'): [
         ('decision_tree', i)
-        for i in range(1, 4)
+        for i in range(2, 4)
     ],
 }
-# print(params_coupled_dict.keys())

-# FREEZE PARAMS
-# (x, xvalue)....

-# IMPOSSIBLE PAIRINGS...

 # If you want to couple long things, you would need to duplicate and modify this script
 # (e.g. decision trees and linear models have very different params, so each would have a separate script)

-submit_utils.run_dicts(
-    params_shared_dict,
-    params_coupled_dict,
-    script_name='03_train_prefix.py',
-    actually_run=False
-)
+# Args list is a list of dictionaries
+# If you want to do something special to remove some of these runs, can remove them before calling run_args_list
+args_list = submit_utils.get_args_list(
+    params_shared_dict=params_shared_dict,
+    params_coupled_dict=params_coupled_dict,
+)
+submit_utils.run_args_list(
+    args_list,
+    script_name=os.path.join(repo_dir, 'experiments', '01_train_model.py'),
+    actually_run=True,
+)
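submit_utils itself is not shown in this commit; a rough sketch of the expansion get_args_list implies: every combination of the shared values, crossed with each coupled tuple. The function name and exact semantics here are assumptions based on the calling code above.

import itertools


def get_args_list_sketch(params_shared_dict, params_coupled_dict):
    # every combination of the shared values
    shared_combos = [dict(zip(params_shared_dict, vals))
                     for vals in itertools.product(*params_shared_dict.values())]
    # each coupled tuple becomes one dict (or a no-op if the dict is empty)
    coupled_combos = [dict(zip(keys, vals))
                      for keys, tuples in params_coupled_dict.items()
                      for vals in tuples] or [{}]
    return [{**shared, **coupled}
            for shared in shared_combos
            for coupled in coupled_combos]

With the dicts above, each (seed, save_dir, use_cache) combination is crossed with every coupled (model_name, ...) tuple, and each resulting dict becomes one invocation of experiments/01_train_model.py.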