Commit e425569: properly do cv
csinva committed Jan 24, 2023 (1 parent: 1fa3854)
Showing 7 changed files with 116 additions and 110 deletions.

33 changes: 18 additions & 15 deletions experiments/01_train_model.py
@@ -8,21 +8,19 @@
 import torch
 import pickle as pkl

-from project_name.model import DecisionTreeClassifier
+import project_name.model
 import cache_save_utils
 import data


-def fit_model(model, X_train, y_train, X_test, y_test, feature_names, r):
+def fit_model(model, X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names, r):
     model.fit(X_train, y_train)

+    r['acc_cv'] = model.score(X_cv, y_cv)
     evaluate_model(model, X_test, y_test, r)

     return r


 def evaluate_model(model, X_test, y_test, r):
-    r['test_acc'] = model.score(X_test, y_test)
+    r['acc_test'] = model.score(X_test, y_test)
     return r
@@ -42,12 +40,16 @@ def add_main_args(parser):
     # training misc args
     parser.add_argument('--seed', type=int, default=1,
                         help='random seed')
-    parser.add_argument('--save_dir', type=str, default='tmp',
+    parser.add_argument('--save_dir', type=str, default='results',
                         help='directory for saving')

     # model args
     parser.add_argument('--model_name', type=str, choices=['decision_tree', 'ridge'],
                         default='decision_tree', help='name of model')
+    parser.add_argument('--alpha', type=float, default=1,
+                        help='regularization strength')
+    parser.add_argument('--max_depth', type=int,
+                        default=2, help='max depth of tree')
     return parser

 def add_computational_args(parser):
@@ -67,31 +69,32 @@ def add_computational_args(parser):
     # set up logging
     logger = logging.getLogger()
     logging.basicConfig(level=logging.INFO)
-    for k in sorted(vars(args)):
-        logger.info('\t' + k + ' ' + str(vars(args)[k]))

     # set up saving directory + check for cache
    already_cached, save_dir = cache_save_utils.get_save_dir_unique(
        parser, parser_without_computational_args, args, args.save_dir)
-    logging.info(f'\n\nsaving to ' + save_dir)

     if args.use_cache and already_cached:
         logging.info(
-            f'cached version exists!\nsuccessfully skipping :)\n\n\n')
+            f'cached version exists! Successfully skipping :)\n\n\n')
         exit(0)
+    for k in sorted(vars(args)):
+        logger.info('\t' + k + ' ' + str(vars(args)[k]))
+    logging.info(f'\n\nsaving to ' + save_dir)

     # set seed
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
     random.seed(args.seed)

     # load data
-    dset, dataset_key_text = data.load_dataset(
+    dset, dataset_key_text = data.load_huggingface_dataset(
         dataset_name=args.dataset_name, subsample_frac=args.subsample_frac)
-    X_train, y_train, X_test, y_test, feature_names = data.convert_text_data_to_counts_array(
+    X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names = data.convert_text_data_to_counts_array(
         dset, dataset_key_text)

     # load model
-    model = DecisionTreeClassifier()
+    model = project_name.model.get_model(args)

     # set up saving dictionary + save params file
     r = defaultdict(list)
@@ -100,7 +103,7 @@ def add_computational_args(parser):
         args=args, save_dir=save_dir, fname='params.json', r=r)

     # fit
-    r = fit_model(model, X_train, y_train, X_test, y_test, feature_names, r)
+    r = fit_model(model, X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names, r)

     # save results
     pkl.dump(r, open(join(save_dir, 'results.pkl'), 'wb'))
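cache_save_utils is not part of this commit, but the flow above implies deriving a deterministic save directory from the non-default argparse values and skipping runs whose results already exist. A minimal sketch of that idea; the helper name and hashing scheme here are assumptions, not the repo's actual cache_save_utils implementation:

import hashlib
import json
import os


def get_save_dir_unique_sketch(parser, args, save_dir_base):
    # keep only args whose value differs from the parser default,
    # so the cache key depends only on non-default settings (assumed scheme)
    non_default = {k: v for k, v in vars(args).items()
                   if parser.get_default(k) != v}
    key = hashlib.md5(json.dumps(non_default, sort_keys=True).encode()).hexdigest()
    save_dir = os.path.join(save_dir_base, key)
    # a run counts as cached if its results file already exists
    already_cached = os.path.exists(os.path.join(save_dir, 'results.pkl'))
    return already_cached, save_dir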
13 changes: 5 additions & 8 deletions experiments/data.py
@@ -8,8 +8,8 @@
 from sklearn.model_selection import train_test_split
 from sklearn.feature_extraction.text import CountVectorizer

-def load_dataset(dataset_name, test_size=0.33, subsample_frac=1.0):
-    """Load dataset + return the relevant dataset key
+def load_huggingface_dataset(dataset_name, subsample_frac=1.0):
+    """Load text dataset from huggingface (with train/validation splits) + return the relevant dataset key
     """
     # load dset
     if dataset_name == 'tweet_eval':
@@ -18,7 +18,7 @@ def load_dataset(dataset_name, test_size=0.33, subsample_frac=1.0):
         train = datasets.load_dataset('financial_phrasebank', 'sentences_75agree',
                                       revision='main', split='train')
         idxs_train, idxs_val = train_test_split(
-            np.arange(len(train)), test_size=test_size, random_state=13)
+            np.arange(len(train)), test_size=0.33, random_state=13)
         dset = datasets.DatasetDict()
         dset['train'] = train.select(idxs_train)
         dset['validation'] = train.select(idxs_val)
@@ -35,10 +35,6 @@ def load_dataset(dataset_name, test_size=0.33, subsample_frac=1.0):
         del dset['unsupervised']
         dset['validation'] = dset['test']

-    # delete test dset
-    if 'test' in dset:
-        del dset['test']
-
     # subsample data
     if subsample_frac > 0:
         dset['train'] = dset['train'].select(range(int(subsample_frac * len(dset['train']))))
@@ -51,4 +47,5 @@ def convert_text_data_to_counts_array(dset, dataset_key_text):
     X_test = v.transform(dset['validation'][dataset_key_text])
     y_test = dset['validation']['label']
     feature_names = v.get_feature_names_out().tolist()
-    return X_train, y_train, X_test, y_test, feature_names
+    X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33, random_state=13)
+    return X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names
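The new return signature above bundles a three-way split: the CV fold is carved out of the vectorized training counts. A self-contained sketch of the same CountVectorizer + train_test_split pattern on toy data (the example strings and shapes are illustrative only):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

texts = ['good movie', 'bad movie', 'great plot', 'terrible plot']
labels = [1, 0, 1, 0]

v = CountVectorizer()
X = v.fit_transform(texts)  # sparse counts matrix, one row per document
X_train, X_cv, y_train, y_cv = train_test_split(
    X, labels, test_size=0.33, random_state=13)
print(X_train.shape, X_cv.shape)  # e.g. (2, 6) and (2, 6)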
12 changes: 12 additions & 0 deletions notebooks/matplotlibrc
@@ -0,0 +1,12 @@
+figure.autolayout : True
+
+axes.titlesize : 20
+axes.labelsize : 15 ## fontsize of the x and y labels
+
+font.size : 15
+
+xtick.labelsize : 15 ## fontsize of the tick labels
+ytick.labelsize : 15 ## fontsize of the tick labels
+
+axes.spines.top : False
+axes.spines.right : False
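Matplotlib reads a matplotlibrc from the current working directory before falling back to the user config, so notebooks run from the notebooks/ directory pick up these defaults automatically. To confirm which file is in effect:

import matplotlib
print(matplotlib.matplotlib_fname())  # path of the rc file matplotlib loaded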
12 changes: 10 additions & 2 deletions project_name/model.py
@@ -1,5 +1,13 @@
 from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import RidgeClassifier
+from sklearn.linear_model import LogisticRegression

 # This is where we would implement our custom model

+def get_model(args):
+    if args.model_name == 'decision_tree':
+        model = DecisionTreeClassifier(max_depth=args.max_depth)
+    elif args.model_name == 'ridge':
+        model = RidgeClassifier(alpha=args.alpha)
+    else:
+        raise ValueError('Invalid model_name: {}'.format(args.model_name))
+    return model
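A quick usage sketch for the new factory; building the namespace by hand here just mimics what argparse would pass in:

import argparse

from project_name.model import get_model

args = argparse.Namespace(model_name='ridge', alpha=1.0, max_depth=2)
model = get_model(args)  # returns RidgeClassifier(alpha=1.0)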
23 changes: 12 additions & 11 deletions readme.md
@@ -1,32 +1,33 @@
-This is an evolving repo optimized for machine-learning projects aimed at designing a new algorithm. They require sweeping over different hyperparameters, comparing to baselines, and iteratively refining an algorithm.
+This is an evolving repo optimized for machine-learning projects aimed at designing a new algorithm. They require sweeping over different hyperparameters, comparing to baselines, and iteratively refining an algorithm. Based on [cookiecutter-data-science](https://github.com/drivendata/cookiecutter-data-science).

 # Organization
-- `project_name`: to be renamed, contains main code for modeling (e.g. model architecture)
-- `experiments`: contains code for runnning experiments (e.g. loading data, training models, evaluating models)
-- `scripts`: contains scripts for running experiments (e.g. python scripts that launch jobs in `experiments` folder with different hyperparams)
-- `notebooks`: contains jupyter notebooks for analyzing results, errors, and making figures
+- `project_name`: should be renamed, contains main code for modeling (e.g. model architecture)
+- `experiments`: code for running experiments (e.g. loading data, training models, evaluating models)
+- `scripts`: scripts for running experiments (e.g. python scripts that launch jobs in `experiments` folder with different hyperparams)
+- `notebooks`: jupyter notebooks for analyzing results and making figures

 # Setup
-- first, rename `project_name` to your project name and modify `setup.py` accordingly
 - clone and run `pip install -e .`, resulting in a package named `project_name` that can be imported
+- first, rename `project_name` to your project name and modify `setup.py` accordingly
-- example run: run `python scripts/01_train_models.py` then load the results in `notebooks/01_model_results.ipynb`
+- example run: run `python scripts/01_train_models.py` (which calls `experiments/01_train_model.py`) then view the results in `notebooks/01_model_results.ipynb`

 # Features
 - scripts sweep over hyperparameters using easy-to-specify python code
 - experiments automatically cache runs that have already completed
   - caching uses the (**non-default**) arguments in the argparse namespace
 - notebooks can easily evaluate results aggregated over multiple experiments using pandas (see the sketch after this file's diff)
+- binary arguments should start with the word "use" (e.g. `--use_caching`) and take values 0 or 1

 # Guidelines
-- Huggingface whenever possible, then pytorch
 - See some useful packages [here](https://csinva.io/blog/misc/ml_coding_tips)
 - Avoid notebooks whenever possible (ideally, only for analyzing results, making figures)
 - Paths should be specified relative to a file's location (e.g. `os.path.join(os.path.dirname(__file__), 'data')`)
 - Naming variables: use the main thing first followed by the modifiers (e.g. `X_train`, `acc_test`)
-- binary arguments should start with the word "use" (e.g. `--use_caching`) and take values 0 or 1
 - Use logging instead of print
 - Use argparse and sweep over hyperparams using python scripts (or [amulet](https://amulet-docs.azurewebsites.net/main/index.html))
   - Note, arguments get passed as strings so shouldn't pass args that aren't primitives or a list of primitives (more complex structures should be handled in the experiments code)
 - Each run should save a single pickle file of its results
-- Everything should run end-to-end with one script (caching things along the way)
+- All experiments that depend on each other should run end-to-end with one script (caching things along the way)
 - Keep an updated requirements.txt (required for amulet)
-- Follow sklearn apis whenever possible
+- Follow sklearn apis whenever possible
+- Use Huggingface whenever possible, then pytorch
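For the pandas bullet in Features above, a minimal sketch of aggregating the per-run pickles, assuming each run directory under results/ holds the results.pkl written by the training script:

import glob
import os
import pickle as pkl

import pandas as pd

rows = [pkl.load(open(f, 'rb'))
        for f in glob.glob(os.path.join('results', '*', 'results.pkl'))]
df = pd.DataFrame(rows)
# choose hyperparameters on the cv split; report test accuracy only for the winner
best = df.sort_values('acc_cv', ascending=False).iloc[0]
print(best['acc_test'])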
36 changes: 19 additions & 17 deletions scripts/01_train_linear_models.py
@@ -1,10 +1,16 @@
 import submit_utils
+from os.path import dirname
+import os.path
+repo_dir = dirname(dirname(os.path.abspath(__file__)))

+# Showcasing different ways to sweep over arguments
+# Can pass any empty dict for any of these to avoid sweeping

+# List of values to sweep over (sweeps over all combinations of these)
 params_shared_dict = {
     'seed': [1, 2],
-    'save_dir': ['tmp'],
-    'lr': [0.1, 0.01],
+    'save_dir': ['results'],
+    'use_cache': [0],  # pass binary values with 0/1 instead of the ambiguous strings True/False
 }

 # List of tuples to sweep over (these values are coupled, and swept over together)
@@ -15,22 +21,18 @@
     ],
     ('model_name', 'max_depth'): [
         ('decision_tree', i)
-        for i in range(1, 4)
+        for i in range(2, 4)
     ],
 }
-# print(params_coupled_dict.keys())

-# FREEZE PARAMS
-# (x, xvalue)....

-# IMPOSSIBLE PAIRINGS...

 # If you want to couple long things, you would need to duplicate and modify this script
 # (e.g. decision trees and linear models have very different params, so each would have a separate script)

-submit_utils.run_dicts(
-    params_shared_dict,
-    params_coupled_dict,
-    script_name='03_train_prefix.py',
-    actually_run=False
-)
+# Args list is a list of dictionaries
+# If you want to do something special to remove some of these runs, can remove them before calling run_args_list
+args_list = submit_utils.get_args_list(
+    params_shared_dict=params_shared_dict,
+    params_coupled_dict=params_coupled_dict,
+)
+submit_utils.run_args_list(
+    args_list,
+    script_name=os.path.join(repo_dir, 'experiments', '01_train_model.py'),
+    actually_run=True,
+)
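submit_utils itself is not shown in this commit; a rough sketch of the expansion get_args_list implies: every combination of the shared values, crossed with each coupled tuple. The function name and exact semantics here are assumptions based on the calling code above.

import itertools


def get_args_list_sketch(params_shared_dict, params_coupled_dict):
    # every combination of the shared values
    shared_combos = [dict(zip(params_shared_dict, vals))
                     for vals in itertools.product(*params_shared_dict.values())]
    # each coupled tuple becomes one dict (or a no-op if the dict is empty)
    coupled_combos = [dict(zip(keys, vals))
                      for keys, tuples in params_coupled_dict.items()
                      for vals in tuples] or [{}]
    return [{**shared, **coupled}
            for shared in shared_combos
            for coupled in coupled_combos]

With the dicts above, each (seed, save_dir, use_cache) combination is crossed with every coupled (model_name, ...) tuple, and each resulting dict becomes one invocation of experiments/01_train_model.py.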