diff --git a/.gitignore b/.gitignore
index 31a93e1f..5296d863 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,5 +160,7 @@ cython_debug/
 #.idea/
 .vscode/
 
-# files needed to train
+# misc files
 dataset/
+runs/
+checkpoints/
diff --git a/README.md b/README.md
index fb44ec23..b17d404a 100644
--- a/README.md
+++ b/README.md
@@ -94,7 +94,13 @@ pip install -r requirements.txt
 
 ### Training
 
-To be added.
+You can invoke the training via the command below.
+
+```bash
+bash ./scripts/train.sh
+```
+
+You can also modify the arguments in `train.sh` to suit your own needs.
 
 ### Inference
 
diff --git a/requirements.txt b/requirements.txt
index 1ec00615..638876bd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ torchvision
 datasets
 transformers
 av
+tensorboard
diff --git a/scripts/train.sh b/scripts/train.sh
new file mode 100644
index 00000000..bd02dbd2
--- /dev/null
+++ b/scripts/train.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+#
+# Launch single-node training with torchrun.
+#
+# Usage: bash ./scripts/train.sh [GPUS]
+#   GPUS  value passed to torchrun's --nproc_per_node (default: 8)
+
+set -euo pipefail
+
+# get args (":-" supplies a default; a bare ":8" is a substring offset)
+GPUS=${1:-8}
+
+# get root dir
+FOLDER_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+ROOT_DIR="$FOLDER_DIR/.."
+
+# go to root dir so the relative paths below resolve
+cd -- "$ROOT_DIR"
+
+# define dataset shards
+COLLATED_VIDEO_DIR=./dataset/MSRVTT-collated/val/videos
+PROCESSED_DATASET=(
+    ./dataset/MSRVTT-processed/val/part-00000
+    ./dataset/MSRVTT-processed/val/part-00001
+    ./dataset/MSRVTT-processed/val/part-00002
+    ./dataset/MSRVTT-processed/val/part-00003
+    ./dataset/MSRVTT-processed/val/part-00004
+    ./dataset/MSRVTT-processed/val/part-00005
+    ./dataset/MSRVTT-processed/val/part-00006
+    ./dataset/MSRVTT-processed/val/part-00007
+    ./dataset/MSRVTT-processed/val/part-00008
+    ./dataset/MSRVTT-processed/val/part-00009
+)
+
+# run single node training
+# "${PROCESSED_DATASET[@]}" passes every shard; a bare $PROCESSED_DATASET
+# expands to the first element only.
+# NOTE(review): assumes train.py's --dataset accepts multiple paths -- confirm.
+torchrun --standalone \
+    --nproc_per_node "$GPUS" \
+    train.py \
+    --epochs 1 \
+    --batch_size 1 \
+    --lr 1e-4 \
+    --accumulation_steps 32 \
+    --grad_checkpoint \
+    --dataset "${PROCESSED_DATASET[@]}" \
+    --video_dir "$COLLATED_VIDEO_DIR" \
+    --save_interval 224 \
+    --checkpoint_dir ./checkpoints \
+    --tensorboard_dir ./runs