diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index f10a54add79158..1bd190d1303e9a 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -212,9 +212,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+    logger.info(accelerator.state)
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -384,8 +389,8 @@ def collate_fn(examples):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -467,17 +472,20 @@ def collate_fn(examples):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1

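The scripts below all receive the same two changes: tracking kwargs are only passed to `Accelerator` when `--with_tracking` is set, and the manual gradient-accumulation bookkeeping (dividing the loss by `args.gradient_accumulation_steps` and the `step % args.gradient_accumulation_steps` check) is replaced by Accelerate's `accumulate` context manager. A minimal, self-contained sketch of the resulting loop; the toy model, optimizer, scheduler, and data are illustrative stand-ins, not objects defined in this patch:

```python
import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader, TensorDataset

accumulation_steps = 4  # stands in for args.gradient_accumulation_steps
accelerator = Accelerator(gradient_accumulation_steps=accumulation_steps)

# Toy stand-ins so the sketch runs end to end.
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: 1.0)
dataset = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
train_dataloader = DataLoader(dataset, batch_size=8)

model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)

completed_steps = 0
for inputs, labels in train_dataloader:
    with accelerator.accumulate(model):
        # Inside `accumulate`, Accelerate scales the loss and skips the real
        # optimizer step until enough micro-batches have accumulated, so the
        # manual `loss / N` division and `step % N` check go away.
        loss = torch.nn.functional.cross_entropy(model(inputs), labels)
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # True only on micro-batches where an actual optimizer update ran.
    if accelerator.sync_gradients:
        completed_steps += 1

print(completed_steps)  # 8 micro-batches with accumulation 4 -> 2 updates
```

As far as I can tell, the old end-of-epoch flush (`step == len(train_dataloader) - 1`) is also covered, since Accelerate tracks when a prepared dataloader reaches its final batch and forces a sync there.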
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 21dc568fd44822..3fd67d5fbf66e4 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -249,9 +249,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -486,8 +491,8 @@ def group_texts(examples):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -567,17 +572,20 @@ def group_texts(examples):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1

diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index b7b085e5b61bea..80dfcf9a9194e5 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -258,9 +258,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -530,8 +535,8 @@ def group_texts(examples):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -611,17 +616,20 @@ def group_texts(examples):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1

diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index a3868434b28504..eeb04e417fdfd6 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -65,7 +65,7 @@


 def parse_args():
-    parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
+    parser = argparse.ArgumentParser(description="Finetune a transformers model on a multiple choice task")
     parser.add_argument(
         "--dataset_name",
         type=str,
@@ -284,9 +284,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -483,8 +488,8 @@ def preprocess_function(examples):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -567,17 +572,20 @@ def preprocess_function(examples):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1

diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index 69ddf24ab5aa49..370dd3f43d9545 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -297,8 +297,16 @@ def main():
     send_example_telemetry("run_qa_beam_search_no_trainer", args)

     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
-    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
+    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers
+    # in the environment
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -739,8 +747,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -818,17 +826,22 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+
+                accelerator.backward(loss)
+
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1

diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index d98dca22bf2e48..6bf4eb28e99418 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -337,9 +337,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -757,8 +762,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -839,17 +844,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1

diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
index 7ffb876d4db58f..30cb7cc53ae318 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -326,9 +326,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     logger.info(accelerator.state, main_process_only=False)
     if accelerator.is_local_main_process:
         datasets.utils.logging.set_verbosity_warning()
@@ -487,8 +492,8 @@ def preprocess_val(example_batch):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -563,17 +568,20 @@ def preprocess_val(example_batch):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1

diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index ca9ef6ba9fa241..96781b6dcadbdd 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -330,9 +330,13 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
     if args.source_prefix is None and args.model_name_or_path in [
         "t5-small",
         "t5-base",
@@ -552,8 +556,8 @@ def postprocess_text(preds, labels):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )

     # Prepare everything with our `accelerator`.
@@ -635,17 +639,20 @@ def postprocess_text(preds, labels):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
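One scaling choice above is worth spelling out: every `get_scheduler` call now multiplies `num_warmup_steps` and `max_train_steps` by `args.gradient_accumulation_steps`. As I read the change, `max_train_steps` counts optimizer updates, while `lr_scheduler.step()` is now reached on every micro-batch inside the `accumulate` block, so the scheduler must be given the micro-batch count to stay aligned with real updates. A hedged arithmetic sketch, with hypothetical values:

```python
# Hypothetical values, for illustration only.
num_warmup_steps = 100           # warmup measured in optimizer updates
gradient_accumulation_steps = 4  # micro-batches per optimizer update

# lr_scheduler.step() runs once per micro-batch, so finishing warmup after
# 100 real updates requires 100 * 4 = 400 scheduler steps.
print(num_warmup_steps * gradient_accumulation_steps)  # 400
```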