From 0b16dabf96069af5526ac6216998e1ad2e98f18c Mon Sep 17 00:00:00 2001
From: Joe
Date: Mon, 15 Aug 2022 14:03:34 -0700
Subject: [PATCH 1/2] Update run_translation_no_trainer.py

Found an error in selecting the `no_decay` parameters, plus some small
modifications for when the user continues training from a checkpoint.
---
 .../translation/run_translation_no_trainer.py       | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index a6b0988f63d090..ab1974a60ac0c4 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -510,7 +510,7 @@ def preprocess_function(examples):
 
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -607,10 +607,15 @@ def postprocess_text(preds, labels):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -619,7 +624,9 @@ def postprocess_text(preds, labels):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
             outputs = model(**batch)
             loss = outputs.loss

From 78753018375db3b2786ff1a0df2aba731cdaf717 Mon Sep 17 00:00:00 2001
From: Joe
Date: Mon, 15 Aug 2022 14:45:03 -0700
Subject: [PATCH 2/2] fix `no_decay` and `resume_step` issues

1. change the `no_decay` list
2. if users continue to train their model from a provided checkpoint,
   `resume_step` will not be initialized properly when
   `args.gradient_accumulation_steps != 1`
---
 .../pytorch/language-modeling/run_clm_no_trainer.py | 13 ++++++++++---
 .../pytorch/language-modeling/run_mlm_no_trainer.py | 11 +++++++++--
 .../translation/run_translation_no_trainer.py       |  2 +-
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 3fd67d5fbf66e4..225b88a49440cc 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -464,7 +464,7 @@ def group_texts(examples):
 
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -558,10 +558,15 @@ def group_texts(examples):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -570,7 +575,9 @@ def group_texts(examples):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
 
             with accelerator.accumulate(model):
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 80dfcf9a9194e5..c5f6aad4126f5a 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -602,10 +602,15 @@ def group_texts(examples):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -614,7 +619,9 @@ def group_texts(examples):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
 
             with accelerator.accumulate(model):
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index ab1974a60ac0c4..34c2ad1964090f 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -510,7 +510,7 @@ def preprocess_function(examples):
 
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "layer_norm.weight"]
+    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
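Editor's note (illustration, not part of the patch): the resume-step change above rests on the fact that `step_{N}` checkpoints count optimizer updates, while the inner loop's `step` counts dataloader batches; with gradient accumulation the two differ by a factor of `args.gradient_accumulation_steps`. The following is a minimal Python sketch of that arithmetic only. The names `resume_position`, `batches_per_epoch`, and `grad_accum_steps` are hypothetical and do not appear in the example scripts.

# Standalone sketch of the resume-step arithmetic implemented by the patch.
def resume_position(checkpoint_name: str, batches_per_epoch: int, grad_accum_steps: int):
    """Map a `step_{N}` checkpoint (N = optimizer updates) back to an epoch
    index and an in-epoch dataloader batch offset."""
    optimizer_steps = int(checkpoint_name.replace("step_", ""))
    # Each optimizer update consumed `grad_accum_steps` dataloader batches.
    dataloader_steps = optimizer_steps * grad_accum_steps
    starting_epoch = dataloader_steps // batches_per_epoch
    resume_step = dataloader_steps - starting_epoch * batches_per_epoch
    return starting_epoch, resume_step

# e.g. 300 optimizer updates with 4-step accumulation and 500 batches per epoch
# -> 1200 batches consumed -> epoch index 2, batch offset 200
print(resume_position("step_300", batches_per_epoch=500, grad_accum_steps=4))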