Skip to content

Commit

Permalink
W&B resume ddp from run link fix (ultralytics#2579)
Browse files Browse the repository at this point in the history
* W&B resume ddp from run link fix

* Native DDP W&B support for training, resuming

(cherry picked from commit 518c095)
  • Loading branch information
AyushExel authored and Lechtr committed Mar 28, 2021
1 parent 5cedb0c commit 7ce5755
Showing 1 changed file with 51 additions and 13 deletions.
64 changes: 51 additions & 13 deletions utils/wandb_logging/wandb_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
WANDB_ARTIFACT_PREFIX = 'wandb-artifact://'


def remove_prefix(from_string, prefix):
def remove_prefix(from_string, prefix=WANDB_ARTIFACT_PREFIX):
return from_string[len(prefix):]


Expand All @@ -33,36 +33,74 @@ def check_wandb_config_file(data_config_file):
return wandb_config
return data_config_file

def get_run_info(run_path):
run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX))
run_id = run_path.stem
project = run_path.parent.stem
model_artifact_name = 'run_' + run_id + '_model'
return run_id, project, model_artifact_name

def resume_and_get_id(opt):
# It's more elegant to stick to 1 wandb.init call, but as useful config data is overwritten in the WandbLogger's wandb.init call
def check_wandb_resume(opt):
process_wandb_config_ddp_mode(opt) if opt.global_rank not in [-1, 0] else None
if isinstance(opt.resume, str):
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
run_path = Path(remove_prefix(opt.resume, WANDB_ARTIFACT_PREFIX))
run_id = run_path.stem
project = run_path.parent.stem
model_artifact_name = WANDB_ARTIFACT_PREFIX + 'run_' + run_id + '_model'
assert wandb, 'install wandb to resume wandb runs'
# Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
run = wandb.init(id=run_id, project=project, resume='allow', group=opt.wandb_group,)
opt.resume = model_artifact_name
return run
if opt.global_rank not in [-1, 0]: # For resuming DDP runs
run_id, project, model_artifact_name = get_run_info(opt.resume)
api = wandb.Api()
artifact = api.artifact(project + '/' + model_artifact_name + ':latest')
modeldir = artifact.download()
opt.weights = str(Path(modeldir) / "last.pt")
return True
return None

def process_wandb_config_ddp_mode(opt):
with open(opt.data) as f:
data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict
train_dir, val_dir = None, None
if data_dict['train'].startswith(WANDB_ARTIFACT_PREFIX):
api = wandb.Api()
train_artifact = api.artifact(remove_prefix(data_dict['train']) + ':' + opt.artifact_alias)
train_dir = train_artifact.download()
train_path = Path(train_dir) / 'data/images/'
data_dict['train'] = str(train_path)

if data_dict['val'].startswith(WANDB_ARTIFACT_PREFIX):
api = wandb.Api()
val_artifact = api.artifact(remove_prefix(data_dict['val']) + ':' + opt.artifact_alias)
val_dir = val_artifact.download()
val_path = Path(val_dir) / 'data/images/'
data_dict['val'] = str(val_path)
if train_dir or val_dir:
ddp_data_path = str(Path(val_dir) / 'wandb_local_data.yaml')
with open(ddp_data_path, 'w') as f:
yaml.dump(data_dict, f)
opt.data = ddp_data_path



class WandbLogger():
def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
# Pre-training routine --
self.job_type = job_type
self.wandb, self.wandb_run, self.data_dict = wandb, None if not wandb else wandb.run, data_dict
if self.wandb:
# It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
if isinstance(opt.resume, str): # checks resume from artifact
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
run_id, project, model_artifact_name = get_run_info(opt.resume)
model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name
assert wandb, 'install wandb to resume wandb runs'
# Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
self.wandb_run = wandb.init(id=run_id, project=project, resume='allow')
opt.resume = model_artifact_name
elif self.wandb:
self.wandb_run = wandb.init(config=opt,
resume="allow",
project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
name=name,
job_type=job_type,
group=opt.wandb_group,
id=run_id) if not wandb.run else wandb.run
if self.wandb_run:
if self.job_type == 'Training':
if not opt.resume:
wandb_data_dict = self.check_and_upload_dataset(opt) if opt.upload_dataset else data_dict
Expand Down

0 comments on commit 7ce5755

Please sign in to comment.