Skip to content

Commit

Permalink
Add logging of units to dllogger metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
jan-golda authored and nv-kkudrynski committed Jun 27, 2022
1 parent 048370a commit e372b7b
Show file tree
Hide file tree
Showing 57 changed files with 382 additions and 77 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def __init__(self, logger: Optional[Logger] = None):
self.logger = logger
self.scheduler = None

self.logger.log_metadata('learning rate', {'unit': None})

@abstractmethod
def get_scheduler(self, optimizer, args):
pass
Expand Down Expand Up @@ -93,6 +95,9 @@ def __init__(self, logger, targets_std, prefix=''):
self.prefix = prefix
self.best_mae = float('inf')

self.logger.log_metadata(f'{self.prefix} MAE', {'unit': None})
self.logger.log_metadata(f'{self.prefix} best MAE', {'unit': None})

def on_validation_step(self, input, target, pred):
self.mae(pred.detach(), target.detach())

Expand Down Expand Up @@ -126,6 +131,12 @@ def __init__(self, logger, batch_size: int, warmup_epochs: int = 1, mode: str =
self.mode = mode
self.logger = logger

logger.log_metadata(f"throughput_{self.mode}", {'unit': 'molecules/s'})
logger.log_metadata(f"total_time_{self.mode}", {'unit': 's'})
logger.log_metadata(f"latency_{self.mode}_mean", {'unit': 's'})
for level in [90, 95, 99]:
logger.log_metadata(f"latency_{self.mode}_{level}", {'unit': 's'})

def on_batch_start(self):
if self.epoch >= self.warmup_epochs:
self.timestamps.append(time.time() * 1000.0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ class Logger(ABC):
def log_hyperparams(self, params):
pass

@rank_zero_only
@abstractmethod
def log_metadata(self, metric, metadata):
    """Record per-metric metadata (e.g. ``{'unit': 's'}``) for *metric*.

    Abstract hook implemented by concrete loggers; decorated with
    ``rank_zero_only`` so it only runs on the rank-zero process in
    distributed training.

    Args:
        metric: Name of the metric the metadata applies to.
        metadata: Dict of metadata entries (this commit passes ``'unit'``).
    """
    pass

@rank_zero_only
@abstractmethod
def log_metrics(self, metrics, step=None):
Expand Down Expand Up @@ -81,6 +86,11 @@ def log_hyperparams(self, params):
for logger in self.loggers:
logger.log_hyperparams(params)

@rank_zero_only
def log_metadata(self, metric, metadata):
    """Fan a metric-metadata record out to every wrapped logger."""
    for child_logger in self.loggers:
        child_logger.log_metadata(metric, metadata)


class DLLogger(Logger):
def __init__(self, save_dir: pathlib.Path, filename: str):
Expand All @@ -95,6 +105,10 @@ def log_hyperparams(self, params):
params = self._sanitize_params(params)
dllogger.log(step="PARAMETER", data=params)

@rank_zero_only
def log_metadata(self, metric, metadata):
    """Forward the metadata record to the global ``dllogger`` backend."""
    dllogger.metadata(metric, metadata)

@rank_zero_only
def log_metrics(self, metrics, step=None):
if step is None:
Expand Down Expand Up @@ -126,6 +140,10 @@ def log_hyperparams(self, params: Dict[str, Any]) -> None:
params = self._sanitize_params(params)
self.experiment.config.update(params, allow_val_change=True)

@rank_zero_only
def log_metadata(self, metric, metadata):
    # Intentional no-op: this backend (Weights & Biases, judging by the
    # surrounding `self.experiment.config` usage — confirm) has no
    # per-metric metadata API equivalent to dllogger.metadata().
    pass

@rank_zero_only
def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
if step is not None:
Expand Down
8 changes: 8 additions & 0 deletions MxNet/Classification/RN50v1.5/log_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,11 @@ def setup_logging(args):
])
else:
dllogger.init([])

dllogger.metadata("val.accuracy", {"unit": None})
dllogger.metadata("val.top_k_accuracy_5", {"unit": None})
dllogger.metadata("train.ips", {"unit": "images/s"})
dllogger.metadata("val.ips", {"unit": "images/s"})
dllogger.metadata("val.latency_50", {"unit": "s"})
dllogger.metadata("val.latency_90", {"unit": "s"})
dllogger.metadata("val.latency_avg", {"unit": "s"})
Original file line number Diff line number Diff line change
Expand Up @@ -338,10 +338,10 @@ def epoch_generator_wrapper(self, gen):

class Metrics:
ACC_METADATA = {"unit": "%", "format": ":.2f"}
IPS_METADATA = {"unit": "img/s", "format": ":.2f"}
IPS_METADATA = {"unit": "images/s", "format": ":.2f"}
TIME_METADATA = {"unit": "s", "format": ":.5f"}
LOSS_METADATA = {"format": ":.5f"}
LR_METADATA = {"format": ":.5f"}
LOSS_METADATA = {"unit": None, "format": ":.5f"}
LR_METADATA = {"unit": None, "format": ":.5f"}

def __init__(self, logger):
self.logger = logger
Expand Down
5 changes: 5 additions & 0 deletions PyTorch/Classification/GPUNet/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -1100,6 +1100,11 @@ def main():
)
else:
dllogger.init(backends=[])

dllogger.metadata("top1", {"unit": None})
dllogger.metadata("top5", {"unit": None})
dllogger.metadata("average_ips", {"unit": "images/s"})

data_config = resolve_data_config(
vars(args), model=model, verbose=args.local_rank == 0
)
Expand Down
7 changes: 7 additions & 0 deletions PyTorch/Detection/Efficientdet/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,13 @@ def main():
assert args.rank >= 0

setup_dllogger(args.rank, filename=args.dllogger_file)
dllogger.metadata('eval_batch_time', {'unit': 's'})
dllogger.metadata('train_batch_time', {'unit': 's'})
dllogger.metadata('eval_throughput', {'unit': 'images/s'})
dllogger.metadata('train_throughput', {'unit': 'images/s'})
dllogger.metadata('eval_loss', {'unit': None})
dllogger.metadata('train_loss', {'unit': None})
dllogger.metadata('map', {'unit': None})

if args.distributed:
logging.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
Expand Down
5 changes: 5 additions & 0 deletions PyTorch/Detection/Efficientdet/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ def add_bool_arg(parser, name, default=False, help=''): # FIXME move to utils

def validate(args):
setup_dllogger(0, filename=args.dllogger_file)
dllogger.metadata('total_inference_time', {'unit': 's'})
dllogger.metadata('inference_throughput', {'unit': 'images/s'})
dllogger.metadata('inference_time', {'unit': 's'})
dllogger.metadata('map', {'unit': None})
dllogger.metadata('total_eval_time', {'unit': 's'})

if args.checkpoint != '':
args.pretrained = True
Expand Down
6 changes: 6 additions & 0 deletions PyTorch/Detection/SSD/ssd/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def __init__(self, name, json_output=None, log_interval=20):
backends.append(DLLogger.JSONStreamBackend(DLLogger.Verbosity.VERBOSE, json_output))

DLLogger.init(backends)
DLLogger.metadata("mAP", {"unit": None})

self.epoch = 0
self.train_iter = 0
Expand Down Expand Up @@ -137,6 +138,11 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.images_per_ses = BenchmarkMeter(self.name)

DLLogger.metadata("avg_img/sec", {"unit": "images/s"})
DLLogger.metadata("med_img/sec", {"unit": "images/s"})
DLLogger.metadata("min_img/sec", {"unit": "images/s"})
DLLogger.metadata("max_img/sec", {"unit": "images/s"})

def update(self, bs, time):
self.images_per_ses.update(bs, time)

Expand Down
37 changes: 20 additions & 17 deletions PyTorch/Forecasting/TFT/log_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,23 +101,26 @@ def step_format(step):
container_setup_info = {**get_framework_env_vars(), **get_system_info()}
dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)

dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'})
dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
dllogger.metadata('latency_p90', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
dllogger.metadata('latency_p95', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
dllogger.metadata('latency_p99', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f', 'unit': None})
dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f', 'unit': None})
dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f', 'unit': None})
dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f', 'unit': None})
dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f', 'unit': 'items/s'})
dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f', 'unit': None})
dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f', 'unit': None})
dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f', 'unit': None})
dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f', 'unit': None})
dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f', 'unit': 'items/s'})
dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': None})
dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': None})
dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': None})
dllogger.metadata('sum', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': None})
dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f', 'unit': 'items/s'})
dllogger.metadata('latency_avg', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': 's'})
dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': 's'})
dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': 's'})
dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': 's'})
dllogger.metadata('average_ips', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f', 'unit': 'items/s'})


def get_framework_env_vars():
Expand Down
3 changes: 3 additions & 0 deletions PyTorch/LanguageModeling/BART/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,9 @@ def main(args, model=None) -> SummarizationModule:
else:
dllogger.init(backends=[])

dllogger.metadata("avg_train_time", {"unit": "s"})
dllogger.metadata("avg_train_throughput", {"unit": "tokens/s"})

main(args)

dllogger.flush()
4 changes: 4 additions & 0 deletions PyTorch/LanguageModeling/BART/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,10 @@ def run_generate(verbose=True):
else:
dllogger.init(backends=[])

dllogger.metadata("inference_throughput_mean", {"unit": "tokens/s"})
for suffix in ["mean", "conf_50", "conf_90", "conf_95", "conf_99", "conf_100"]:
dllogger.metadata(f"inference_latency_{suffix}", {"unit": "s"})

if parsed_args and verbose:
print(f"parsed the following generate kwargs: {parsed_args}")

Expand Down
7 changes: 7 additions & 0 deletions PyTorch/LanguageModeling/BERT/run_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,13 @@ def main(args):
else:
dllogger.init(backends=[])

dllogger.metadata("e2e_train_time", {"unit": "s"})
dllogger.metadata("training_sequences_per_second", {"unit": "sequences/s"})
dllogger.metadata("e2e_inference_time", {"unit": "s"})
dllogger.metadata("inference_sequences_per_second", {"unit": "sequences/s"})
dllogger.metadata("exact_match", {"unit": None})
dllogger.metadata("F1", {"unit": None})

dllogger.log(step="PARAMETER", data={"Config": [str(args)]})

if args.gradient_accumulation_steps < 1:
Expand Down
5 changes: 5 additions & 0 deletions PyTorch/LanguageModeling/BERT/run_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,11 @@ def setup_training(args):
else:
dllogger.init(backends=[])

dllogger.metadata("e2e_train_time", {"unit": "s"})
dllogger.metadata("training_sequences_per_second", {"unit": "sequences/s"})
dllogger.metadata("final_loss", {"unit": None})
dllogger.metadata("raw_train_time", {"unit": "s"})

print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

Expand Down
8 changes: 8 additions & 0 deletions PyTorch/LanguageModeling/BERT/run_squad.py
Original file line number Diff line number Diff line change
Expand Up @@ -875,6 +875,14 @@ def main():
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
else:
dllogger.init(backends=[])

dllogger.metadata("e2e_train_time", {"unit": "s"})
dllogger.metadata("training_sequences_per_second", {"unit": "sequences/s"})
dllogger.metadata("final_loss", {"unit": None})
dllogger.metadata("e2e_inference_time", {"unit": "s"})
dllogger.metadata("inference_sequences_per_second", {"unit": "sequences/s"})
dllogger.metadata("exact_match", {"unit": None})
dllogger.metadata("F1", {"unit": None})

print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
Expand Down
8 changes: 8 additions & 0 deletions PyTorch/LanguageModeling/Transformer-XL/pytorch/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,14 @@ def main():
logging.info(args)
dllogger.log(step='PARAMETER', data=vars(args))

dllogger.metadata('eval_throughput', {'unit': 'tokens/s'})
dllogger.metadata('eval_loss', {'unit': None})
dllogger.metadata('eval_perplexity', {'unit': None})
dllogger.metadata('eval_latency', {'unit': 'ms'})
dllogger.metadata('eval_avg_latency', {'unit': 'ms'})
for p in args.percentiles:
dllogger.metadata(f'eval_{p}%_latency', {'unit': 'ms'})

if not args.no_env:
log_env_info()

Expand Down
8 changes: 8 additions & 0 deletions PyTorch/LanguageModeling/Transformer-XL/pytorch/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,14 @@ def main():
logging.info(args)
dllogger.log(step='PARAMETER', data=vars(args))

dllogger.metadata('train_throughput', {'unit': 'tokens/s'})
dllogger.metadata('train_elapsed', {'unit': 'min'})
dllogger.metadata('valid_elapsed', {'unit': 'min'})
dllogger.metadata('train_perplexity', {'unit': None})
dllogger.metadata('valid_perplexity', {'unit': None})
dllogger.metadata('train_loss', {'unit': None})
dllogger.metadata('valid_loss', {'unit': None})

logging.info(f'world size: {utils.distributed.get_world_size()}')

if not args.no_env:
Expand Down
9 changes: 9 additions & 0 deletions PyTorch/Recommendation/DLRM/dlrm/scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,15 @@ def init_logging(log_path):

dllogger.init(backends=[json_backend, stdout_backend])

dllogger.metadata("best_auc", {"unit": None})
dllogger.metadata("mean_inference_latency_batch_1", {"unit": "s"})
dllogger.metadata("mean_inference_latency_batch_64", {"unit": "s"})
dllogger.metadata("mean_inference_latency_batch_4096", {"unit": "s"})
dllogger.metadata("average_train_throughput", {"unit": "samples/s"})
dllogger.metadata("mean_inference_throughput_batch_1", {"unit": "samples/s"})
dllogger.metadata("mean_inference_throughput_batch_64", {"unit": "samples/s"})
dllogger.metadata("mean_inference_throughput_batch_4096", {"unit": "samples/s"})


class StepTimer():
def __init__(self):
Expand Down
7 changes: 7 additions & 0 deletions PyTorch/Recommendation/NCF/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def main():

dllogger.log(data=vars(args), step='PARAMETER')



model = NeuMF(nb_users=args.n_users, nb_items=args.n_items, mf_dim=args.factors,
mlp_layer_sizes=args.layers, dropout=args.dropout)

Expand Down Expand Up @@ -100,6 +102,11 @@ def main():
result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 95)
result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 99)

for batch_size in batch_sizes:
dllogger.metadata(f'batch_{batch_size}_mean_throughput', {'unit': 'samples/s'})
for p in ['mean', 'p90', 'p95', 'p99']:
dllogger.metadata(f'batch_{batch_size}_{p}_latency', {'unit': 's'})

dllogger.log(data=result_data, step=tuple())
dllogger.flush()
return
Expand Down
18 changes: 13 additions & 5 deletions PyTorch/Recommendation/NCF/ncf.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,19 @@ def main():
else:
dllogger.init(backends=[])

dllogger.metadata('train_throughput', {"name": 'train_throughput', 'format': ":.3e"})
dllogger.metadata('hr@10', {"name": 'hr@10', 'format': ":.5f"})
dllogger.metadata('train_epoch_time', {"name": 'train_epoch_time', 'format': ":.3f"})
dllogger.metadata('validation_epoch_time', {"name": 'validation_epoch_time', 'format': ":.3f"})
dllogger.metadata('eval_throughput', {"name": 'eval_throughput', 'format': ":.3e"})
dllogger.metadata('train_throughput', {"name": 'train_throughput', 'unit': 'samples/s', 'format': ":.3e"})
dllogger.metadata('best_train_throughput', {'unit': 'samples/s'})
dllogger.metadata('mean_train_throughput', {'unit': 'samples/s'})
dllogger.metadata('eval_throughput', {"name": 'eval_throughput', 'unit': 'samples/s', 'format': ":.3e"})
dllogger.metadata('best_eval_throughput', {'unit': 'samples/s'})
dllogger.metadata('mean_eval_throughput', {'unit': 'samples/s'})
dllogger.metadata('train_epoch_time', {"name": 'train_epoch_time', 'unit': 's', 'format': ":.3f"})
dllogger.metadata('validation_epoch_time', {"name": 'validation_epoch_time', 'unit': 's', 'format': ":.3f"})
dllogger.metadata('time_to_target', {'unit': 's'})
dllogger.metadata('time_to_best_model', {'unit': 's'})
dllogger.metadata('hr@10', {"name": 'hr@10', 'unit': None, 'format': ":.5f"})
dllogger.metadata('best_accuracy', {'unit': None})
dllogger.metadata('best_epoch', {'unit': None})

dllogger.log(data=vars(args), step='PARAMETER')

Expand Down
8 changes: 8 additions & 0 deletions PyTorch/Segmentation/MaskRCNN/pytorch/tools/test_net.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,14 @@ def main():
else:
dllogger.init(backends=[])

dllogger.metadata("BBOX_mAP", {"unit": None})
dllogger.metadata("MASK_mAP", {"unit": None})
dllogger.metadata("e2e_infer_time", {"unit": "s"})
dllogger.metadata("inference_perf_fps", {"unit": "images/s"})
dllogger.metadata("latency_avg", {"unit": "s"})
dllogger.metadata("latency_90", {"unit": "s"})
dllogger.metadata("latency_95", {"unit": "s"})
dllogger.metadata("latency_99", {"unit": "s"})

save_dir = ""
dllogger.log(step="PARAMETER", data={"config":cfg})
Expand Down
5 changes: 5 additions & 0 deletions PyTorch/Segmentation/MaskRCNN/pytorch/tools/train_net.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,11 @@ def main():
else:
dllogger.init(backends=[])

dllogger.metadata("BBOX_mAP", {"unit": None})
dllogger.metadata("MASK_mAP", {"unit": None})
dllogger.metadata("e2e_train_time", {"unit": "s"})
dllogger.metadata("train_perf_fps", {"unit": "images/s"})

dllogger.log(step="PARAMETER", data={"gpu_count":num_gpus})
# dllogger.log(step="PARAMETER", data={"environment_info": collect_env_info()})
dllogger.log(step="PARAMETER", data={"config_file": args.config_file})
Expand Down
Loading

0 comments on commit e372b7b

Please sign in to comment.