Skip to content

Commit

Permalink
Style fix
Browse files Browse the repository at this point in the history
  • Loading branch information
mariosasko committed Mar 4, 2022
1 parent 5f9750f commit a637d64
Showing 1 changed file with 12 additions and 12 deletions.
24 changes: 12 additions & 12 deletions src/datasets/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,18 @@ def _release(self):
self._lock_file_fd = None


def summarize_if_long_list(obj):
    """Return a display string for ``obj``, abbreviating long lists.

    A list (or list subclass) with more than 6 items is rendered in a
    NumPy-like shortened form: the ``repr`` of its first three and last
    three items, separated by ``...`` and wrapped in square brackets.
    Every other value (short lists, arrays, tensors, scalars, ...) is
    formatted with its own string conversion, so the owning framework
    keeps control of the output.

    Args:
        obj: The value to render.

    Returns:
        str: The abbreviated list, or the plain string form of ``obj``.
    """
    # isinstance (rather than an exact type comparison) so that list
    # subclasses are abbreviated too instead of being dumped in full.
    if not isinstance(obj, list) or len(obj) <= 6:
        return f"{obj}"

    def format_chunk(chunk):
        # repr keeps quotes around strings, matching how a list prints its items.
        return ", ".join(repr(x) for x in chunk)

    return f"[{format_chunk(obj[:3])}, ..., {format_chunk(obj[-3:])}]"


class MetricInfoMixin:
"""This base class exposes some attributes of MetricInfo
at the base level of the Metric for easy access.
Expand Down Expand Up @@ -454,18 +466,6 @@ def add_batch(self, *, predictions=None, references=None, **kwargs):
try:
self.writer.write_batch(batch)
except pa.ArrowInvalid:

def summarize_if_long_list(obj):
    """Format ``obj`` as text, shortening plain lists of more than 6 items.

    Long lists are shown NumPy-style as their first three and last three
    items around ``...``; anything else (short lists, arrays, tensors)
    is returned through its own string conversion so that its framework
    decides the formatting.
    """
    is_long_list = type(obj) is list and len(obj) > 6
    if not is_long_list:
        return f"{obj}"

    head = ", ".join(map(repr, obj[:3]))
    tail = ", ".join(map(repr, obj[-3:]))
    return f"[{head}, ..., {tail}]"

if any(len(batch[c]) != len(next(iter(batch.values()))) for c in batch):
col0 = next(iter(batch))
bad_col = [c for c in batch if len(batch[c]) != len(batch[col0])][0]
Expand Down

1 comment on commit a637d64

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==5.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.012150 / 0.011353 (0.000797) 0.004442 / 0.011008 (-0.006566) 0.036702 / 0.038508 (-0.001806) 0.042562 / 0.023109 (0.019452) 0.358816 / 0.275898 (0.082918) 0.406567 / 0.323480 (0.083087) 0.009430 / 0.007986 (0.001445) 0.005572 / 0.004328 (0.001244) 0.010821 / 0.004250 (0.006571) 0.050967 / 0.037052 (0.013914) 0.347998 / 0.258489 (0.089509) 0.408949 / 0.293841 (0.115108) 0.037202 / 0.128546 (-0.091344) 0.011206 / 0.075646 (-0.064440) 0.304414 / 0.419271 (-0.114858) 0.061466 / 0.043533 (0.017933) 0.341583 / 0.255139 (0.086444) 0.408338 / 0.283200 (0.125138) 0.123595 / 0.141683 (-0.018088) 2.088718 / 1.452155 (0.636563) 2.209649 / 1.492716 (0.716932)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.347096 / 0.018006 (0.329090) 0.492375 / 0.000490 (0.491886) 0.023529 / 0.000200 (0.023329) 0.000418 / 0.000054 (0.000364)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.031842 / 0.037411 (-0.005569) 0.125060 / 0.014526 (0.110534) 0.135800 / 0.176557 (-0.040757) 0.181193 / 0.737135 (-0.555942) 0.133953 / 0.296338 (-0.162386)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.485949 / 0.215209 (0.270740) 4.821616 / 2.077655 (2.743961) 2.064382 / 1.504120 (0.560262) 1.830594 / 1.541195 (0.289399) 1.908499 / 1.468490 (0.440009) 0.511247 / 4.584777 (-4.073530) 5.447415 / 3.745712 (1.701703) 4.195853 / 5.269862 (-1.074009) 1.056927 / 4.565676 (-3.508750) 0.063379 / 0.424275 (-0.360896) 0.014631 / 0.007607 (0.007024) 0.643238 / 0.226044 (0.417194) 6.042415 / 2.268929 (3.773486) 2.630944 / 55.444624 (-52.813681) 2.195633 / 6.876477 (-4.680844) 2.306497 / 2.142072 (0.164425) 0.635390 / 4.805227 (-4.169837) 0.142578 / 6.500664 (-6.358086) 0.075013 / 0.075469 (-0.000456)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.898597 / 1.841788 (0.056809) 16.491956 / 8.074308 (8.417648) 31.082313 / 10.191392 (20.890921) 0.995017 / 0.680424 (0.314593) 0.620794 / 0.534201 (0.086593) 0.587584 / 0.579283 (0.008301) 0.617819 / 0.434364 (0.183455) 0.379383 / 0.540337 (-0.160955) 0.390426 / 1.386936 (-0.996510)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.009907 / 0.011353 (-0.001446) 0.004731 / 0.011008 (-0.006278) 0.035270 / 0.038508 (-0.003238) 0.039450 / 0.023109 (0.016340) 0.376196 / 0.275898 (0.100298) 0.402360 / 0.323480 (0.078880) 0.007453 / 0.007986 (-0.000533) 0.004091 / 0.004328 (-0.000237) 0.008628 / 0.004250 (0.004377) 0.046435 / 0.037052 (0.009383) 0.349978 / 0.258489 (0.091489) 0.402433 / 0.293841 (0.108592) 0.036100 / 0.128546 (-0.092446) 0.011181 / 0.075646 (-0.064465) 0.294757 / 0.419271 (-0.124514) 0.062067 / 0.043533 (0.018534) 0.358933 / 0.255139 (0.103794) 0.401735 / 0.283200 (0.118535) 0.111647 / 0.141683 (-0.030035) 2.227405 / 1.452155 (0.775250) 2.281173 / 1.492716 (0.788457)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.278287 / 0.018006 (0.260280) 0.485805 / 0.000490 (0.485315) 0.002492 / 0.000200 (0.002292) 0.000091 / 0.000054 (0.000036)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.028972 / 0.037411 (-0.008439) 0.122245 / 0.014526 (0.107719) 0.132162 / 0.176557 (-0.044395) 0.178980 / 0.737135 (-0.558156) 0.134122 / 0.296338 (-0.162217)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.493890 / 0.215209 (0.278680) 4.900391 / 2.077655 (2.822736) 2.123402 / 1.504120 (0.619282) 1.874035 / 1.541195 (0.332840) 1.949943 / 1.468490 (0.481453) 0.517519 / 4.584777 (-4.067258) 5.653968 / 3.745712 (1.908256) 2.468894 / 5.269862 (-2.800967) 1.088265 / 4.565676 (-3.477412) 0.062865 / 0.424275 (-0.361410) 0.014218 / 0.007607 (0.006611) 0.613051 / 0.226044 (0.387007) 6.077693 / 2.268929 (3.808764) 2.636902 / 55.444624 (-52.807722) 2.195011 / 6.876477 (-4.681466) 2.299232 / 2.142072 (0.157160) 0.660183 / 4.805227 (-4.145044) 0.147453 / 6.500664 (-6.353211) 0.074786 / 0.075469 (-0.000683)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.910560 / 1.841788 (0.068773) 16.393262 / 8.074308 (8.318954) 31.325206 / 10.191392 (21.133814) 1.050514 / 0.680424 (0.370090) 0.646826 / 0.534201 (0.112625) 0.570572 / 0.579283 (-0.008712) 0.620820 / 0.434364 (0.186456) 0.379498 / 0.540337 (-0.160839) 0.399377 / 1.386936 (-0.987559)

CML watermark

Please sign in to comment.