Skip to content

Commit

Permalink
Minor tqdm fixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
mariosasko committed Apr 14, 2023
1 parent f9c770b commit dd4ae2e
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 24 deletions.
4 changes: 2 additions & 2 deletions src/datasets/arrow_dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1437,8 +1437,8 @@ def save_to_disk(
else:
pbar.update(content)
else:
- for kwargs in kwargs_per_job:
-     with pbar:
+ with pbar:
+     for kwargs in kwargs_per_job:
for job_id, done, content in Dataset._save_to_disk_single(**kwargs):
if done:
shards_done += 1
Expand Down
46 changes: 24 additions & 22 deletions src/datasets/builder.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1485,13 +1485,14 @@ def _prepare_split(
result = None
gen_kwargs = split_generator.gen_kwargs
job_id = 0
- for job_id, done, content in self._prepare_split_single(
-     gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
- ):
-     if done:
-         result = content
-     else:
-         pbar.update(content)
+ with pbar:
+     for job_id, done, content in self._prepare_split_single(
+         gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
+     ):
+         if done:
+             result = content
+         else:
+             pbar.update(content)
# wrapping everything into lists for consistency with the multiprocessed code path
assert result is not None, "Failed to retrieve results from prepare_split"
examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = [
Expand All @@ -1513,21 +1514,22 @@ def _prepare_split(
shard_lengths_per_job = [None] * num_jobs

with Pool(num_proc) as pool:
- for job_id, done, content in iflatmap_unordered(
-     pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
- ):
-     if done:
-         # the content is the result of the job
-         (
-             examples_per_job[job_id],
-             bytes_per_job[job_id],
-             features_per_job[job_id],
-             shards_per_job[job_id],
-             shard_lengths_per_job[job_id],
-         ) = content
-     else:
-         # the content is the number of examples progress update
-         pbar.update(content)
+ with pbar:
+     for job_id, done, content in iflatmap_unordered(
+         pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
+     ):
+         if done:
+             # the content is the result of the job
+             (
+                 examples_per_job[job_id],
+                 bytes_per_job[job_id],
+                 features_per_job[job_id],
+                 shards_per_job[job_id],
+                 shard_lengths_per_job[job_id],
+             ) = content
+         else:
+             # the content is the number of examples progress update
+             pbar.update(content)

assert (
None not in examples_per_job
Expand Down

1 comment on commit dd4ae2e

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==8.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.006847 / 0.011353 (-0.004506) 0.004571 / 0.011008 (-0.006437) 0.098344 / 0.038508 (0.059836) 0.028440 / 0.023109 (0.005331) 0.302578 / 0.275898 (0.026680) 0.336750 / 0.323480 (0.013270) 0.005054 / 0.007986 (-0.002932) 0.004696 / 0.004328 (0.000368) 0.075651 / 0.004250 (0.071400) 0.037183 / 0.037052 (0.000131) 0.302921 / 0.258489 (0.044432) 0.339277 / 0.293841 (0.045436) 0.032028 / 0.128546 (-0.096518) 0.011515 / 0.075646 (-0.064131) 0.321241 / 0.419271 (-0.098031) 0.043351 / 0.043533 (-0.000182) 0.302014 / 0.255139 (0.046875) 0.323058 / 0.283200 (0.039859) 0.088684 / 0.141683 (-0.052999) 1.484582 / 1.452155 (0.032428) 1.612331 / 1.492716 (0.119614)

Benchmark: benchmark_getitem_100B.json

| metric | get_batch_of_1024_random_rows | get_batch_of_1024_rows | get_first_row | get_last_row |
|---|---|---|---|---|
| new / old (diff) | 0.197347 / 0.018006 (0.179341) | 0.393029 / 0.000490 (0.392540) | 0.003145 / 0.000200 (0.002945) | 0.000070 / 0.000054 (0.000015) |

Benchmark: benchmark_indices_mapping.json

| metric | select | shard | shuffle | sort | train_test_split |
|---|---|---|---|---|---|
| new / old (diff) | 0.023652 / 0.037411 (-0.013760) | 0.098215 / 0.014526 (0.083689) | 0.105406 / 0.176557 (-0.071151) | 0.162978 / 0.737135 (-0.574158) | 0.108710 / 0.296338 (-0.187629) |

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.460309 / 0.215209 (0.245100) 4.591418 / 2.077655 (2.513763) 2.188346 / 1.504120 (0.684226) 1.978555 / 1.541195 (0.437361) 2.032055 / 1.468490 (0.563565) 0.701409 / 4.584777 (-3.883368) 3.417192 / 3.745712 (-0.328520) 1.883558 / 5.269862 (-3.386303) 1.164694 / 4.565676 (-3.400983) 0.083319 / 0.424275 (-0.340956) 0.012322 / 0.007607 (0.004715) 0.568616 / 0.226044 (0.342571) 5.655343 / 2.268929 (3.386415) 2.656102 / 55.444624 (-52.788522) 2.302363 / 6.876477 (-4.574113) 2.426307 / 2.142072 (0.284234) 0.810477 / 4.805227 (-3.994750) 0.151883 / 6.500664 (-6.348781) 0.066858 / 0.075469 (-0.008612)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.180922 / 1.841788 (-0.660865) 13.977827 / 8.074308 (5.903519) 13.901386 / 10.191392 (3.709994) 0.138227 / 0.680424 (-0.542196) 0.016546 / 0.534201 (-0.517655) 0.383267 / 0.579283 (-0.196016) 0.385839 / 0.434364 (-0.048525) 0.449474 / 0.540337 (-0.090864) 0.523730 / 1.386936 (-0.863206)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.006863 / 0.011353 (-0.004490) 0.004606 / 0.011008 (-0.006402) 0.080197 / 0.038508 (0.041688) 0.028122 / 0.023109 (0.005013) 0.341054 / 0.275898 (0.065156) 0.386645 / 0.323480 (0.063165) 0.005064 / 0.007986 (-0.002921) 0.003371 / 0.004328 (-0.000958) 0.076249 / 0.004250 (0.071999) 0.037185 / 0.037052 (0.000133) 0.344625 / 0.258489 (0.086136) 0.390804 / 0.293841 (0.096963) 0.032073 / 0.128546 (-0.096473) 0.011628 / 0.075646 (-0.064018) 0.086571 / 0.419271 (-0.332700) 0.041887 / 0.043533 (-0.001645) 0.340737 / 0.255139 (0.085598) 0.366566 / 0.283200 (0.083367) 0.089945 / 0.141683 (-0.051738) 1.473671 / 1.452155 (0.021517) 1.549659 / 1.492716 (0.056943)

Benchmark: benchmark_getitem_100B.json

| metric | get_batch_of_1024_random_rows | get_batch_of_1024_rows | get_first_row | get_last_row |
|---|---|---|---|---|
| new / old (diff) | 0.248691 / 0.018006 (0.230684) | 0.394705 / 0.000490 (0.394215) | 0.009794 / 0.000200 (0.009594) | 0.000091 / 0.000054 (0.000036) |

Benchmark: benchmark_indices_mapping.json

| metric | select | shard | shuffle | sort | train_test_split |
|---|---|---|---|---|---|
| new / old (diff) | 0.025332 / 0.037411 (-0.012079) | 0.098070 / 0.014526 (0.083545) | 0.107690 / 0.176557 (-0.068866) | 0.156154 / 0.737135 (-0.580982) | 0.110599 / 0.296338 (-0.185740) |

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.444817 / 0.215209 (0.229608) 4.469807 / 2.077655 (2.392152) 2.251305 / 1.504120 (0.747185) 2.071914 / 1.541195 (0.530719) 2.119041 / 1.468490 (0.650551) 0.704666 / 4.584777 (-3.880111) 3.384840 / 3.745712 (-0.360873) 2.946550 / 5.269862 (-2.323311) 1.639360 / 4.565676 (-2.926316) 0.083299 / 0.424275 (-0.340976) 0.012399 / 0.007607 (0.004792) 0.546325 / 0.226044 (0.320281) 5.531060 / 2.268929 (3.262131) 2.799030 / 55.444624 (-52.645594) 2.532299 / 6.876477 (-4.344178) 2.587062 / 2.142072 (0.444989) 0.810554 / 4.805227 (-3.994673) 0.154144 / 6.500664 (-6.346520) 0.067558 / 0.075469 (-0.007911)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.343644 / 1.841788 (-0.498143) 14.109821 / 8.074308 (6.035513) 14.016668 / 10.191392 (3.825276) 0.141461 / 0.680424 (-0.538963) 0.016406 / 0.534201 (-0.517795) 0.378409 / 0.579283 (-0.200875) 0.378449 / 0.434364 (-0.055915) 0.439984 / 0.540337 (-0.100354) 0.521736 / 1.386936 (-0.865200)

Please sign in to comment.