
Commit

Merge branch 'master' into try-to-make-tests-run-faster
lhoestq committed Apr 29, 2021
2 parents 21cd684 + 74f87cc commit c604d37
Showing 23 changed files with 272 additions and 146 deletions.
4 changes: 2 additions & 2 deletions .circleci/config.yml
@@ -15,7 +15,7 @@ jobs:
- run: source venv/bin/activate
- run: pip install .[tests]
- run: pip install -r additional-tests-requirements.txt --no-deps
- run: pip install pyarrow --upgrade
- run: pip install pyarrow==3.0.0
- run: HF_SCRIPTS_VERSION=master python -m pytest -sv ./tests/

run_dataset_script_tests_pyarrow_1:
@@ -47,7 +47,7 @@ jobs:
- run: "& venv/Scripts/activate.ps1"
- run: pip install .[tests]
- run: pip install -r additional-tests-requirements.txt --no-deps
- run: pip install pyarrow --upgrade
- run: pip install pyarrow==3.0.0
- run: $env:HF_SCRIPTS_VERSION="master"
- run: python -m pytest -sv ./tests/

2 changes: 1 addition & 1 deletion datasets/newsph_nli/dataset_infos.json
@@ -1 +1 @@
{"default": {"description": " First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": " @article{cruz2020investigating,\n title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n journal={arXiv preprint arXiv:2010.11574},\n year={2020}\n }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 342036728, "size_in_bytes": 418602015}}
{"default": {"description": " First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": " @article{cruz2020investigating,\n title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n journal={arXiv preprint arXiv:2010.11574},\n year={2020}\n }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 3283665, "num_examples": 9000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 190809794, "size_in_bytes": 267375081}}
Binary file modified datasets/newsph_nli/dummy/1.0.0/dummy_data.zip
Binary file not shown.
20 changes: 11 additions & 9 deletions datasets/newsph_nli/newsph_nli.py
@@ -21,22 +21,24 @@


_DESCRIPTION = """\
First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.
First benchmark dataset for sentence entailment in the low-resource Filipino language.
Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs,
in 70-15-15 split for training, validation, and testing.
"""

_CITATION = """\
@article{cruz2020investigating,
title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},
author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},
journal={arXiv preprint arXiv:2010.11574},
year={2020}
}
@article{cruz2020investigating,
title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},
author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},
journal={arXiv preprint arXiv:2010.11574},
year={2020}
}
"""

_HOMEPAGE = "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_LICENSE = "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0"

_URL = "https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip"

@@ -68,7 +70,7 @@ def _split_generators(self, dl_manager):
data_dir = dl_manager.download_and_extract(_URL)
download_path = os.path.join(data_dir, "newsph-nli")
train_path = os.path.join(download_path, "train.csv")
test_path = os.path.join(download_path, "train.csv")
test_path = os.path.join(download_path, "test.csv")
validation_path = os.path.join(download_path, "valid.csv")

return [
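The hunk above fixes the test split to read test.csv instead of train.csv. A quick sanity check, as a hedged sketch: the updated dataset_infos.json in this commit records 9,000 test examples (instead of a duplicated 420,000-row train split), so loading the split by name should reflect that, assuming the corrected script is the one being resolved.

```python
# Minimal sketch: verify the corrected test split size (expected count taken from
# the updated dataset_infos.json in this commit; requires network access).
from datasets import load_dataset

test_split = load_dataset("newsph_nli", split="test")
print(len(test_split))  # expected: 9000 with the corrected test.csv path
```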
2 changes: 1 addition & 1 deletion setup.py
@@ -74,7 +74,7 @@
"numpy>=1.17",
# Backend and serialization.
# Minimum 1.0.0 to avoid permission errors on windows when using the compute layer on memory mapped data
"pyarrow>=1.0.0",
"pyarrow>=1.0.0<4.0.0",
# For smart caching dataset processing
"dill",
# For performance gains with apache arrow
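One observation on the hunk above, not part of the commit: PEP 508 specifier clauses are normally comma-separated, so the intended constraint is presumably the one sketched below.

```python
# Hedged sketch of the conventional comma-separated form of the same bound.
install_requires = [
    "numpy>=1.17",
    # Minimum 1.0.0 to avoid permission errors on windows when using the compute layer on memory mapped data
    "pyarrow>=1.0.0,<4.0.0",
    # For smart caching dataset processing
    "dill",
]
```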
84 changes: 70 additions & 14 deletions src/datasets/arrow_dataset.py
@@ -53,7 +53,7 @@
update_fingerprint,
)
from .formatting import format_table, get_format_type_from_alias, get_formatter, query_table
from .info import DATASET_INFO_FILENAME, DatasetInfo
from .info import DatasetInfo
from .search import IndexableMixin
from .splits import NamedSplit
from .table import InMemoryTable, MemoryMappedTable, Table, concat_tables, list_table_cache_files
@@ -616,7 +616,9 @@ def save_to_disk(self, dataset_path: str, fs=None):
Path(dataset_path, config.DATASET_STATE_JSON_FILENAME).as_posix(), "w", encoding="utf-8"
) as state_file:
json.dump(state, state_file, indent=2, sort_keys=True)
with fs.open(Path(dataset_path, DATASET_INFO_FILENAME).as_posix(), "w", encoding="utf-8") as dataset_info_file:
with fs.open(
Path(dataset_path, config.DATASET_INFO_FILENAME).as_posix(), "w", encoding="utf-8"
) as dataset_info_file:
json.dump(dataset_info, dataset_info_file, indent=2, sort_keys=True)
logger.info("Dataset saved in {}".format(dataset_path))

@@ -653,7 +655,9 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
Path(dataset_path, config.DATASET_STATE_JSON_FILENAME).as_posix(), "r", encoding="utf-8"
) as state_file:
state = json.load(state_file)
with open(Path(dataset_path, DATASET_INFO_FILENAME).as_posix(), "r", encoding="utf-8") as dataset_info_file:
with open(
Path(dataset_path, config.DATASET_INFO_FILENAME).as_posix(), "r", encoding="utf-8"
) as dataset_info_file:
dataset_info = DatasetInfo.from_dict(json.load(dataset_info_file))

dataset_size = estimate_dataset_size(
@@ -771,6 +775,7 @@ def class_encode_column(self, column: str) -> "Dataset":
class_names = sorted(dset.unique(column))
dst_feat = ClassLabel(names=class_names)
dset = dset.map(lambda batch: {column: dst_feat.str2int(batch)}, input_columns=column, batched=True)
dset = concatenate_datasets([self.remove_columns([column]), dset], axis=1)

new_features = copy.deepcopy(dset.features)
new_features[column] = dst_feat
@@ -838,7 +843,7 @@ def flatten(self, new_fingerprint, max_depth=16) -> "Dataset":
else:
break
dataset.info.features = Features.from_arrow_schema(dataset._data.schema)
self._data = update_metadata_with_features(self._data, self.features)
dataset._data = update_metadata_with_features(dataset._data, dataset.features)
logger.info(
"Flattened dataset from depth {} to depth {}.".format(depth, 1 if depth + 1 < max_depth else "unknown")
)
@@ -1018,7 +1023,7 @@ def remove_columns(self, column_names: Union[str, List[str]], new_fingerprint) -
del dataset._info.features[column_name]

dataset._data = dataset._data.drop(column_names)
dataset._data = update_metadata_with_features(dataset._data, self.features)
dataset._data = update_metadata_with_features(dataset._data, dataset.features)
dataset._fingerprint = new_fingerprint
return dataset

@@ -1109,7 +1114,7 @@ def rename(columns):
)

dataset._data = dataset._data.rename_columns(new_column_names)
dataset._data = update_metadata_with_features(dataset._data, self.features)
dataset._data = update_metadata_with_features(dataset._data, dataset.features)
dataset._fingerprint = new_fingerprint
return dataset

@@ -1771,8 +1776,18 @@ def init_buffer_and_writer():
# Optionally initialize the writer as a context manager
with contextlib.ExitStack() as stack:
try:
# Only load the columns we actually need
if input_columns:
input_dataset = self.with_format(
self._format_type, columns=input_columns, output_all_columns=False, **self._format_kwargs
)
if remove_columns:
remove_columns = list(set(remove_columns) & set(input_columns))
else:
input_dataset = self

# Loop over single examples or batches and write to buffer/file if examples are to be updated
pbar_iterable = self if not batched else range(0, len(self), batch_size)
pbar_iterable = input_dataset if not batched else range(0, len(input_dataset), batch_size)
pbar_unit = "ex" if not batched else "ba"
pbar_desc = "#" + str(rank) if rank is not None else None
pbar = tqdm(pbar_iterable, disable=not_verbose, position=rank, unit=pbar_unit, desc=pbar_desc)
@@ -1790,13 +1805,18 @@ def init_buffer_and_writer():
writer.write(example)
else:
for i in pbar:
if drop_last_batch and i + batch_size > self.num_rows:
if drop_last_batch and i + batch_size > input_dataset.num_rows:
continue
batch = self[i : i + batch_size]
indices = list(range(*(slice(i, i + batch_size).indices(self.num_rows)))) # Something simpler?
batch = input_dataset[i : i + batch_size]
indices = list(
range(*(slice(i, i + batch_size).indices(input_dataset.num_rows)))
) # Something simpler?
try:
batch = apply_function_on_filtered_inputs(
batch, indices, check_same_num_examples=len(self.list_indexes()) > 0, offset=offset
batch,
indices,
check_same_num_examples=len(input_dataset.list_indexes()) > 0,
offset=offset,
)
except NumExamplesMismatch:
raise DatasetTransformationNotAllowedError(
@@ -2633,6 +2653,28 @@ def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> Un
for offset in range(0, len(self), batch_size)
)

def to_json(
self,
path_or_buf: Union[PathLike, BinaryIO],
batch_size: Optional[int] = None,
**to_json_kwargs,
) -> int:
"""Exports the dataset to JSON.
Args:
path_or_buf (``PathLike`` or ``FileOrBuffer``): Either a path to a file or a BinaryIO.
batch_size (Optional ``int``): Size of the batch to load in memory and write at once.
Defaults to :obj:`datasets.config.DEFAULT_MAX_BATCH_SIZE`.
to_json_kwargs: Parameters to pass to pandas's :func:`pandas.DataFrame.to_json`
Returns:
int: The number of characters or bytes written
"""
# Dynamic import to avoid circular dependency
from .io.json import JsonDatasetWriter

return JsonDatasetWriter(self, path_or_buf, batch_size=batch_size, **to_json_kwargs).write()

def to_pandas(
self, batch_size: Optional[int] = None, batched: bool = False
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
@@ -2858,10 +2900,12 @@ def add_elasticsearch_index(
)
return self

def add_item(self, item: dict):
@transmit_format
@fingerprint_transform(inplace=False)
def add_item(self, item: dict, new_fingerprint: str):
"""Add item to Dataset.
.. versionadded:: 1.6
.. versionadded:: 1.7
Args:
item (dict): Item data to be added.
@@ -2875,7 +2919,19 @@ def add_item(self, item: dict):
item_table = item_table.cast(schema)
# Concatenate tables
table = concat_tables([self._data, item_table])
return Dataset(table)
if self._indices is None:
indices_table = None
else:
item_indices_array = pa.array([len(self._data)], type=pa.uint64())
item_indices_table = InMemoryTable.from_arrays([item_indices_array], names=["indices"])
indices_table = concat_tables([self._indices, item_indices_table])
return Dataset(
table,
info=copy.deepcopy(self.info),
split=self.split,
indices_table=indices_table,
fingerprint=new_fingerprint,
)


def concatenate_datasets(
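Taken together, the arrow_dataset.py changes above add three user-facing pieces: `map()` can restrict loading to `input_columns`, `Dataset.to_json()` is introduced, and `add_item()` now goes through the format and fingerprint decorators while preserving info, split, and any indices mapping. A minimal usage sketch under those assumptions (the toy columns and the output filename are illustrative, not from this commit):

```python
from datasets import Dataset

# Toy in-memory dataset (illustrative column names).
ds = Dataset.from_dict(
    {
        "premise": ["a premise", "another premise"],
        "hypothesis": ["a hypothesis", "another hypothesis"],
        "label": [0, 1],
    }
)

# add_item(): after this change the returned dataset keeps info/split, carries over
# any indices mapping, and gets a fresh fingerprint computed by the decorator.
ds = ds.add_item({"premise": "a third premise", "hypothesis": "a third hypothesis", "label": 0})

# map() with input_columns: only the requested columns are loaded while iterating,
# instead of formatting every column of each example or batch.
ds = ds.map(lambda premise: {"premise_len": len(premise)}, input_columns="premise")

# to_json(): new export helper built on JsonDatasetWriter; returns the number of
# characters or bytes written.
num_written = ds.to_json("newsph_nli_sample.json", batch_size=2)
print(num_written)
```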
27 changes: 7 additions & 20 deletions src/datasets/builder.py
@@ -36,14 +36,7 @@
from .arrow_writer import ArrowWriter, BeamWriter
from .dataset_dict import DatasetDict
from .fingerprint import Hasher
from .info import (
DATASET_INFO_FILENAME,
DATASET_INFOS_DICT_FILE_NAME,
LICENSE_FILENAME,
DatasetInfo,
DatasetInfosDict,
PostProcessedInfo,
)
from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
from .naming import camelcase_to_snakecase, filename_prefix_for_split
from .splits import Split, SplitDict, SplitGenerator
from .utils.download_manager import DownloadManager, GenerateMode
@@ -55,12 +48,6 @@

logger = get_logger(__name__)

FORCE_REDOWNLOAD = GenerateMode.FORCE_REDOWNLOAD
REUSE_CACHE_IF_EXISTS = GenerateMode.REUSE_CACHE_IF_EXISTS
REUSE_DATASET_IF_EXISTS = GenerateMode.REUSE_DATASET_IF_EXISTS

MAX_DIRECTORY_NAME_LENGTH = 255


class InvalidConfigName(ValueError):
pass
@@ -175,7 +162,7 @@ def create_config_id(self, config_kwargs: dict, custom_features: Optional[Featur

if suffix:
config_id = self.name + "-" + suffix
if len(config_id) > MAX_DIRECTORY_NAME_LENGTH:
if len(config_id) > config.MAX_DATASET_CONFIG_ID_READABLE_LENGTH:
config_id = self.name + "-" + Hasher.hash(suffix)
return config_id
else:
@@ -297,7 +284,7 @@ def manual_download_instructions(self) -> Optional[str]:
@classmethod
def get_all_exported_dataset_infos(cls) -> dict:
"""Empty dict if doesn't exist"""
dset_infos_file_path = os.path.join(cls.get_imported_module_dir(), DATASET_INFOS_DICT_FILE_NAME)
dset_infos_file_path = os.path.join(cls.get_imported_module_dir(), config.DATASETDICT_INFOS_FILENAME)
if os.path.exists(dset_infos_file_path):
return DatasetInfosDict.from_directory(cls.get_imported_module_dir())
return {}
@@ -496,7 +483,7 @@ def download_and_prepare(
if download_config is None:
download_config = DownloadConfig(
cache_dir=os.path.join(self._cache_dir_root, "downloads"),
force_download=bool(download_mode == FORCE_REDOWNLOAD),
force_download=bool(download_mode == GenerateMode.FORCE_REDOWNLOAD),
use_etag=False,
use_auth_token=use_auth_token,
) # We don't use etag for data files to speed up the process
@@ -515,7 +502,7 @@
lock_path = os.path.join(self._cache_dir_root, self._cache_dir.replace(os.sep, "_") + ".lock")
with FileLock(lock_path):
data_exists = os.path.exists(self._cache_dir)
if data_exists and download_mode == REUSE_DATASET_IF_EXISTS:
if data_exists and download_mode == GenerateMode.REUSE_DATASET_IF_EXISTS:
logger.warning("Reusing dataset %s (%s)", self.name, self._cache_dir)
# We need to update the info in case some splits were added in the meantime
# for example when calling load_dataset from multiple workers.
@@ -1174,9 +1161,9 @@ def _save_info(self):
import apache_beam as beam

fs = beam.io.filesystems.FileSystems
with fs.create(os.path.join(self._cache_dir, DATASET_INFO_FILENAME)) as f:
with fs.create(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)) as f:
self.info._dump_info(f)
with fs.create(os.path.join(self._cache_dir, LICENSE_FILENAME)) as f:
with fs.create(os.path.join(self._cache_dir, config.LICENSE_FILENAME)) as f:
self.info._dump_license(f)

def _prepare_split(self, split_generator, pipeline):
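builder.py now references the `config.*` filename constants and the `GenerateMode` enum directly instead of the removed module-level aliases. A hedged sketch of how a caller selects a download mode after this change (the dataset name is illustrative; the import path matches the one used in builder.py above):

```python
from datasets import load_dataset
from datasets.utils.download_manager import GenerateMode

# Force a fresh download instead of reusing the cached copy
# (equivalent to the removed FORCE_REDOWNLOAD module alias).
ds = load_dataset("newsph_nli", split="validation", download_mode=GenerateMode.FORCE_REDOWNLOAD)
```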

1 comment on commit c604d37

@github-actions

Show benchmarks

PyArrow==1.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.023074 / 0.011353 (0.011721) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.015821 / 0.011008 (0.004813) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.050749 / 0.038508 (0.012241) |
| read_batch_unformated after write_array2d | 0.042053 / 0.023109 (0.018944) |
| read_batch_unformated after write_flattened_sequence | 0.400631 / 0.275898 (0.124733) |
| read_batch_unformated after write_nested_sequence | 0.436141 / 0.323480 (0.112661) |
| read_col_formatted_as_numpy after write_array2d | 0.011688 / 0.007986 (0.003702) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004921 / 0.004328 (0.000592) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.011732 / 0.004250 (0.007481) |
| read_col_unformated after write_array2d | 0.055060 / 0.037052 (0.018007) |
| read_col_unformated after write_flattened_sequence | 0.398213 / 0.258489 (0.139724) |
| read_col_unformated after write_nested_sequence | 0.439147 / 0.293841 (0.145306) |
| read_formatted_as_numpy after write_array2d | 0.158305 / 0.128546 (0.029759) |
| read_formatted_as_numpy after write_flattened_sequence | 0.126251 / 0.075646 (0.050605) |
| read_formatted_as_numpy after write_nested_sequence | 0.456789 / 0.419271 (0.037518) |
| read_unformated after write_array2d | 0.419933 / 0.043533 (0.376401) |
| read_unformated after write_flattened_sequence | 0.398202 / 0.255139 (0.143063) |
| read_unformated after write_nested_sequence | 0.421931 / 0.283200 (0.138731) |
| write_array2d | 1.680163 / 0.141683 (1.538480) |
| write_flattened_sequence | 1.906887 / 1.452155 (0.454732) |
| write_nested_sequence | 1.959349 / 1.492716 (0.466633) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.006465 / 0.018006 (-0.011541) |
| get_batch_of_1024_rows | 0.000561 / 0.000490 (0.000071) |
| get_first_row | 0.000184 / 0.000200 (-0.000016) |
| get_last_row | 0.000048 / 0.000054 (-0.000006) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.047024 / 0.037411 (0.009613) |
| shard | 0.022289 / 0.014526 (0.007763) |
| shuffle | 0.030202 / 0.176557 (-0.146354) |
| sort | 0.049937 / 0.737135 (-0.687198) |
| train_test_split | 0.030917 / 0.296338 (-0.265422) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.424092 / 0.215209 (0.208883) |
| read 50000 | 4.311547 / 2.077655 (2.233893) |
| read_batch 50000 10 | 2.300000 / 1.504120 (0.795880) |
| read_batch 50000 100 | 2.166291 / 1.541195 (0.625096) |
| read_batch 50000 1000 | 2.221121 / 1.468490 (0.752631) |
| read_formatted numpy 5000 | 6.668090 / 4.584777 (2.083313) |
| read_formatted pandas 5000 | 5.996952 / 3.745712 (2.251240) |
| read_formatted tensorflow 5000 | 8.321086 / 5.269862 (3.051224) |
| read_formatted torch 5000 | 7.326802 / 4.565676 (2.761125) |
| read_formatted_batch numpy 5000 10 | 0.663548 / 0.424275 (0.239273) |
| read_formatted_batch numpy 5000 1000 | 0.010876 / 0.007607 (0.003269) |
| shuffled read 5000 | 0.541485 / 0.226044 (0.315440) |
| shuffled read 50000 | 5.397095 / 2.268929 (3.128166) |
| shuffled read_batch 50000 10 | 2.758916 / 55.444624 (-52.685709) |
| shuffled read_batch 50000 100 | 2.397987 / 6.876477 (-4.478489) |
| shuffled read_batch 50000 1000 | 2.442473 / 2.142072 (0.300401) |
| shuffled read_formatted numpy 5000 | 6.846216 / 4.805227 (2.040989) |
| shuffled read_formatted_batch numpy 5000 10 | 3.878391 / 6.500664 (-2.622273) |
| shuffled read_formatted_batch numpy 5000 1000 | 6.420079 / 0.075469 (6.344610) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 10.713706 / 1.841788 (8.871918) |
| map fast-tokenizer batched | 14.076219 / 8.074308 (6.001911) |
| map identity | 30.318086 / 10.191392 (20.126694) |
| map identity batched | 0.934785 / 0.680424 (0.254362) |
| map no-op batched | 0.632912 / 0.534201 (0.098711) |
| map no-op batched numpy | 0.757803 / 0.579283 (0.178519) |
| map no-op batched pandas | 0.614218 / 0.434364 (0.179854) |
| map no-op batched pytorch | 0.694845 / 0.540337 (0.154508) |
| map no-op batched tensorflow | 1.573175 / 1.386936 (0.186239) |
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.022909 / 0.011353 (0.011556) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.015044 / 0.011008 (0.004036) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.051444 / 0.038508 (0.012936) |
| read_batch_unformated after write_array2d | 0.040595 / 0.023109 (0.017486) |
| read_batch_unformated after write_flattened_sequence | 0.335234 / 0.275898 (0.059336) |
| read_batch_unformated after write_nested_sequence | 0.379933 / 0.323480 (0.056453) |
| read_col_formatted_as_numpy after write_array2d | 0.011320 / 0.007986 (0.003334) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004902 / 0.004328 (0.000574) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.012059 / 0.004250 (0.007808) |
| read_col_unformated after write_array2d | 0.060512 / 0.037052 (0.023460) |
| read_col_unformated after write_flattened_sequence | 0.337350 / 0.258489 (0.078861) |
| read_col_unformated after write_nested_sequence | 0.383572 / 0.293841 (0.089731) |
| read_formatted_as_numpy after write_array2d | 0.157500 / 0.128546 (0.028954) |
| read_formatted_as_numpy after write_flattened_sequence | 0.116012 / 0.075646 (0.040365) |
| read_formatted_as_numpy after write_nested_sequence | 0.439529 / 0.419271 (0.020257) |
| read_unformated after write_array2d | 0.420759 / 0.043533 (0.377226) |
| read_unformated after write_flattened_sequence | 0.337722 / 0.255139 (0.082583) |
| read_unformated after write_nested_sequence | 0.366860 / 0.283200 (0.083661) |
| write_array2d | 1.662852 / 0.141683 (1.521169) |
| write_flattened_sequence | 1.896534 / 1.452155 (0.444379) |
| write_nested_sequence | 1.974053 / 1.492716 (0.481337) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.006414 / 0.018006 (-0.011592) |
| get_batch_of_1024_rows | 0.000495 / 0.000490 (0.000006) |
| get_first_row | 0.000193 / 0.000200 (-0.000007) |
| get_last_row | 0.000048 / 0.000054 (-0.000007) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.042782 / 0.037411 (0.005371) |
| shard | 0.022994 / 0.014526 (0.008468) |
| shuffle | 0.029965 / 0.176557 (-0.146591) |
| sort | 0.049546 / 0.737135 (-0.687589) |
| train_test_split | 0.031684 / 0.296338 (-0.264655) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.411937 / 0.215209 (0.196728) |
| read 50000 | 4.108196 / 2.077655 (2.030541) |
| read_batch 50000 10 | 2.080431 / 1.504120 (0.576311) |
| read_batch 50000 100 | 1.861730 / 1.541195 (0.320536) |
| read_batch 50000 1000 | 1.916828 / 1.468490 (0.448338) |
| read_formatted numpy 5000 | 6.515880 / 4.584777 (1.931103) |
| read_formatted pandas 5000 | 5.736251 / 3.745712 (1.990539) |
| read_formatted tensorflow 5000 | 8.229895 / 5.269862 (2.960033) |
| read_formatted torch 5000 | 7.179900 / 4.565676 (2.614224) |
| read_formatted_batch numpy 5000 10 | 0.648365 / 0.424275 (0.224090) |
| read_formatted_batch numpy 5000 1000 | 0.010681 / 0.007607 (0.003074) |
| shuffled read 5000 | 0.528030 / 0.226044 (0.301985) |
| shuffled read 50000 | 5.292842 / 2.268929 (3.023913) |
| shuffled read_batch 50000 10 | 2.559464 / 55.444624 (-52.885161) |
| shuffled read_batch 50000 100 | 2.185899 / 6.876477 (-4.690578) |
| shuffled read_batch 50000 1000 | 2.232160 / 2.142072 (0.090087) |
| shuffled read_formatted numpy 5000 | 6.668583 / 4.805227 (1.863355) |
| shuffled read_formatted_batch numpy 5000 10 | 3.977590 / 6.500664 (-2.523074) |
| shuffled read_formatted_batch numpy 5000 1000 | 4.575795 / 0.075469 (4.500326) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 10.629642 / 1.841788 (8.787854) |
| map fast-tokenizer batched | 13.724528 / 8.074308 (5.650220) |
| map identity | 29.654911 / 10.191392 (19.463519) |
| map identity batched | 0.842432 / 0.680424 (0.162008) |
| map no-op batched | 0.593141 / 0.534201 (0.058940) |
| map no-op batched numpy | 0.744424 / 0.579283 (0.165141) |
| map no-op batched pandas | 0.556154 / 0.434364 (0.121790) |
| map no-op batched pytorch | 0.668952 / 0.540337 (0.128614) |
| map no-op batched tensorflow | 1.534631 / 1.386936 (0.147695) |
