
Commit

Merge branch 'master' into try-to-make-tests-run-faster
lhoestq committed Apr 29, 2021
2 parents 21cd684 + 74f87cc commit c604d37
Showing 23 changed files with 272 additions and 146 deletions.
4 changes: 2 additions & 2 deletions .circleci/config.yml
@@ -15,7 +15,7 @@ jobs:
- run: source venv/bin/activate
- run: pip install .[tests]
- run: pip install -r additional-tests-requirements.txt --no-deps
- run: pip install pyarrow --upgrade
- run: pip install pyarrow==3.0.0
- run: HF_SCRIPTS_VERSION=master python -m pytest -sv ./tests/

run_dataset_script_tests_pyarrow_1:
@@ -47,7 +47,7 @@ jobs:
- run: "& venv/Scripts/activate.ps1"
- run: pip install .[tests]
- run: pip install -r additional-tests-requirements.txt --no-deps
- run: pip install pyarrow --upgrade
- run: pip install pyarrow==3.0.0
- run: $env:HF_SCRIPTS_VERSION="master"
- run: python -m pytest -sv ./tests/

2 changes: 1 addition & 1 deletion datasets/newsph_nli/dataset_infos.json
@@ -1 +1 @@
{"default": {"description": " First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": " @article{cruz2020investigating,\n title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n journal={arXiv preprint arXiv:2010.11574},\n year={2020}\n }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 342036728, "size_in_bytes": 418602015}}
{"default": {"description": " First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": " @article{cruz2020investigating,\n title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n journal={arXiv preprint arXiv:2010.11574},\n year={2020}\n }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 3283665, "num_examples": 9000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 190809794, "size_in_bytes": 267375081}}
Binary file modified datasets/newsph_nli/dummy/1.0.0/dummy_data.zip
Binary file not shown.
20 changes: 11 additions & 9 deletions datasets/newsph_nli/newsph_nli.py
@@ -21,22 +21,24 @@


_DESCRIPTION = """\
First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.
First benchmark dataset for sentence entailment in the low-resource Filipino language.
Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs,
in 70-15-15 split for training, validation, and testing.
"""

_CITATION = """\
@article{cruz2020investigating,
title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},
author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},
journal={arXiv preprint arXiv:2010.11574},
year={2020}
}
@article{cruz2020investigating,
title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},
author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},
journal={arXiv preprint arXiv:2010.11574},
year={2020}
}
"""

_HOMEPAGE = "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_LICENSE = "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0"

_URL = "https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip"

@@ -68,7 +70,7 @@ def _split_generators(self, dl_manager):
data_dir = dl_manager.download_and_extract(_URL)
download_path = os.path.join(data_dir, "newsph-nli")
train_path = os.path.join(download_path, "train.csv")
test_path = os.path.join(download_path, "train.csv")
test_path = os.path.join(download_path, "test.csv")
validation_path = os.path.join(download_path, "valid.csv")

return [
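The hunk above fixes the test split to read test.csv instead of train.csv. A quick sanity check, as a hedged sketch: the updated dataset_infos.json in this commit records 9,000 test examples (instead of a duplicated 420,000-row train split), so loading the split by name should reflect that, assuming the corrected script is the one being resolved.

```python
# Minimal sketch: verify the corrected test split size (expected count taken from
# the updated dataset_infos.json in this commit; requires network access).
from datasets import load_dataset

test_split = load_dataset("newsph_nli", split="test")
print(len(test_split))  # expected: 9000 with the corrected test.csv path
```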
2 changes: 1 addition & 1 deletion setup.py
@@ -74,7 +74,7 @@
"numpy>=1.17",
# Backend and serialization.
# Minimum 1.0.0 to avoid permission errors on windows when using the compute layer on memory mapped data
"pyarrow>=1.0.0",
"pyarrow>=1.0.0<4.0.0",
# For smart caching dataset processing
"dill",
# For performance gains with apache arrow
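One observation on the hunk above, not part of the commit: PEP 508 specifier clauses are normally comma-separated, so the intended constraint is presumably the one sketched below.

```python
# Hedged sketch of the conventional comma-separated form of the same bound.
install_requires = [
    "numpy>=1.17",
    # Minimum 1.0.0 to avoid permission errors on windows when using the compute layer on memory mapped data
    "pyarrow>=1.0.0,<4.0.0",
    # For smart caching dataset processing
    "dill",
]
```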
84 changes: 70 additions & 14 deletions src/datasets/arrow_dataset.py
@@ -53,7 +53,7 @@
update_fingerprint,
)
from .formatting import format_table, get_format_type_from_alias, get_formatter, query_table
from .info import DATASET_INFO_FILENAME, DatasetInfo
from .info import DatasetInfo
from .search import IndexableMixin
from .splits import NamedSplit
from .table import InMemoryTable, MemoryMappedTable, Table, concat_tables, list_table_cache_files
@@ -616,7 +616,9 @@ def save_to_disk(self, dataset_path: str, fs=None):
Path(dataset_path, config.DATASET_STATE_JSON_FILENAME).as_posix(), "w", encoding="utf-8"
) as state_file:
json.dump(state, state_file, indent=2, sort_keys=True)
with fs.open(Path(dataset_path, DATASET_INFO_FILENAME).as_posix(), "w", encoding="utf-8") as dataset_info_file:
with fs.open(
Path(dataset_path, config.DATASET_INFO_FILENAME).as_posix(), "w", encoding="utf-8"
) as dataset_info_file:
json.dump(dataset_info, dataset_info_file, indent=2, sort_keys=True)
logger.info("Dataset saved in {}".format(dataset_path))

@@ -653,7 +655,9 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
Path(dataset_path, config.DATASET_STATE_JSON_FILENAME).as_posix(), "r", encoding="utf-8"
) as state_file:
state = json.load(state_file)
with open(Path(dataset_path, DATASET_INFO_FILENAME).as_posix(), "r", encoding="utf-8") as dataset_info_file:
with open(
Path(dataset_path, config.DATASET_INFO_FILENAME).as_posix(), "r", encoding="utf-8"
) as dataset_info_file:
dataset_info = DatasetInfo.from_dict(json.load(dataset_info_file))

dataset_size = estimate_dataset_size(
@@ -771,6 +775,7 @@ def class_encode_column(self, column: str) -> "Dataset":
class_names = sorted(dset.unique(column))
dst_feat = ClassLabel(names=class_names)
dset = dset.map(lambda batch: {column: dst_feat.str2int(batch)}, input_columns=column, batched=True)
dset = concatenate_datasets([self.remove_columns([column]), dset], axis=1)

new_features = copy.deepcopy(dset.features)
new_features[column] = dst_feat
@@ -838,7 +843,7 @@ def flatten(self, new_fingerprint, max_depth=16) -> "Dataset":
else:
break
dataset.info.features = Features.from_arrow_schema(dataset._data.schema)
self._data = update_metadata_with_features(self._data, self.features)
dataset._data = update_metadata_with_features(dataset._data, dataset.features)
logger.info(
"Flattened dataset from depth {} to depth {}.".format(depth, 1 if depth + 1 < max_depth else "unknown")
)
@@ -1018,7 +1023,7 @@ def remove_columns(self, column_names: Union[str, List[str]], new_fingerprint) -
del dataset._info.features[column_name]

dataset._data = dataset._data.drop(column_names)
dataset._data = update_metadata_with_features(dataset._data, self.features)
dataset._data = update_metadata_with_features(dataset._data, dataset.features)
dataset._fingerprint = new_fingerprint
return dataset

@@ -1109,7 +1114,7 @@ def rename(columns):
)

dataset._data = dataset._data.rename_columns(new_column_names)
dataset._data = update_metadata_with_features(dataset._data, self.features)
dataset._data = update_metadata_with_features(dataset._data, dataset.features)
dataset._fingerprint = new_fingerprint
return dataset

@@ -1771,8 +1776,18 @@ def init_buffer_and_writer():
# Optionally initialize the writer as a context manager
with contextlib.ExitStack() as stack:
try:
# Only load the columns we actually need
if input_columns:
input_dataset = self.with_format(
self._format_type, columns=input_columns, output_all_columns=False, **self._format_kwargs
)
if remove_columns:
remove_columns = list(set(remove_columns) & set(input_columns))
else:
input_dataset = self

# Loop over single examples or batches and write to buffer/file if examples are to be updated
pbar_iterable = self if not batched else range(0, len(self), batch_size)
pbar_iterable = input_dataset if not batched else range(0, len(input_dataset), batch_size)
pbar_unit = "ex" if not batched else "ba"
pbar_desc = "#" + str(rank) if rank is not None else None
pbar = tqdm(pbar_iterable, disable=not_verbose, position=rank, unit=pbar_unit, desc=pbar_desc)
@@ -1790,13 +1805,18 @@ def init_buffer_and_writer():
writer.write(example)
else:
for i in pbar:
if drop_last_batch and i + batch_size > self.num_rows:
if drop_last_batch and i + batch_size > input_dataset.num_rows:
continue
batch = self[i : i + batch_size]
indices = list(range(*(slice(i, i + batch_size).indices(self.num_rows)))) # Something simpler?
batch = input_dataset[i : i + batch_size]
indices = list(
range(*(slice(i, i + batch_size).indices(input_dataset.num_rows)))
) # Something simpler?
try:
batch = apply_function_on_filtered_inputs(
batch, indices, check_same_num_examples=len(self.list_indexes()) > 0, offset=offset
batch,
indices,
check_same_num_examples=len(input_dataset.list_indexes()) > 0,
offset=offset,
)
except NumExamplesMismatch:
raise DatasetTransformationNotAllowedError(
@@ -2633,6 +2653,28 @@ def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> Un
for offset in range(0, len(self), batch_size)
)

def to_json(
self,
path_or_buf: Union[PathLike, BinaryIO],
batch_size: Optional[int] = None,
**to_json_kwargs,
) -> int:
"""Exports the dataset to JSON.
Args:
path_or_buf (``PathLike`` or ``FileOrBuffer``): Either a path to a file or a BinaryIO.
batch_size (Optional ``int``): Size of the batch to load in memory and write at once.
Defaults to :obj:`datasets.config.DEFAULT_MAX_BATCH_SIZE`.
to_json_kwargs: Parameters to pass to pandas's :func:`pandas.DataFrame.to_json`
Returns:
int: The number of characters or bytes written
"""
# Dynamic import to avoid circular dependency
from .io.json import JsonDatasetWriter

return JsonDatasetWriter(self, path_or_buf, batch_size=batch_size, **to_json_kwargs).write()

def to_pandas(
self, batch_size: Optional[int] = None, batched: bool = False
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
@@ -2858,10 +2900,12 @@ def add_elasticsearch_index(
)
return self

def add_item(self, item: dict):
@transmit_format
@fingerprint_transform(inplace=False)
def add_item(self, item: dict, new_fingerprint: str):
"""Add item to Dataset.
.. versionadded:: 1.6
.. versionadded:: 1.7
Args:
item (dict): Item data to be added.
@@ -2875,7 +2919,19 @@ def add_item(self, item: dict):
item_table = item_table.cast(schema)
# Concatenate tables
table = concat_tables([self._data, item_table])
return Dataset(table)
if self._indices is None:
indices_table = None
else:
item_indices_array = pa.array([len(self._data)], type=pa.uint64())
item_indices_table = InMemoryTable.from_arrays([item_indices_array], names=["indices"])
indices_table = concat_tables([self._indices, item_indices_table])
return Dataset(
table,
info=copy.deepcopy(self.info),
split=self.split,
indices_table=indices_table,
fingerprint=new_fingerprint,
)


def concatenate_datasets(
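Taken together, the arrow_dataset.py changes above add three user-facing pieces: `map()` can restrict loading to `input_columns`, `Dataset.to_json()` is introduced, and `add_item()` now goes through the format and fingerprint decorators while preserving info, split, and any indices mapping. A minimal usage sketch under those assumptions (the toy columns and the output filename are illustrative, not from this commit):

```python
from datasets import Dataset

# Toy in-memory dataset (illustrative column names).
ds = Dataset.from_dict(
    {
        "premise": ["a premise", "another premise"],
        "hypothesis": ["a hypothesis", "another hypothesis"],
        "label": [0, 1],
    }
)

# add_item(): after this change the returned dataset keeps info/split, carries over
# any indices mapping, and gets a fresh fingerprint computed by the decorator.
ds = ds.add_item({"premise": "a third premise", "hypothesis": "a third hypothesis", "label": 0})

# map() with input_columns: only the requested columns are loaded while iterating,
# instead of formatting every column of each example or batch.
ds = ds.map(lambda premise: {"premise_len": len(premise)}, input_columns="premise")

# to_json(): new export helper built on JsonDatasetWriter; returns the number of
# characters or bytes written.
num_written = ds.to_json("newsph_nli_sample.json", batch_size=2)
print(num_written)
```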
27 changes: 7 additions & 20 deletions src/datasets/builder.py
@@ -36,14 +36,7 @@
from .arrow_writer import ArrowWriter, BeamWriter
from .dataset_dict import DatasetDict
from .fingerprint import Hasher
from .info import (
DATASET_INFO_FILENAME,
DATASET_INFOS_DICT_FILE_NAME,
LICENSE_FILENAME,
DatasetInfo,
DatasetInfosDict,
PostProcessedInfo,
)
from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
from .naming import camelcase_to_snakecase, filename_prefix_for_split
from .splits import Split, SplitDict, SplitGenerator
from .utils.download_manager import DownloadManager, GenerateMode
@@ -55,12 +48,6 @@

logger = get_logger(__name__)

FORCE_REDOWNLOAD = GenerateMode.FORCE_REDOWNLOAD
REUSE_CACHE_IF_EXISTS = GenerateMode.REUSE_CACHE_IF_EXISTS
REUSE_DATASET_IF_EXISTS = GenerateMode.REUSE_DATASET_IF_EXISTS

MAX_DIRECTORY_NAME_LENGTH = 255


class InvalidConfigName(ValueError):
pass
@@ -175,7 +162,7 @@ def create_config_id(self, config_kwargs: dict, custom_features: Optional[Featur

if suffix:
config_id = self.name + "-" + suffix
if len(config_id) > MAX_DIRECTORY_NAME_LENGTH:
if len(config_id) > config.MAX_DATASET_CONFIG_ID_READABLE_LENGTH:
config_id = self.name + "-" + Hasher.hash(suffix)
return config_id
else:
@@ -297,7 +284,7 @@ def manual_download_instructions(self) -> Optional[str]:
@classmethod
def get_all_exported_dataset_infos(cls) -> dict:
"""Empty dict if doesn't exist"""
dset_infos_file_path = os.path.join(cls.get_imported_module_dir(), DATASET_INFOS_DICT_FILE_NAME)
dset_infos_file_path = os.path.join(cls.get_imported_module_dir(), config.DATASETDICT_INFOS_FILENAME)
if os.path.exists(dset_infos_file_path):
return DatasetInfosDict.from_directory(cls.get_imported_module_dir())
return {}
@@ -496,7 +483,7 @@ def download_and_prepare(
if download_config is None:
download_config = DownloadConfig(
cache_dir=os.path.join(self._cache_dir_root, "downloads"),
force_download=bool(download_mode == FORCE_REDOWNLOAD),
force_download=bool(download_mode == GenerateMode.FORCE_REDOWNLOAD),
use_etag=False,
use_auth_token=use_auth_token,
) # We don't use etag for data files to speed up the process
@@ -515,7 +502,7 @@
lock_path = os.path.join(self._cache_dir_root, self._cache_dir.replace(os.sep, "_") + ".lock")
with FileLock(lock_path):
data_exists = os.path.exists(self._cache_dir)
if data_exists and download_mode == REUSE_DATASET_IF_EXISTS:
if data_exists and download_mode == GenerateMode.REUSE_DATASET_IF_EXISTS:
logger.warning("Reusing dataset %s (%s)", self.name, self._cache_dir)
# We need to update the info in case some splits were added in the meantime
# for example when calling load_dataset from multiple workers.
@@ -1174,9 +1161,9 @@ def _save_info(self):
import apache_beam as beam

fs = beam.io.filesystems.FileSystems
with fs.create(os.path.join(self._cache_dir, DATASET_INFO_FILENAME)) as f:
with fs.create(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)) as f:
self.info._dump_info(f)
with fs.create(os.path.join(self._cache_dir, LICENSE_FILENAME)) as f:
with fs.create(os.path.join(self._cache_dir, config.LICENSE_FILENAME)) as f:
self.info._dump_license(f)

def _prepare_split(self, split_generator, pipeline):
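builder.py now references the `config.*` filename constants and the `GenerateMode` enum directly instead of the removed module-level aliases. A hedged sketch of how a caller selects a download mode after this change (the dataset name is illustrative; the import path matches the one used in builder.py above):

```python
from datasets import load_dataset
from datasets.utils.download_manager import GenerateMode

# Force a fresh download instead of reusing the cached copy
# (equivalent to the removed FORCE_REDOWNLOAD module alias).
ds = load_dataset("newsph_nli", split="validation", download_mode=GenerateMode.FORCE_REDOWNLOAD)
```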

1 comment on commit c604d37

@github-actions

Show benchmarks

PyArrow==1.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.023074 / 0.011353 (0.011721) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.015821 / 0.011008 (0.004813) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.050749 / 0.038508 (0.012241) |
| read_batch_unformated after write_array2d | 0.042053 / 0.023109 (0.018944) |
| read_batch_unformated after write_flattened_sequence | 0.400631 / 0.275898 (0.124733) |
| read_batch_unformated after write_nested_sequence | 0.436141 / 0.323480 (0.112661) |
| read_col_formatted_as_numpy after write_array2d | 0.011688 / 0.007986 (0.003702) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004921 / 0.004328 (0.000592) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.011732 / 0.004250 (0.007481) |
| read_col_unformated after write_array2d | 0.055060 / 0.037052 (0.018007) |
| read_col_unformated after write_flattened_sequence | 0.398213 / 0.258489 (0.139724) |
| read_col_unformated after write_nested_sequence | 0.439147 / 0.293841 (0.145306) |
| read_formatted_as_numpy after write_array2d | 0.158305 / 0.128546 (0.029759) |
| read_formatted_as_numpy after write_flattened_sequence | 0.126251 / 0.075646 (0.050605) |
| read_formatted_as_numpy after write_nested_sequence | 0.456789 / 0.419271 (0.037518) |
| read_unformated after write_array2d | 0.419933 / 0.043533 (0.376401) |
| read_unformated after write_flattened_sequence | 0.398202 / 0.255139 (0.143063) |
| read_unformated after write_nested_sequence | 0.421931 / 0.283200 (0.138731) |
| write_array2d | 1.680163 / 0.141683 (1.538480) |
| write_flattened_sequence | 1.906887 / 1.452155 (0.454732) |
| write_nested_sequence | 1.959349 / 1.492716 (0.466633) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.006465 / 0.018006 (-0.011541) |
| get_batch_of_1024_rows | 0.000561 / 0.000490 (0.000071) |
| get_first_row | 0.000184 / 0.000200 (-0.000016) |
| get_last_row | 0.000048 / 0.000054 (-0.000006) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.047024 / 0.037411 (0.009613) |
| shard | 0.022289 / 0.014526 (0.007763) |
| shuffle | 0.030202 / 0.176557 (-0.146354) |
| sort | 0.049937 / 0.737135 (-0.687198) |
| train_test_split | 0.030917 / 0.296338 (-0.265422) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.424092 / 0.215209 (0.208883) |
| read 50000 | 4.311547 / 2.077655 (2.233893) |
| read_batch 50000 10 | 2.300000 / 1.504120 (0.795880) |
| read_batch 50000 100 | 2.166291 / 1.541195 (0.625096) |
| read_batch 50000 1000 | 2.221121 / 1.468490 (0.752631) |
| read_formatted numpy 5000 | 6.668090 / 4.584777 (2.083313) |
| read_formatted pandas 5000 | 5.996952 / 3.745712 (2.251240) |
| read_formatted tensorflow 5000 | 8.321086 / 5.269862 (3.051224) |
| read_formatted torch 5000 | 7.326802 / 4.565676 (2.761125) |
| read_formatted_batch numpy 5000 10 | 0.663548 / 0.424275 (0.239273) |
| read_formatted_batch numpy 5000 1000 | 0.010876 / 0.007607 (0.003269) |
| shuffled read 5000 | 0.541485 / 0.226044 (0.315440) |
| shuffled read 50000 | 5.397095 / 2.268929 (3.128166) |
| shuffled read_batch 50000 10 | 2.758916 / 55.444624 (-52.685709) |
| shuffled read_batch 50000 100 | 2.397987 / 6.876477 (-4.478489) |
| shuffled read_batch 50000 1000 | 2.442473 / 2.142072 (0.300401) |
| shuffled read_formatted numpy 5000 | 6.846216 / 4.805227 (2.040989) |
| shuffled read_formatted_batch numpy 5000 10 | 3.878391 / 6.500664 (-2.622273) |
| shuffled read_formatted_batch numpy 5000 1000 | 6.420079 / 0.075469 (6.344610) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 10.713706 / 1.841788 (8.871918) |
| map fast-tokenizer batched | 14.076219 / 8.074308 (6.001911) |
| map identity | 30.318086 / 10.191392 (20.126694) |
| map identity batched | 0.934785 / 0.680424 (0.254362) |
| map no-op batched | 0.632912 / 0.534201 (0.098711) |
| map no-op batched numpy | 0.757803 / 0.579283 (0.178519) |
| map no-op batched pandas | 0.614218 / 0.434364 (0.179854) |
| map no-op batched pytorch | 0.694845 / 0.540337 (0.154508) |
| map no-op batched tensorflow | 1.573175 / 1.386936 (0.186239) |
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.022909 / 0.011353 (0.011556) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.015044 / 0.011008 (0.004036) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.051444 / 0.038508 (0.012936) |
| read_batch_unformated after write_array2d | 0.040595 / 0.023109 (0.017486) |
| read_batch_unformated after write_flattened_sequence | 0.335234 / 0.275898 (0.059336) |
| read_batch_unformated after write_nested_sequence | 0.379933 / 0.323480 (0.056453) |
| read_col_formatted_as_numpy after write_array2d | 0.011320 / 0.007986 (0.003334) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004902 / 0.004328 (0.000574) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.012059 / 0.004250 (0.007808) |
| read_col_unformated after write_array2d | 0.060512 / 0.037052 (0.023460) |
| read_col_unformated after write_flattened_sequence | 0.337350 / 0.258489 (0.078861) |
| read_col_unformated after write_nested_sequence | 0.383572 / 0.293841 (0.089731) |
| read_formatted_as_numpy after write_array2d | 0.157500 / 0.128546 (0.028954) |
| read_formatted_as_numpy after write_flattened_sequence | 0.116012 / 0.075646 (0.040365) |
| read_formatted_as_numpy after write_nested_sequence | 0.439529 / 0.419271 (0.020257) |
| read_unformated after write_array2d | 0.420759 / 0.043533 (0.377226) |
| read_unformated after write_flattened_sequence | 0.337722 / 0.255139 (0.082583) |
| read_unformated after write_nested_sequence | 0.366860 / 0.283200 (0.083661) |
| write_array2d | 1.662852 / 0.141683 (1.521169) |
| write_flattened_sequence | 1.896534 / 1.452155 (0.444379) |
| write_nested_sequence | 1.974053 / 1.492716 (0.481337) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.006414 / 0.018006 (-0.011592) |
| get_batch_of_1024_rows | 0.000495 / 0.000490 (0.000006) |
| get_first_row | 0.000193 / 0.000200 (-0.000007) |
| get_last_row | 0.000048 / 0.000054 (-0.000007) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.042782 / 0.037411 (0.005371) |
| shard | 0.022994 / 0.014526 (0.008468) |
| shuffle | 0.029965 / 0.176557 (-0.146591) |
| sort | 0.049546 / 0.737135 (-0.687589) |
| train_test_split | 0.031684 / 0.296338 (-0.264655) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.411937 / 0.215209 (0.196728) |
| read 50000 | 4.108196 / 2.077655 (2.030541) |
| read_batch 50000 10 | 2.080431 / 1.504120 (0.576311) |
| read_batch 50000 100 | 1.861730 / 1.541195 (0.320536) |
| read_batch 50000 1000 | 1.916828 / 1.468490 (0.448338) |
| read_formatted numpy 5000 | 6.515880 / 4.584777 (1.931103) |
| read_formatted pandas 5000 | 5.736251 / 3.745712 (1.990539) |
| read_formatted tensorflow 5000 | 8.229895 / 5.269862 (2.960033) |
| read_formatted torch 5000 | 7.179900 / 4.565676 (2.614224) |
| read_formatted_batch numpy 5000 10 | 0.648365 / 0.424275 (0.224090) |
| read_formatted_batch numpy 5000 1000 | 0.010681 / 0.007607 (0.003074) |
| shuffled read 5000 | 0.528030 / 0.226044 (0.301985) |
| shuffled read 50000 | 5.292842 / 2.268929 (3.023913) |
| shuffled read_batch 50000 10 | 2.559464 / 55.444624 (-52.885161) |
| shuffled read_batch 50000 100 | 2.185899 / 6.876477 (-4.690578) |
| shuffled read_batch 50000 1000 | 2.232160 / 2.142072 (0.090087) |
| shuffled read_formatted numpy 5000 | 6.668583 / 4.805227 (1.863355) |
| shuffled read_formatted_batch numpy 5000 10 | 3.977590 / 6.500664 (-2.523074) |
| shuffled read_formatted_batch numpy 5000 1000 | 4.575795 / 0.075469 (4.500326) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 10.629642 / 1.841788 (8.787854) |
| map fast-tokenizer batched | 13.724528 / 8.074308 (5.650220) |
| map identity | 29.654911 / 10.191392 (19.463519) |
| map identity batched | 0.842432 / 0.680424 (0.162008) |
| map no-op batched | 0.593141 / 0.534201 (0.058940) |
| map no-op batched numpy | 0.744424 / 0.579283 (0.165141) |
| map no-op batched pandas | 0.556154 / 0.434364 (0.121790) |
| map no-op batched pytorch | 0.668952 / 0.540337 (0.128614) |
| map no-op batched tensorflow | 1.534631 / 1.386936 (0.147695) |
