Skip to content

Commit

Permalink
Rename GenerateMode to DownloadMode (huggingface#3759)
Browse files Browse the repository at this point in the history
* Rename GenerateMode to DownloadMode

* Implement DeprecatedEnum

* Deprecate GenerateMode

* Move DeprecatedEnum to deprecation_utils

* Fix merge
  • Loading branch information
albertvillanova committed Feb 22, 2022
1 parent b676285 commit d00e71a
Show file tree
Hide file tree
Showing 12 changed files with 129 additions and 68 deletions.
2 changes: 1 addition & 1 deletion docs/source/cache.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ After you download a dataset, control how it is loaded by :func:`datasets.load_d
>>> from datasets import load_dataset
>>> dataset = load_dataset('squad', download_mode='force_redownload')
Refer to :class:`datasets.GenerateMode` for a full list of download modes.
Refer to :class:`datasets.DownloadMode` for a full list of download modes.

Cache files
-----------
Expand Down
2 changes: 1 addition & 1 deletion docs/source/package_reference/builder_classes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Builder classes

.. autoclass:: datasets.DownloadManager

.. autoclass:: datasets.GenerateMode
.. autoclass:: datasets.DownloadMode

.. autoclass:: datasets.SplitGenerator

Expand Down
14 changes: 7 additions & 7 deletions src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
from .naming import camelcase_to_snakecase, filename_prefix_for_split
from .splits import Split, SplitDict, SplitGenerator
from .utils import logging
from .utils.download_manager import DownloadManager, GenerateMode
from .utils.download_manager import DownloadManager, DownloadMode
from .utils.file_utils import DownloadConfig, is_remote_url
from .utils.filelock import FileLock
from .utils.info_utils import get_size_checksum_dict, verify_checksums, verify_splits
Expand Down Expand Up @@ -477,7 +477,7 @@ def get_imported_module_dir(cls):
def download_and_prepare(
self,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[GenerateMode] = None,
download_mode: Optional[DownloadMode] = None,
ignore_verifications: bool = False,
try_from_hf_gcs: bool = True,
dl_manager: Optional[DownloadManager] = None,
Expand All @@ -489,7 +489,7 @@ def download_and_prepare(
Args:
download_config (Optional ``datasets.DownloadConfig``): specific download configuration parameters.
download_mode (Optional `datasets.GenerateMode`): select the download/generate mode - Default to REUSE_DATASET_IF_EXISTS
download_mode (Optional `datasets.DownloadMode`): select the download/generate mode - Default to REUSE_DATASET_IF_EXISTS
ignore_verifications (bool): Ignore the verifications of the downloaded/processed dataset information (checksums/size/splits/...)
save_infos (bool): Save the dataset information (checksums/size/splits/...)
try_from_hf_gcs (bool): If True, it will try to download the already prepared dataset from the Hf google cloud storage
Expand All @@ -500,15 +500,15 @@ def download_and_prepare(
If True, will get token from ~/.huggingface.
"""
download_mode = GenerateMode(download_mode or GenerateMode.REUSE_DATASET_IF_EXISTS)
download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
verify_infos = not ignore_verifications
base_path = base_path if base_path is not None else self.base_path
if dl_manager is None:
if download_config is None:
download_config = DownloadConfig(
cache_dir=self._cache_downloaded_dir,
force_download=bool(download_mode == GenerateMode.FORCE_REDOWNLOAD),
force_extract=bool(download_mode == GenerateMode.FORCE_REDOWNLOAD),
force_download=bool(download_mode == DownloadMode.FORCE_REDOWNLOAD),
force_extract=bool(download_mode == DownloadMode.FORCE_REDOWNLOAD),
use_etag=False,
use_auth_token=use_auth_token,
) # We don't use etag for data files to speed up the process
Expand All @@ -527,7 +527,7 @@ def download_and_prepare(
lock_path = os.path.join(self._cache_dir_root, self._cache_dir.replace(os.sep, "_") + ".lock")
with FileLock(lock_path):
data_exists = os.path.exists(self._cache_dir)
if data_exists and download_mode == GenerateMode.REUSE_DATASET_IF_EXISTS:
if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
logger.warning(f"Reusing dataset {self.name} ({self._cache_dir})")
# We need to update the info in case some splits were added in the meantime
# for example when calling load_dataset from multiple workers.
Expand Down
6 changes: 3 additions & 3 deletions src/datasets/commands/run_beam.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from datasets.builder import DatasetBuilder
from datasets.commands import BaseDatasetsCLICommand
from datasets.load import dataset_module_factory, import_main_class
from datasets.utils.download_manager import DownloadConfig, GenerateMode
from datasets.utils.download_manager import DownloadConfig, DownloadMode


def run_beam_command_factory(args):
Expand Down Expand Up @@ -122,9 +122,9 @@ def run(self):

for builder in builders:
builder.download_and_prepare(
download_mode=GenerateMode.REUSE_CACHE_IF_EXISTS
download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS
if not self._force_redownload
else GenerateMode.FORCE_REDOWNLOAD,
else DownloadMode.FORCE_REDOWNLOAD,
download_config=DownloadConfig(cache_dir=config.DOWNLOADED_DATASETS_PATH),
save_infos=self._save_infos,
ignore_verifications=self._ignore_verifications,
Expand Down
6 changes: 3 additions & 3 deletions src/datasets/commands/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from datasets.builder import DatasetBuilder
from datasets.commands import BaseDatasetsCLICommand
from datasets.load import dataset_module_factory, import_main_class
from datasets.utils.download_manager import GenerateMode
from datasets.utils.download_manager import DownloadMode
from datasets.utils.filelock import logger as fl_logger
from datasets.utils.logging import ERROR, get_logger

Expand Down Expand Up @@ -154,9 +154,9 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:
for j, builder in enumerate(get_builders()):
print(f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})")
builder.download_and_prepare(
download_mode=GenerateMode.REUSE_CACHE_IF_EXISTS
download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS
if not self._force_redownload
else GenerateMode.FORCE_REDOWNLOAD,
else DownloadMode.FORCE_REDOWNLOAD,
ignore_verifications=self._ignore_verifications,
try_from_hf_gcs=False,
)
Expand Down
18 changes: 9 additions & 9 deletions src/datasets/inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
metric_module_factory,
)
from .utils import DownloadConfig
from .utils.download_manager import GenerateMode
from .utils.download_manager import DownloadMode
from .utils.logging import get_logger
from .utils.streaming_download_manager import StreamingDownloadManager
from .utils.version import Version
Expand Down Expand Up @@ -125,7 +125,7 @@ def get_dataset_infos(
path: str,
data_files: Optional[Union[Dict, List, str]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[GenerateMode] = None,
download_mode: Optional[DownloadMode] = None,
revision: Optional[Union[str, Version]] = None,
use_auth_token: Optional[Union[bool, str]] = None,
**config_kwargs,
Expand All @@ -146,7 +146,7 @@ def get_dataset_infos(
- it will also try to load it from the master branch if it's not available at the local version of the lib.
Specifying a version that is different from your local version of the lib might cause compatibility issues.
download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
download_mode (:class:`GenerateMode`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
download_mode (:class:`DownloadMode`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
data_files (:obj:`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration.
use_auth_token (``str`` or ``bool``, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
If True, will get token from `"~/.huggingface"`.
Expand Down Expand Up @@ -178,7 +178,7 @@ def get_dataset_config_names(
path: str,
revision: Optional[Union[str, Version]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[GenerateMode] = None,
download_mode: Optional[DownloadMode] = None,
force_local_path: Optional[str] = None,
dynamic_modules_path: Optional[str] = None,
data_files: Optional[Union[Dict, List, str]] = None,
Expand All @@ -200,7 +200,7 @@ def get_dataset_config_names(
- it will also try to load it from the master branch if it's not available at the local version of the lib.
Specifying a version that is different from your local version of the lib might cause compatibility issues.
download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
download_mode (:class:`GenerateMode`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
download_mode (:class:`DownloadMode`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
force_local_path (Optional str): Optional path to a local path to download and prepare the script to.
Used to inspect or modify the script folder.
dynamic_modules_path (Optional str, defaults to HF_MODULES_CACHE / "datasets_modules", i.e. ~/.cache/huggingface/modules/datasets_modules):
Expand Down Expand Up @@ -229,7 +229,7 @@ def get_dataset_config_info(
config_name: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[GenerateMode] = None,
download_mode: Optional[DownloadMode] = None,
revision: Optional[Union[str, Version]] = None,
use_auth_token: Optional[Union[bool, str]] = None,
**config_kwargs,
Expand All @@ -246,7 +246,7 @@ def get_dataset_config_info(
config_name (:obj:`str`, optional): Defining the name of the dataset configuration.
data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
download_config (:class:`~utils.DownloadConfig`, optional): Specific download configuration parameters.
download_mode (:class:`GenerateMode`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
download_mode (:class:`DownloadMode`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load:
- For canonical datasets in the `huggingface/datasets` library like "squad", the default version of the module is the local version of the lib.
Expand Down Expand Up @@ -291,7 +291,7 @@ def get_dataset_split_names(
config_name: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[GenerateMode] = None,
download_mode: Optional[DownloadMode] = None,
revision: Optional[Union[str, Version]] = None,
use_auth_token: Optional[Union[bool, str]] = None,
**config_kwargs,
Expand All @@ -308,7 +308,7 @@ def get_dataset_split_names(
config_name (:obj:`str`, optional): Defining the name of the dataset configuration.
data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
download_config (:class:`~utils.DownloadConfig`, optional): Specific download configuration parameters.
download_mode (:class:`GenerateMode`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
download_mode (:class:`DownloadMode`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load:
- For canonical datasets in the `huggingface/datasets` library like "squad", the default version of the module is the local version of the lib.
Expand Down
Loading

0 comments on commit d00e71a

Please sign in to comment.