
Add Image feature #3163

Merged: 69 commits, merged on Dec 6, 2021

Commits (69)
42956f1
Initial commit
mariosasko Oct 21, 2021
93440cb
Add basic decoding
mariosasko Oct 23, 2021
4baaae7
Replace features.Audio with Audio
mariosasko Oct 23, 2021
be25e24
Add Image to package reference
mariosasko Oct 24, 2021
4660ce8
Use np.array
mariosasko Oct 25, 2021
b80633a
Update error msg
mariosasko Oct 25, 2021
ac25393
Add mode and channel decoding
mariosasko Oct 26, 2021
9946969
Fix return value
mariosasko Oct 26, 2021
ff53e94
Finish decoding
mariosasko Oct 26, 2021
d1a91af
Make CI happy
mariosasko Oct 26, 2021
e628c9c
Some more fixes
mariosasko Oct 26, 2021
bec851f
Minor doc fix
mariosasko Oct 27, 2021
fab6e0f
Remove animated option
mariosasko Oct 28, 2021
539ffb5
Pin version
mariosasko Oct 28, 2021
924f94b
Remove unused imports in setup.py
mariosasko Oct 29, 2021
69eb82c
Add vision requirements to setup.py
mariosasko Oct 29, 2021
9a1a8ff
Add initial tests
mariosasko Oct 29, 2021
9d29246
Delete other formats
mariosasko Nov 3, 2021
69b2f90
Make Image feature hashable
mariosasko Nov 3, 2021
50bee0a
Add more tests
mariosasko Nov 3, 2021
4533c51
Support numpy array in alter data check in TypedSequence
mariosasko Nov 3, 2021
1eec068
Fix TypedSequence converion
mariosasko Nov 3, 2021
3323d41
Finish tests
mariosasko Nov 3, 2021
bcc4ed4
Merge conflicts
mariosasko Nov 3, 2021
db9a3eb
Update Image - add ImageExtensionType and supporting functions
mariosasko Nov 10, 2021
d8e3ace
Update encoding functions
mariosasko Nov 10, 2021
f252dcb
Add support in TypedSequence for ImageExtensionType
mariosasko Nov 10, 2021
c7b905d
Add tests
mariosasko Nov 10, 2021
d61e334
Merge branch 'master' of https://github.com/huggingface/datasets into…
mariosasko Nov 10, 2021
d4d413a
Remove unused import
mariosasko Nov 10, 2021
c095040
Fix doc and style
mariosasko Nov 10, 2021
a5c3d8e
Fix doc indentation
mariosasko Nov 11, 2021
eab101a
Improve comment
mariosasko Nov 12, 2021
8a5449a
Merge branch 'master' of https://github.com/huggingface/datasets into…
mariosasko Nov 12, 2021
87b2504
Merge branch 'master' of https://github.com/huggingface/datasets into…
mariosasko Nov 16, 2021
2d63808
Return single image instead of dict
mariosasko Nov 17, 2021
2be6d1f
Fix merge conflict
mariosasko Nov 17, 2021
c21efa2
Return PIL Image and not dict
mariosasko Nov 19, 2021
856d2bc
Encode dict
mariosasko Nov 19, 2021
0fa8a6f
Update tests
mariosasko Nov 19, 2021
462210f
Style
mariosasko Nov 19, 2021
6391d3c
np.ndarray encoding/decoding
mariosasko Nov 22, 2021
ce82bff
Minor improvements
mariosasko Nov 23, 2021
95828bf
PIL Image support in cast_to_python_objects
mariosasko Nov 23, 2021
64382a2
Test cast
mariosasko Nov 23, 2021
0152aab
Doc fix
mariosasko Nov 23, 2021
de91d49
Extension type fixes
mariosasko Nov 23, 2021
f488d06
Style
mariosasko Nov 23, 2021
efe96e9
Use types_mapper in Dataset.to_pandas
mariosasko Nov 24, 2021
8c0f364
Add pandas extension array for image type
mariosasko Nov 24, 2021
30f0eb7
Update tests
mariosasko Nov 24, 2021
5c059d9
Merge branch 'master' of https://github.com/huggingface/datasets into…
mariosasko Nov 24, 2021
8df9479
image type inference
lhoestq Nov 26, 2021
78436d2
Remvoe cast_to_python test after Quentin's change
mariosasko Nov 26, 2021
db5ec0d
Improve tests
mariosasko Nov 26, 2021
c737af7
Add storage type
mariosasko Nov 29, 2021
b0425ba
Improve tests
mariosasko Nov 29, 2021
768b9f1
Merge branch 'master' of https://github.com/huggingface/datasets into…
mariosasko Nov 29, 2021
544b093
Test map that returns np.ndarray
mariosasko Nov 30, 2021
606215d
Rename functions
mariosasko Nov 30, 2021
d7b204c
Add streaming test
mariosasko Nov 30, 2021
31f02ad
Use image struct in all situations
mariosasko Dec 1, 2021
6ee758e
Update src/datasets/features/image.py - encode_example type hint
mariosasko Dec 1, 2021
f294e5b
Update src/datasets/features/image.py -list_image_compression_formats…
mariosasko Dec 1, 2021
dff7b46
Merge branch 'master' of https://github.com/huggingface/datasets into…
mariosasko Dec 2, 2021
25d8902
Support str in encode_objects_to_image_dicts
mariosasko Dec 2, 2021
a940a8f
Merge branch 'add-image-feature' of github.com:mariosasko/datasets-1 …
mariosasko Dec 2, 2021
1109699
Update src/datasets/features/image.py - objects_to_list_of_image_dict…
mariosasko Dec 2, 2021
79c87f8
Style
mariosasko Dec 2, 2021
2 changes: 1 addition & 1 deletion datasets/arabic_speech_corpus/arabic_speech_corpus.py

@@ -85,7 +85,7 @@ def _info(self):
{
"file": datasets.Value("string"),
"text": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=48_000),
+ "audio": datasets.Audio(sampling_rate=48_000),
"phonetic": datasets.Value("string"),
"orthographic": datasets.Value("string"),
}
2 changes: 1 addition & 1 deletion datasets/common_language/common_language.py

@@ -110,7 +110,7 @@ def _info(self):
{
"client_id": datasets.Value("string"),
"path": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=48_000),
+ "audio": datasets.Audio(sampling_rate=48_000),
"sentence": datasets.Value("string"),
"age": datasets.Value("string"),
"gender": datasets.Value("string"),
2 changes: 1 addition & 1 deletion datasets/common_voice/common_voice.py

@@ -631,7 +631,7 @@ def _info(self):
{
"client_id": datasets.Value("string"),
"path": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=48_000),
+ "audio": datasets.Audio(sampling_rate=48_000),
"sentence": datasets.Value("string"),
"up_votes": datasets.Value("int64"),
"down_votes": datasets.Value("int64"),
2 changes: 1 addition & 1 deletion datasets/covost2/covost2.py

@@ -96,7 +96,7 @@ def _info(self):
features=datasets.Features(
client_id=datasets.Value("string"),
file=datasets.Value("string"),
- audio=datasets.features.Audio(sampling_rate=16_000),
+ audio=datasets.Audio(sampling_rate=16_000),
sentence=datasets.Value("string"),
translation=datasets.Value("string"),
id=datasets.Value("string"),
2 changes: 1 addition & 1 deletion datasets/librispeech_asr/librispeech_asr.py

@@ -102,7 +102,7 @@ def _info(self):
features=datasets.Features(
{
"file": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=16_000),
+ "audio": datasets.Audio(sampling_rate=16_000),
"text": datasets.Value("string"),
"speaker_id": datasets.Value("int64"),
"chapter_id": datasets.Value("int64"),
2 changes: 1 addition & 1 deletion datasets/lj_speech/lj_speech.py

@@ -74,7 +74,7 @@ def _info(self):
features=datasets.Features(
{
"id": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=22050),
+ "audio": datasets.Audio(sampling_rate=22050),
"file": datasets.Value("string"),
"text": datasets.Value("string"),
"normalized_text": datasets.Value("string"),
2 changes: 1 addition & 1 deletion datasets/openslr/openslr.py

@@ -538,7 +538,7 @@ def _info(self):
features = datasets.Features(
{
"path": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=48_000),
+ "audio": datasets.Audio(sampling_rate=48_000),
"sentence": datasets.Value("string"),
}
)
12 changes: 6 additions & 6 deletions datasets/superb/superb.py

@@ -137,7 +137,7 @@ class Superb(datasets.GeneratorBasedBuilder):
features=datasets.Features(
{
"file": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=16_000),
+ "audio": datasets.Audio(sampling_rate=16_000),
"text": datasets.Value("string"),
"speaker_id": datasets.Value("int64"),
"chapter_id": datasets.Value("int64"),
@@ -162,7 +162,7 @@ class Superb(datasets.GeneratorBasedBuilder):
features=datasets.Features(
{
"file": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=16_000),
+ "audio": datasets.Audio(sampling_rate=16_000),
"label": datasets.ClassLabel(
names=[
"yes",
@@ -196,7 +196,7 @@ class Superb(datasets.GeneratorBasedBuilder):
features=datasets.Features(
{
"file": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=16_000),
+ "audio": datasets.Audio(sampling_rate=16_000),
"speaker_id": datasets.Value("string"),
"text": datasets.Value("string"),
"action": datasets.ClassLabel(
@@ -238,7 +238,7 @@ class Superb(datasets.GeneratorBasedBuilder):
features=datasets.Features(
{
"file": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=16_000),
+ "audio": datasets.Audio(sampling_rate=16_000),
# VoxCeleb1 contains 1251 speaker IDs in range ["id10001",..."id11251"]
"label": datasets.ClassLabel(names=[f"id{i + 10001}" for i in range(1251)]),
}
@@ -261,7 +261,7 @@ class Superb(datasets.GeneratorBasedBuilder):
{
"record_id": datasets.Value("string"),
"file": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=16_000),
+ "audio": datasets.Audio(sampling_rate=16_000),
"start": datasets.Value("int64"),
"end": datasets.Value("int64"),
"speakers": [
@@ -289,7 +289,7 @@ class Superb(datasets.GeneratorBasedBuilder):
features=datasets.Features(
{
"file": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=16_000),
+ "audio": datasets.Audio(sampling_rate=16_000),
"label": datasets.ClassLabel(names=["neu", "hap", "ang", "sad"]),
}
),
2 changes: 1 addition & 1 deletion datasets/timit_asr/timit_asr.py

@@ -77,7 +77,7 @@ def _info(self):
features=datasets.Features(
{
"file": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=16_000),
+ "audio": datasets.Audio(sampling_rate=16_000),
"text": datasets.Value("string"),
"phonetic_detail": datasets.Sequence(
{
2 changes: 1 addition & 1 deletion datasets/vivos/vivos.py

@@ -67,7 +67,7 @@ def _info(self):
{
"speaker_id": datasets.Value("string"),
"path": datasets.Value("string"),
- "audio": datasets.features.Audio(sampling_rate=16_000),
+ "audio": datasets.Audio(sampling_rate=16_000),
"sentence": datasets.Value("string"),
}
),
3 changes: 3 additions & 0 deletions docs/source/package_reference/main_classes.rst

@@ -125,6 +125,9 @@ Dictionary with split names as keys ('train', 'test' for example), and :obj:`dat
.. autoclass:: datasets.Audio
:members:

+ .. autoclass:: datasets.Image
+ :members:
+
``MetricInfo``
~~~~~~~~~~~~~~~~~~~~~
10 changes: 7 additions & 3 deletions setup.py

@@ -62,10 +62,7 @@
Push the commit to remote: "git push origin master"
"""

- import datetime
- import itertools
import os
- import sys

from setuptools import find_packages, setup

@@ -108,6 +105,10 @@
"librosa",
]

+ VISION_REQURE = [
+     "Pillow>=6.2.1",
+ ]
+
BENCHMARKS_REQUIRE = [
"numpy==1.18.5",
"tensorflow==2.3.0",
@@ -167,6 +168,8 @@
"importlib_resources;python_version<'3.7'",
]

+ TESTS_REQUIRE.extend(VISION_REQURE)
+
if os.name != "nt":
# dependencies of unbabel-comet
# only test if not on windows since there're issues installing fairseq on windows
@@ -185,6 +188,7 @@

EXTRAS_REQUIRE = {
"audio": AUDIO_REQUIRE,
+ "vision": VISION_REQURE,
"apache-beam": ["apache-beam>=2.26.0"],
"tensorflow": ["tensorflow>=2.2.0,!=2.6.0,!=2.6.1"],
"tensorflow_gpu": ["tensorflow-gpu>=2.2.0,!=2.6.0,!=2.6.1"],
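With Pillow wired into both TESTS_REQUIRE and EXTRAS_REQUIRE, users would opt into the image dependencies via pip's extras syntax. A sketch, assuming the extra ships under the "vision" name defined in the diff above:

```shell
# Install datasets together with the new vision extra
# (pulls in Pillow>=6.2.1 per the setup.py hunk above).
pip install "datasets[vision]"
```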
1 change: 1 addition & 0 deletions src/datasets/__init__.py

@@ -45,6 +45,7 @@
Audio,
ClassLabel,
Features,
+ Image,
Sequence,
Translation,
TranslationVariableLanguages,
6 changes: 3 additions & 3 deletions src/datasets/arrow_dataset.py

@@ -46,7 +46,7 @@
from . import config, utils
from .arrow_reader import ArrowReader
from .arrow_writer import ArrowWriter, OptimizedTypedSequence
- from .features import ClassLabel, Features, Sequence, Value, _ArrayXD
+ from .features import ClassLabel, Features, Sequence, Value, _ArrayXD, pandas_types_mapper
from .filesystems import extract_path_from_uri, is_remote_filesystem
from .fingerprint import (
fingerprint_transform,
@@ -3280,15 +3280,15 @@ def to_pandas(
table=self._data,
key=slice(0, len(self)),
indices=self._indices if self._indices is not None else None,
- ).to_pandas()
+ ).to_pandas(types_mapper=pandas_types_mapper)
else:
batch_size = batch_size if batch_size else config.DEFAULT_MAX_BATCH_SIZE
return (
query_table(
table=self._data,
key=slice(offset, offset + batch_size),
indices=self._indices if self._indices is not None else None,
- ).to_pandas()
+ ).to_pandas(types_mapper=pandas_types_mapper)
for offset in range(0, len(self), batch_size)
)
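The `types_mapper` hook that `to_pandas` now forwards follows a simple contract: pyarrow consults the mapper once per column type, and a `None` return falls back to the default conversion while a pandas extension dtype overrides it. A standalone sketch of that contract — the `ImageDtype` class and the `is_image` attribute are illustrative stand-ins, not the PR's actual `PandasImageExtensionDtype` / `ImageExtensionType`:

```python
class ImageDtype:
    """Stand-in for a pandas ExtensionDtype (hypothetical)."""


def types_mapper(pa_type):
    # In the PR this is isinstance(pa_type, ImageExtensionType); here we
    # key off a made-up marker attribute to keep the sketch dependency-free.
    if getattr(pa_type, "is_image", False):
        return ImageDtype()
    return None  # None -> pyarrow uses its default Arrow-to-pandas conversion
```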
17 changes: 15 additions & 2 deletions src/datasets/arrow_writer.py

@@ -17,6 +17,7 @@
import json
import os
import socket
+ import sys
from dataclasses import asdict
from typing import Any, Dict, List, Optional, Tuple, Union

@@ -26,10 +27,12 @@
from . import config, utils
from .features import (
Features,
+ ImageExtensionType,
_ArrayXDExtensionType,
cast_to_python_objects,
list_of_np_array_to_pyarrow_listarray,
numpy_to_pyarrow_listarray,
+ objects_to_list_of_image_dicts,
)
from .info import DatasetInfo
from .keyhash import DuplicatedKeysError, KeyHasher
@@ -49,7 +52,7 @@ class TypedSequence:
More specifically it adds several features:
- Support extension types like ``datasets.features.Array2DExtensionType``:
By default pyarrow arrays don't return extension arrays. One has to call
- ``pa.ExtensionArray.from_storage(type, pa.array(data, type.storage_type_name))``
+ ``pa.ExtensionArray.from_storage(type, pa.array(data, type.storage_type))``
in order to get an extension array.
- Support for ``try_type`` parameter that can be used instead of ``type``:
When an array is transformed, we like to keep the same type as before if possible.
@@ -93,6 +96,10 @@ def __init__(self, data, type=None, try_type=None, optimized_int_type=None):

def __arrow_array__(self, type=None):
"""This function is called when calling pa.array(typed_sequence)"""
+
+ if config.PIL_AVAILABLE and "PIL" in sys.modules:
+ import PIL.Image
+
if type is not None:
raise ValueError("TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)")
trying_type = False
@@ -104,6 +111,9 @@ def __arrow_array__(self, type=None):
else:
type = self.type
trying_int_optimization = False
+ if type is None:  # automatic type inference for custom objects
+ if config.PIL_AVAILABLE and "PIL" in sys.modules and isinstance(self.data[0], PIL.Image.Image):
+ type = ImageExtensionType()
try:
if isinstance(type, _ArrayXDExtensionType):
if isinstance(self.data, np.ndarray):
@@ -113,6 +123,9 @@ def __arrow_array__(self, type=None):
else:
storage = pa.array(self.data, type.storage_dtype)
out = pa.ExtensionArray.from_storage(type, storage)
+ elif isinstance(type, ImageExtensionType):
+ storage = pa.array(objects_to_list_of_image_dicts(self.data), type=type.storage_type)
+ out = pa.ExtensionArray.from_storage(type, storage)
elif isinstance(self.data, np.ndarray):
out = numpy_to_pyarrow_listarray(self.data)
if type is not None:
@@ -123,7 +136,7 @@
out = out.cast(type)
else:
out = pa.array(cast_to_python_objects(self.data, only_1d_for_numpy=True), type=type)
- if trying_type:
+ if trying_type and not isinstance(type, ImageExtensionType):
is_equal = (
np.array_equal(np.array(out[0].as_py()), self.data[0])
if isinstance(self.data[0], np.ndarray)
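Before handing image data to `pa.ExtensionArray.from_storage`, the writer normalizes each input into the uniform struct the extension's `storage_type` expects. A hypothetical pure-python sketch of what `objects_to_list_of_image_dicts` could look like for the path/dict cases — the names and the exact field handling are assumptions, not the PR's implementation:

```python
def to_image_dict(obj):
    """Normalize one image input into a {"path", "bytes"} storage dict (sketch)."""
    if isinstance(obj, str):  # a path to an image file on disk
        return {"path": obj, "bytes": None}
    if isinstance(obj, dict):  # already in storage form
        return {"path": obj.get("path"), "bytes": obj.get("bytes")}
    raise TypeError(f"unsupported image object of type {type(obj).__name__}")


def objects_to_image_dicts(objs):
    # The resulting list of uniform dicts can be fed to
    # pa.array(..., type=extension_type.storage_type).
    return [to_image_dict(obj) for obj in objs]
```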
4 changes: 4 additions & 0 deletions src/datasets/config.py

@@ -123,6 +123,10 @@
logger.info("Disabling Apache Beam because USE_BEAM is set to False")

+ # Optional tools for feature decoding
+ PIL_AVAILABLE = importlib.util.find_spec("PIL") is not None

# Optional compression tools
RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
ZSTANDARD_AVAILABLE = importlib.util.find_spec("zstandard") is not None
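The `PIL_AVAILABLE` flag uses the standard pattern for detecting an optional dependency without importing it: `importlib.util.find_spec` returns `None` when the package is absent. A minimal self-contained sketch of the same pattern (the helper name is illustrative):

```python
import importlib.util


def is_available(package: str) -> bool:
    """True if `package` is importable, without actually importing it."""
    return importlib.util.find_spec(package) is not None


# Mirrors the config.py flag above.
PIL_AVAILABLE = is_available("PIL")
```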
1 change: 1 addition & 0 deletions src/datasets/features/__init__.py

@@ -8,4 +8,5 @@
_cast_to_python_objects,
_is_zero_copy_only,
)
+ from .image import Image, ImageExtensionType, objects_to_list_of_image_dicts
from .translation import Translation, TranslationVariableLanguages
22 changes: 19 additions & 3 deletions src/datasets/features/features.py

@@ -37,6 +37,7 @@

from datasets import config, utils
from datasets.features.audio import Audio
+ from datasets.features.image import Image, ImageExtensionType, PandasImageExtensionDtype
from datasets.features.translation import Translation, TranslationVariableLanguages
from datasets.utils.logging import get_logger

@@ -175,6 +176,9 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool) -> Tuple[Any, boo
if config.JAX_AVAILABLE and "jax" in sys.modules:
import jax.numpy as jnp

+ if config.PIL_AVAILABLE and "PIL" in sys.modules:
+ import PIL.Image
+
if isinstance(obj, np.ndarray):
if not only_1d_for_numpy or obj.ndim == 1:
return obj, False
@@ -197,6 +201,11 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool) -> Tuple[Any, boo
return np.asarray(obj), True
else:
return [_cast_to_python_objects(x, only_1d_for_numpy=only_1d_for_numpy)[0] for x in np.asarray(obj)], True
+ elif config.PIL_AVAILABLE and "PIL" in sys.modules and isinstance(obj, PIL.Image.Image):
+ if not only_1d_for_numpy:
+ return obj, False
+ else:
+ return [_cast_to_python_objects(x, only_1d_for_numpy=only_1d_for_numpy)[0] for x in np.array(obj)], True
elif isinstance(obj, pd.Series):
return obj.values.tolist(), True
elif isinstance(obj, pd.DataFrame):
@@ -471,7 +480,7 @@ class PandasArrayExtensionDtype(PandasExtensionDtype):
def __init__(self, value_type: Union["PandasArrayExtensionDtype", np.dtype]):
self._value_type = value_type

- def __from_arrow__(self, array):
+ def __from_arrow__(self, array: Union[pa.Array, pa.ChunkedArray]):
if array.type.shape[0] is None:
raise NotImplementedError(
"Dynamic first dimension is not supported for "
@@ -567,7 +576,7 @@ def __getitem__(self, item: Union[int, slice, np.ndarray]) -> Union[np.ndarray,
def take(
self, indices: Sequence_[int], allow_fill: bool = False, fill_value: bool = None
) -> "PandasArrayExtensionArray":
- indices: np.ndarray = np.asarray(indices, dtype="int")
+ indices: np.ndarray = np.asarray(indices, dtype=np.int)
if allow_fill:
fill_value = (
self.dtype.na_value if fill_value is None else np.asarray(fill_value, dtype=self.dtype.value_type)
@@ -599,6 +608,8 @@ def __eq__(self, other) -> np.ndarray:
def pandas_types_mapper(dtype):
if isinstance(dtype, _ArrayXDExtensionType):
return PandasArrayExtensionDtype(dtype.value_type)
+ elif isinstance(dtype, ImageExtensionType):
+ return PandasImageExtensionDtype()

@dataclass

@@ -759,6 +770,7 @@ class Sequence:
Array4D,
Array5D,
Audio,
+ Image,
]

@@ -849,7 +861,7 @@ def encode_nested_example(schema, obj):
return list(obj)
# Object with special encoding:
# ClassLabel will convert from string to int, TranslationVariableLanguages does some checks
- elif isinstance(schema, (Audio, ClassLabel, TranslationVariableLanguages, Value, _ArrayXD)):
+ elif isinstance(schema, (Audio, Image, ClassLabel, TranslationVariableLanguages, Value, _ArrayXD)):
return schema.encode_example(obj)
# Other object should be directly convertible to a native Arrow type (like Translation and Translation)
return obj
@@ -903,6 +915,8 @@ def generate_from_arrow_type(pa_type: pa.DataType) -> FeatureType:
elif isinstance(pa_type, _ArrayXDExtensionType):
array_feature = [None, None, Array2D, Array3D, Array4D, Array5D][pa_type.ndims]
return array_feature(shape=pa_type.shape, dtype=pa_type.value_type)
+ elif isinstance(pa_type, ImageExtensionType):
+ return Image()
elif isinstance(pa_type, pa.DictionaryType):
raise NotImplementedError # TODO(thom) this will need access to the dictionary as well (for labels). I.e. to the py_table
elif isinstance(pa_type, pa.DataType):
@@ -963,6 +977,8 @@ class Features(dict):
- a :class:`Array2D`, :class:`Array3D`, :class:`Array4D` or :class:`Array5D` feature for multidimensional arrays
- an :class:`Audio` feature to store the absolute path to an audio file or a dictionary with the relative path
to an audio file ("path" key) and its bytes content ("bytes" key). This feature extracts the audio data.
+ - an :class:`Image` feature to store the absolute path to an image file, an :obj:`np.ndarray` object, a :obj:`PIL.Image.Image` object
+ or a dictionary with the relative path to an image file ("path" key) and its bytes content ("bytes" key). This feature extracts the image data.
- :class:`datasets.Translation` and :class:`datasets.TranslationVariableLanguages`, the two features specific to Machine Translation
"""
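Per the Features docstring above, the Image feature accepts several input forms: a path string, an encoded `{"path", "bytes"}` dict, or an in-memory object (`PIL.Image.Image` / `np.ndarray` in the real feature). A dependency-free sketch of that dispatch — the classifier name and return labels are illustrative, not part of the PR:

```python
def classify_image_input(value):
    """Label which of the Image feature's accepted input forms `value` is (sketch)."""
    if isinstance(value, str):
        return "path"  # absolute path to an image file
    if isinstance(value, dict) and {"path", "bytes"} <= set(value):
        return "encoded"  # already in storage form
    return "object"  # e.g. PIL.Image.Image or np.ndarray in the real feature
```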