Migrate dask-cudf CudfEngine to leverage ArrowDatasetEngine #8871

Merged: 22 commits, Aug 13, 2021
Changes from 19 commits
Commits
22 commits
52a643c
save possible changes to enable multi-file parquet
rjzamora Apr 21, 2021
3b3dadb
Merge remote-tracking branch 'upstream/branch-21.06' into multi-file-…
rjzamora May 27, 2021
54644e1
align with latest upstream PR
rjzamora May 27, 2021
c5e660f
trigger format check
rjzamora May 27, 2021
e8cbc26
save some possible work aimed at using ParquetDatasetEngine
rjzamora Jul 6, 2021
2ab6838
Merge remote-tracking branch 'upstream/branch-21.08' into multi-file-…
rjzamora Jul 15, 2021
3079fff
Merge remote-tracking branch 'origin/multi-file-parquet' into migrate…
rjzamora Jul 16, 2021
d67f711
add test coverage
rjzamora Jul 16, 2021
a175daf
trigger format
rjzamora Jul 16, 2021
14038e9
Merge branch 'multi-file-parquet' into migrate-parquet-backend
rjzamora Jul 16, 2021
2407b89
Merge remote-tracking branch 'upstream/branch-21.08' into migrate-par…
rjzamora Jul 19, 2021
648c3ce
remove commented code
rjzamora Jul 27, 2021
494d585
remove commented code
rjzamora Jul 27, 2021
2ccc046
Merge branch 'branch-21.10' into migrate-parquet-backend
rjzamora Jul 27, 2021
07ed3c9
trigger formatting
rjzamora Jul 27, 2021
5a918d8
try reformatting
rjzamora Jul 29, 2021
f6a4ad6
Merge remote-tracking branch 'upstream/branch-21.10' into migrate-par…
rjzamora Jul 29, 2021
1473f36
Merge remote-tracking branch 'upstream/branch-21.10' into migrate-par…
rjzamora Jul 30, 2021
3602394
test tweak
rjzamora Jul 30, 2021
1db936a
Update python/dask_cudf/dask_cudf/io/tests/test_parquet.py
rjzamora Aug 10, 2021
675c314
build cat column in a more efficient way
rjzamora Aug 10, 2021
d672ab5
make cat column creation more efficient and fix schema-mismatch test
rjzamora Aug 10, 2021
38 changes: 21 additions & 17 deletions python/dask_cudf/dask_cudf/io/parquet.py
@@ -7,7 +7,7 @@
from pyarrow import parquet as pq

from dask import dataframe as dd
from dask.dataframe.io.parquet.arrow import ArrowEngine
from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine

try:
from dask.dataframe.io.parquet import (
@@ -17,14 +17,16 @@
create_metadata_file_dd = None

import cudf
from cudf.core.column import as_column, build_categorical_column
from cudf.core.column import as_column
from cudf.io import write_to_dataset


class CudfEngine(ArrowEngine):
class CudfEngine(ArrowDatasetEngine):
@staticmethod
def read_metadata(*args, **kwargs):
meta, stats, parts, index = ArrowEngine.read_metadata(*args, **kwargs)
meta, stats, parts, index = ArrowDatasetEngine.read_metadata(
*args, **kwargs
)

# If `strings_to_categorical==True`, convert objects to int32
strings_to_cats = kwargs.get("strings_to_categorical", False)
@@ -59,7 +61,6 @@ def read_partition(
pieces = [pieces]

strings_to_cats = kwargs.get("strings_to_categorical", False)

if len(pieces) > 1:

paths = []
@@ -72,6 +73,9 @@
rgs.append(None)
else:
(path, row_group, partition_keys) = piece

row_group = None if row_group == [None] else row_group

paths.append(path)
rgs.append(
[row_group]
@@ -96,6 +100,7 @@
partition_keys = []
else:
(path, row_group, partition_keys) = pieces[0]
row_group = None if row_group == [None] else row_group

if cudf.utils.ioutils._is_local_filesystem(fs):
df = cudf.read_parquet(
@@ -127,18 +132,17 @@
if partition_keys:
if partitions is None:
raise ValueError("Must pass partition sets")

for i, (name, index2) in enumerate(partition_keys):
categories = [
val.as_py() for val in partitions.levels[i].dictionary
]

categories = partitions[i].keys

col = as_column(index2).as_frame().repeat(len(df))._data[None]
df[name] = build_categorical_column(
categories=categories,
codes=as_column(col.base_data, dtype=col.dtype),
size=col.size,
offset=col.offset,
ordered=False,

Inline review comment (Member):

A lot nicer with supported categorical column creation!

Inline review comment (Member Author):

I just realized that the new change here was a bit inefficient. We were creating a column with the partition-based category repeated in every element, and then converting it to a categorical column. It makes more sense to repeat the index of the partition-based category in every element and build the categorical column directly.

df[name] = col.as_categorical_column(
cudf.CategoricalDtype(
categories=categories, ordered=False,
)
)

return df
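The later commits in this PR ("build cat column in a more efficient way") appear to follow up on that comment. As a rough illustration of the trade-off it describes — a minimal sketch using pandas analogues, not the cudf internals (`as_column`/`as_categorical_column`) that the hunk above actually touches; the category values, code, and sizes are made up:

```python
import pandas as pd

n = 1_000_000
categories = ["part=a", "part=b"]  # hypothetical partition values
code = 1                           # index of the partition this piece belongs to

# Pattern the comment calls inefficient: repeat the category *value* in every
# row, then re-encode the whole column as a categorical.
repeated_values = pd.Series([categories[code]] * n).astype(
    pd.CategoricalDtype(categories=categories, ordered=False)
)

# Pattern the comment prefers: repeat the small integer *code* and build the
# categorical column from codes directly, skipping the re-encoding pass.
repeated_codes = pd.Series(
    pd.Categorical.from_codes([code] * n, categories=categories, ordered=False)
)

# Both constructions yield the same categorical column.
pd.testing.assert_series_equal(repeated_values, repeated_codes)
```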
@@ -243,9 +247,9 @@ def read_parquet(
""" Read parquet files into a Dask DataFrame

Calls ``dask.dataframe.read_parquet`` to coordinate the execution of
``cudf.read_parquet``, and ultimately read multiple partitions into a
single Dask dataframe. The Dask version must supply an ``ArrowEngine``
class to support full functionality.
``cudf.read_parquet``, and ultimately read multiple partitions into
a single Dask dataframe. The Dask version must supply an
``ArrowDatasetEngine`` class to support full functionality.
See ``cudf.read_parquet`` and Dask documentation for further details.

Examples
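The docstring's Examples section is truncated in this view; a minimal usage sketch of the wrapper follows (the dataset path is hypothetical, not from the PR):

```python
import dask_cudf

# Read a directory of parquet files into a dask_cudf DataFrame. Each partition
# is loaded on the GPU via cudf.read_parquet; dask.dataframe.read_parquet
# coordinates the work using the engine defined above.
ddf = dask_cudf.read_parquet(
    "dataset/",              # hypothetical path to a parquet dataset
    gather_statistics=True,  # collect row-group statistics, as in the test below
)

print(ddf.npartitions)
df = ddf.compute()           # materialize the result as a single cudf DataFrame
```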
19 changes: 11 additions & 8 deletions python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -455,17 +455,20 @@ def test_create_metadata_file_inconsistent_schema(tmpdir):
p1 = os.path.join(tmpdir, "part.1.parquet")
df1.to_parquet(p1, engine="pyarrow")

with pytest.raises(RuntimeError):
# Pyarrow will fail to aggregate metadata
# if gather_statistics=True
dask_cudf.read_parquet(str(tmpdir), gather_statistics=True,).compute()
# New pyarrow-dataset base can handle an inconsistent
# schema even without a _metadata file
ddf1 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True)

# Add global metadata file.
# Dask-CuDF can do this without requiring schema
# consistency. Once the _metadata file is available,
# parsing metadata should no longer be a problem
# consistency.
dask_cudf.io.parquet.create_metadata_file([p0, p1])

# Check that we can now read the ddf
# Check that we can still read the ddf
# with the _metadata file present
dask_cudf.read_parquet(str(tmpdir), gather_statistics=True,).compute()
ddf2 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True)

# Check that the result is the same with
# and without the _metadata file
dd.assert_eq(ddf1, ddf2, check_dtypes=False)
dd.assert_eq(ddf1.compute(), ddf2.compute())
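For context, a minimal sketch of the scenario this test exercises, outside of pytest (the paths and frames below are illustrative assumptions, not the test's actual fixtures):

```python
import os

import pandas as pd
import dask.dataframe as dd
import dask_cudf

os.makedirs("data", exist_ok=True)

# Two parquet files whose schemas disagree: column "a" is all-null in one
# file and integer-valued in the other (hypothetical data).
pd.DataFrame({"a": [None, None], "b": [1, 2]}).to_parquet(
    "data/part.0.parquet", engine="pyarrow"
)
pd.DataFrame({"a": [3, 4], "b": [5, 6]}).to_parquet(
    "data/part.1.parquet", engine="pyarrow"
)

# The pyarrow-dataset-based engine can read this even without a _metadata file.
ddf1 = dask_cudf.read_parquet("data/", gather_statistics=True)

# Aggregate the per-file footers into a global data/_metadata file.
dask_cudf.io.parquet.create_metadata_file(
    ["data/part.0.parquet", "data/part.1.parquet"]
)

# Reading again with the _metadata file present should give the same result.
ddf2 = dask_cudf.read_parquet("data/", gather_statistics=True)
dd.assert_eq(ddf1, ddf2, check_dtypes=False)
```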