From a9c112def382b84295cdf9f25337ce9c680bdccc Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 16 Nov 2020 16:57:24 -0600 Subject: [PATCH] Add in basic support to read structs from parquet Signed-off-by: Robert (Bobby) Evans --- integration_tests/src/main/python/parquet_test.py | 4 +++- .../nvidia/spark/rapids/shims/spark300/Spark300Shims.scala | 1 + .../spark/rapids/shims/spark300db/Spark300dbShims.scala | 1 + .../spark/rapids/shims/spark301db/Spark301dbShims.scala | 1 + .../nvidia/spark/rapids/shims/spark310/Spark310Shims.scala | 1 + .../src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala | 1 + .../main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala | 1 + 7 files changed, 9 insertions(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index b33aa910a31..a787a173f8f 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -32,7 +32,9 @@ def read_parquet_sql(data_path): TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc)), ArrayGen(byte_gen), ArrayGen(long_gen), ArrayGen(string_gen), ArrayGen(date_gen), ArrayGen(TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc))), - ArrayGen(ArrayGen(byte_gen))], + ArrayGen(ArrayGen(byte_gen)), + StructGen([['child0', ArrayGen(byte_gen)], ['child1', byte_gen], ['child2', float_gen]]), + ArrayGen(StructGen([['child0', string_gen], ['child1', double_gen], ['child2', int_gen]]))], pytest.param([timestamp_gen], marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/132'))] # test with original parquet file reader, the multi-file parallel reader for cloud, and coalesce file reader for diff --git a/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/Spark300Shims.scala b/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/Spark300Shims.scala index 0fd4f9966c2..fe99a4bdc20 100644 --- a/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/Spark300Shims.scala +++ b/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/Spark300Shims.scala @@ -138,6 +138,7 @@ class Spark300Shims extends SparkShims { GpuOverrides.isSupportedType(t, allowArray = true, allowStringMaps = true, + allowStruct = true, allowNesting = true) // partition filters and data filters are not run on the GPU diff --git a/shims/spark300db/src/main/scala/com/nvidia/spark/rapids/shims/spark300db/Spark300dbShims.scala b/shims/spark300db/src/main/scala/com/nvidia/spark/rapids/shims/spark300db/Spark300dbShims.scala index 1b3e3ff282f..c38075067ca 100644 --- a/shims/spark300db/src/main/scala/com/nvidia/spark/rapids/shims/spark300db/Spark300dbShims.scala +++ b/shims/spark300db/src/main/scala/com/nvidia/spark/rapids/shims/spark300db/Spark300dbShims.scala @@ -94,6 +94,7 @@ class Spark300dbShims extends Spark300Shims { GpuOverrides.isSupportedType(t, allowArray = true, allowStringMaps = true, + allowStruct = true, allowNesting = true) // partition filters and data filters are not run on the GPU diff --git a/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/Spark301dbShims.scala b/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/Spark301dbShims.scala index 852f48b320f..f84daf29ae1 100644 --- a/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/Spark301dbShims.scala +++ b/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/Spark301dbShims.scala @@ -86,6 +86,7 @@ class Spark301dbShims extends Spark301Shims { GpuOverrides.isSupportedType(t, allowArray = true, allowStringMaps = true, + allowStruct = true, allowNesting = true) // partition filters and data filters are not run on the GPU diff --git a/shims/spark310/src/main/scala/com/nvidia/spark/rapids/shims/spark310/Spark310Shims.scala b/shims/spark310/src/main/scala/com/nvidia/spark/rapids/shims/spark310/Spark310Shims.scala index 2939ff18534..5750ea1da57 100644 --- a/shims/spark310/src/main/scala/com/nvidia/spark/rapids/shims/spark310/Spark310Shims.scala +++ b/shims/spark310/src/main/scala/com/nvidia/spark/rapids/shims/spark310/Spark310Shims.scala @@ -140,6 +140,7 @@ class Spark310Shims extends Spark301Shims { GpuOverrides.isSupportedType(t, allowArray = true, allowStringMaps = true, + allowStruct = true, allowNesting = true) // partition filters and data filters are not run on the GPU diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 0b09f9e8652..c676eb1efa1 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -1857,6 +1857,7 @@ object GpuOverrides { GpuOverrides.isSupportedType(t, allowStringMaps = true, allowArray = true, + allowStruct = true, allowNesting = true) override def convertToGpu(): GpuExec = diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala index 38bb457eea5..ff6a46d2f07 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala @@ -138,6 +138,7 @@ object GpuParquetScanBase { field.dataType, allowStringMaps = true, allowArray = true, + allowStruct = true, allowNesting = true)) { meta.willNotWorkOnGpu(s"GpuParquetScan does not support fields of type ${field.dataType}") }