NVIDIA · razajafri · Aug 18, 2023 · Aug 11, 2023 · Aug 11, 2023 · Aug 16, 2023
diff --git a/dist/unshimmed-common-from-spark311.txt b/dist/unshimmed-common-from-spark311.txt
@@ -28,6 +28,7 @@ com/nvidia/spark/rapids/SparkShimVersion*
 com/nvidia/spark/rapids/SparkShims*
 com/nvidia/spark/rapids/optimizer/SQLOptimizerPlugin*
 com/nvidia/spark/udf/Plugin*
+org/apache/spark/sql/rapids/AdaptiveSparkPlanHelperShim*
 org/apache/spark/sql/rapids/ExecutionPlanCaptureCallback*
 org/apache/spark/sql/rapids/ProxyRapidsShuffleInternalManagerBase*
 org/apache/spark/sql/rapids/execution/Unshimmed*

diff --git a/integration_tests/src/main/python/prune_partition_column_test.py b/integration_tests/src/main/python/prune_partition_column_test.py
@@ -15,10 +15,12 @@
 import os
 import pytest
 
-from asserts import assert_gpu_and_cpu_are_equal_collect
+from asserts import assert_gpu_and_cpu_are_equal_collect, run_with_cpu_and_gpu, assert_equal
 from data_gen import *
 from marks import *
-from spark_session import with_cpu_session
+from pyspark.sql.types import IntegerType
+from spark_session import with_cpu_session, is_before_spark_320
+from conftest import spark_jvm
 
 # Several values to avoid generating too many folders for partitions.
 part1_gen = SetValuesGen(IntegerType(), [-10, -1, 0, 1, 10])
@@ -127,3 +129,80 @@ def test_prune_partition_column_when_filter_fallback_project(spark_tmp_path, pru
                                                              filter_col, file_format):
     do_prune_partition_column_when_filter_project(spark_tmp_path, prune_part_enabled, file_format,
                                                   filter_col, gpu_project_enabled=False)
+
+# Writes two contact datasets (a full one and a brief one) as `format` files under data_path/table_name/p=1 and p=2,
+# then runs func(read_schema) on both CPU and GPU and asserts that the results and the file scan schemata match.
+def create_contacts_table_and_read(is_partitioned, format, data_path, expected_schemata, func, conf, table_name):
+    full_name_type = StructGen([('first', StringGen()), ('middle', StringGen()), ('last', StringGen())])
+    name_type = StructGen([('first', StringGen()), ('last', StringGen())])
+    contacts_data_gen = StructGen([
+        ('id', IntegerGen()),
+        ('name', full_name_type),
+        ('address', StringGen()),
+        ('friends', ArrayGen(full_name_type, max_length=10, nullable=False))], nullable=False)
+
+    brief_contacts_data_gen = StructGen([
+        ('id', IntegerGen()),
+        ('name', name_type),
+        ('address', StringGen())], nullable=False)
+
+    # When is_partitioned, 'p' is also written as a data column although the output directory is already named p=<partition>, so the field is added twice just like in the Spark tests
+    # https://github.com/apache/spark/blob/85e252e8503534009f4fb5ea005d44c9eda31447/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala#L193
+    def contact_gen_df(spark, data_gen, partition):
+        gen = gen_df(spark, data_gen)
+        if is_partitioned:
+            return gen.withColumn('p', f.lit(partition))
+        else:
+            return gen
+
+    with_cpu_session(lambda spark: contact_gen_df(spark, contacts_data_gen, 1).write.format(format).save(data_path + f"/{table_name}/p=1"))
+    with_cpu_session(lambda spark: contact_gen_df(spark, brief_contacts_data_gen, 2).write.format(format).save(data_path + f"/{table_name}/p=2"))
+
+    # Schema to read in: the full contacts schema, plus a nullable integer 'p' column when is_partitioned.
+    read_schema = contacts_data_gen.data_type.add("p", IntegerType(), True) if is_partitioned else contacts_data_gen.data_type
+
+    (from_cpu, cpu_df), (from_gpu, gpu_df) = run_with_cpu_and_gpu(
+        func(read_schema),
+        'COLLECT_WITH_DATAFRAME',
+        conf=conf)
+
+    jvm = spark_jvm()
+    jvm.org.apache.spark.sql.rapids.ExecutionPlanCaptureCallback.assertSchemataMatch(cpu_df._jdf, gpu_df._jdf, expected_schemata)
+    assert_equal(from_cpu, from_gpu)
+
+# https://github.com/NVIDIA/spark-rapids/issues/8712
+# https://github.com/NVIDIA/spark-rapids/issues/8713
+# https://github.com/NVIDIA/spark-rapids/issues/8714
+@pytest.mark.parametrize('query,expected_schemata', [("select friends.middle, friends from {} where p=1", "struct<friends:array<struct<first:string,middle:string,last:string>>>"),
+                                                     pytest.param("select name.middle, address from {} where p=2", "struct<name:struct<middle:string>,address:string>", marks=pytest.mark.skip(reason='https://github.com/NVIDIA/spark-rapids/issues/8788')),
+                                                     ("select name.first from {} where name.first = 'Jane'", "struct<name:struct<first:string>>")])
+@pytest.mark.parametrize('is_partitioned', [True, False])
+@pytest.mark.parametrize('format', ["parquet", "orc"])
+def test_select_complex_field(format, spark_tmp_path, query, expected_schemata, is_partitioned, spark_tmp_table_factory):
+    table_name = spark_tmp_table_factory.get()
+    data_path = spark_tmp_path + "/DATA"
+    def read_temp_view(schema):  # returns a function that, given a session, registers the view and runs the query
+        def do_it(spark):
+            spark.read.format(format).schema(schema).load(data_path + f"/{table_name}").createOrReplaceTempView(table_name)
+            return spark.sql(query.format(table_name))  # the '{}' placeholder in the query is the view name
+        return do_it
+    conf={"spark.sql.parquet.enableVectorizedReader": "true"}
+    create_contacts_table_and_read(is_partitioned, format, data_path, expected_schemata, read_temp_view, conf, table_name)
+
+# https://github.com/NVIDIA/spark-rapids/issues/8715
+@pytest.mark.parametrize('query, expected_schemata', [("friend.First", "struct<friends:array<struct<first:string>>>"),
+                                                          ("friend.MIDDLE", "struct<friends:array<struct<middle:string>>>")])
+@pytest.mark.skipif(is_before_spark_320(), reason='https://issues.apache.org/jira/browse/SPARK-34638')
+@pytest.mark.parametrize('is_partitioned', [True, False])
+@pytest.mark.parametrize('format', ["parquet", "orc"])
+def test_nested_column_prune_on_generator_output(format, spark_tmp_path, query, expected_schemata, is_partitioned, spark_tmp_table_factory):
+    table_name = spark_tmp_table_factory.get()
+    data_path = spark_tmp_path + "/DATA"
+    def read_temp_view(schema):  # returns a function that, given a session, registers the view and runs the select
+        def do_it(spark):
+            spark.read.format(format).schema(schema).load(data_path + f"/{table_name}").createOrReplaceTempView(table_name)
+            return spark.table(table_name).select(f.explode(f.col("friends")).alias("friend")).select(query)  # one row per element of 'friends'
+        return do_it
+    conf = {"spark.sql.caseSensitive": "false",  # the parametrized field names use mixed case (First, MIDDLE)
+            "spark.sql.parquet.enableVectorizedReader": "true"}
+    create_contacts_table_and_read(is_partitioned, format, data_path, expected_schemata, read_temp_view, conf, table_name)
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AdaptiveSparkPlanHelperImpl.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AdaptiveSparkPlanHelperImpl.scala
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.rapids.AdaptiveSparkPlanHelperShim
+// Implements AdaptiveSparkPlanHelperShim via Spark's AdaptiveSparkPlanHelper (see ShimLoader).
+class AdaptiveSparkPlanHelperImpl extends AdaptiveSparkPlanHelperShim with AdaptiveSparkPlanHelper
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala
@@ -35,6 +35,7 @@ import org.apache.spark.sql.Strategy
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution.{ColumnarRule, SparkPlan}
+import org.apache.spark.sql.rapids.AdaptiveSparkPlanHelperShim
 import org.apache.spark.sql.rapids.execution.UnshimmedTrampolineUtil
 import org.apache.spark.util.MutableURLClassLoader
 
@@ -415,4 +416,9 @@ object ShimLoader extends Logging {
   def loadGpuColumnVector(): Class[_] = {
     ShimReflectionUtils.loadClass("com.nvidia.spark.rapids.GpuColumnVector")
   }
+
+  def newAdaptiveSparkPlanHelperShim(): AdaptiveSparkPlanHelperShim = {
+    val implClassName = "com.nvidia.spark.rapids.AdaptiveSparkPlanHelperImpl" // passed by name
+    ShimReflectionUtils.newInstanceOf[AdaptiveSparkPlanHelperShim](implClassName)
+  }
 }
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/ExecutionPlanCaptureCallback.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/ExecutionPlanCaptureCallback.scala
@@ -20,7 +20,7 @@ import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable`
 import scala.collection.mutable.{ArrayBuffer, Map => MutableMap}
 import scala.util.matching.Regex
 
-import com.nvidia.spark.rapids.{PlanShims, PlanUtils}
+import com.nvidia.spark.rapids.{PlanShims, PlanUtils, ShimLoader}
 
 import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.catalyst.expressions.Expression
@@ -83,6 +83,45 @@ object ExecutionPlanCaptureCallback {
     fallbackCpuClassList.foreach(fallbackCpuClass => assertDidFallBack(gpuPlans, fallbackCpuClass))
   }
 
+  /**
+   * Used by the Python integration tests. Collects the required schema of every file source scan
+   * in the CPU and GPU executed plans and asserts that each one has the same type as
+   * `expectedSchema` (a data type string), to make sure we are not reading more data than needed.
+   */
+  def assertSchemataMatch(cpuDf: DataFrame, gpuDf: DataFrame, expectedSchema: String): Unit = {
+    import org.apache.spark.sql.execution.FileSourceScanExec
+    import org.apache.spark.sql.types.StructType
+    import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
+
+    val adaptiveSparkPlanHelper = ShimLoader.newAdaptiveSparkPlanHelperShim()
+    val cpuFileSourceScanSchemata =
+      adaptiveSparkPlanHelper.collect(cpuDf.queryExecution.executedPlan) {
+      case scan: FileSourceScanExec => scan.requiredSchema
+    }
+    val gpuFileSourceScanSchemata =
+      adaptiveSparkPlanHelper.collect(gpuDf.queryExecution.executedPlan) {
+      case scan: GpuFileSourceScanExec => scan.requiredSchema
+    }
+    assert(cpuFileSourceScanSchemata.size == gpuFileSourceScanSchemata.size,
+      s"Found ${cpuFileSourceScanSchemata.size} file source scans in the CPU plan, " +
+        s"but ${gpuFileSourceScanSchemata.size} in the GPU plan")
+
+    val expectedStructType = CatalystSqlParser.parseDataType(expectedSchema) // loop-invariant
+    cpuFileSourceScanSchemata.zip(gpuFileSourceScanSchemata).foreach {
+      case (cpuScanSchema, gpuScanSchema) =>
+        cpuScanSchema match {
+          case cpuStructType: StructType =>
+            assert(gpuScanSchema.sameType(cpuStructType),
+              s"GPU ${gpuScanSchema.toDDL} doesn't match CPU ${cpuStructType.toDDL}")
+            assert(gpuScanSchema.sameType(expectedStructType),
+              s"Type GPU schema ${gpuScanSchema.toDDL} doesn't match $expectedSchema")
+            assert(cpuStructType.sameType(expectedStructType),
+              s"Type CPU schema ${cpuStructType.toDDL} doesn't match $expectedSchema")
+          case other => assert(false, s"CPU scan schema is not a StructType: $other")
+        }
+    }
+  }
+
   def assertCapturedAndGpuFellBack(fallbackCpuClass: String, timeoutMs: Long = 2000): Unit = {
     val gpuPlans = getResultsWithTimeout(timeoutMs = timeoutMs)
     assert(gpuPlans.nonEmpty, "Did not capture a plan")
@@ -209,4 +248,8 @@ class ExecutionPlanCaptureCallback extends QueryExecutionListener {
 
   override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit =
     captureIfNeeded(qe)
+}
+
+trait AdaptiveSparkPlanHelperShim { // implemented by AdaptiveSparkPlanHelperImpl
+  def collect[B](p: SparkPlan)(pf: PartialFunction[SparkPlan, B]): Seq[B] // pf results from plan p
 }