Enabling AQE on [databricks] #6461

Merged
30 commits merged on Sep 12, 2022

Changes from all commits (30 commits)
8d8b121
WIP: first pass at ShimLeafExecNode, need to update indirect inherito…
NVnavkumar Aug 18, 2022
b3b516c
Move this to until 3.4.0 for non-databricks spark versions
NVnavkumar Aug 18, 2022
c45cbe0
Set this flag to true in 3.2.1 DB shim
NVnavkumar Aug 18, 2022
5b3f954
WIP: some test updates with enabling AQE
NVnavkumar Aug 18, 2022
40e7044
Move these shim implementations to right place
NVnavkumar Aug 18, 2022
d0c5ccb
revert this test change for now, need a better solution
NVnavkumar Aug 18, 2022
acaa586
WIP: Re-enable aqe Databricks tests
NVnavkumar Aug 18, 2022
e203e9c
Unblock these 2 tests on Databricks
NVnavkumar Aug 19, 2022
91dc00a
WIP: integration tests for AQE
NVnavkumar Aug 19, 2022
1ce22b2
WIP: AQE integration tests
NVnavkumar Aug 19, 2022
c3ebbbb
Updated AQE tests to ensure that leafexecnodes are tested for Databricks
NVnavkumar Aug 22, 2022
4a1f591
Add shim for DatasourceV2ExecBase to implement the equivalent compute…
NVnavkumar Aug 23, 2022
a64a484
fix unused import on Spark 3.1.x
NVnavkumar Aug 23, 2022
1682f87
Add AQE unit test to handle window aggregate condition
NVnavkumar Aug 24, 2022
da750f0
Fix windowexec issue with missing references to child expressions due…
NVnavkumar Aug 29, 2022
b9e2bad
Merge branch 'branch-22.10' of github.com:NVIDIA/spark-rapids into aq…
NVnavkumar Aug 31, 2022
d65c307
Fix some style issues
NVnavkumar Aug 31, 2022
cdb32c6
Found a potential union based join unit test that will crash when AQE…
NVnavkumar Aug 31, 2022
4922fb4
Merge branch 'aqe_on_db' of github.com:NVnavkumar/spark-rapids into a…
NVnavkumar Aug 31, 2022
12fe02c
Disable GPU shuffle on older Databricks, and switch current Databrick…
NVnavkumar Sep 1, 2022
29da0c1
Refactor unit tests for handling issues with Databricks 9.1
NVnavkumar Sep 1, 2022
541d91a
Address feedback
NVnavkumar Sep 6, 2022
6e93d9c
Update comment
NVnavkumar Sep 6, 2022
afbdf2b
Enable GPU shuffle in AQE on Databricks 9.1, remove unnecessary shim …
NVnavkumar Sep 6, 2022
41adfa9
cleanup and add comments to tests
NVnavkumar Sep 7, 2022
c470b72
Add cache join test for AQE
NVnavkumar Sep 7, 2022
3bf9f21
remove windowing fix, and move to a separate branch since this is not…
NVnavkumar Sep 9, 2022
819d54d
Merge branch 'branch-22.10' into aqe_on_db
NVnavkumar Sep 9, 2022
bf10236
This should be allowed not to run on GPU since AQE can push it off
NVnavkumar Sep 9, 2022
9c4288b
Allow ColumnarToRowExec to not run on GPU because it tends to fallbac…
NVnavkumar Sep 9, 2022
123 changes: 123 additions & 0 deletions integration_tests/src/main/python/aqe_test.py
@@ -0,0 +1,123 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
from pyspark.sql.functions import when, col
from pyspark.sql.types import *
from asserts import assert_gpu_and_cpu_are_equal_collect
from data_gen import *
from marks import ignore_order, allow_non_gpu
from spark_session import with_cpu_session

_adaptive_conf = { "spark.sql.adaptive.enabled": "true" }

def create_skew_df(spark, length):
    root = spark.range(0, length)
    mid = length / 2
    left = root.select(
        when(col('id') < mid / 2, mid).
        otherwise('id').alias("key1"),
        col('id').alias("value1")
    )
    right = root.select(
        when(col('id') < mid, mid).
        otherwise('id').alias("key2"),
        col('id').alias("value2")
    )
    return left, right


# This replicates the skew join test from scala tests, and is here to test
# the computeStats(...) implementation in GpuRangeExec
@ignore_order(local=True)
def test_aqe_skew_join():
    def do_join(spark):
        left, right = create_skew_df(spark, 500)
        left.createOrReplaceTempView("skewData1")
        right.createOrReplaceTempView("skewData2")
        return spark.sql("SELECT * FROM skewData1 join skewData2 ON key1 = key2")

    assert_gpu_and_cpu_are_equal_collect(do_join, conf=_adaptive_conf)

# Test the computeStats(...) implementation in GpuDataSourceScanExec
@ignore_order(local=True)
@pytest.mark.parametrize("data_gen", integral_gens, ids=idfn)
def test_aqe_join_parquet(spark_tmp_path, data_gen):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: unary_op_df(spark, data_gen).orderBy('a').write.parquet(data_path)
    )

    def do_it(spark):
        spark.read.parquet(data_path).createOrReplaceTempView('df1')
        spark.read.parquet(data_path).createOrReplaceTempView('df2')
        return spark.sql("select count(*) from df1,df2 where df1.a = df2.a")

    assert_gpu_and_cpu_are_equal_collect(do_it, conf=_adaptive_conf)


# Test the computeStats(...) implementation in GpuBatchScanExec
@ignore_order(local=True)
@pytest.mark.parametrize("data_gen", integral_gens, ids=idfn)
def test_aqe_join_parquet_batch(spark_tmp_path, data_gen):
    # force v2 source for parquet to use BatchScanExec
    conf = copy_and_update(_adaptive_conf, {
        "spark.sql.sources.useV1SourceList": ""
    })

    first_data_path = spark_tmp_path + '/PARQUET_DATA/key=0'
    with_cpu_session(
        lambda spark : unary_op_df(spark, data_gen).write.parquet(first_data_path))
    second_data_path = spark_tmp_path + '/PARQUET_DATA/key=1'
    with_cpu_session(
        lambda spark : unary_op_df(spark, data_gen).write.parquet(second_data_path))
    data_path = spark_tmp_path + '/PARQUET_DATA'

    def do_it(spark):
        spark.read.parquet(data_path).createOrReplaceTempView('df1')
        spark.read.parquet(data_path).createOrReplaceTempView('df2')
        return spark.sql("select count(*) from df1,df2 where df1.a = df2.a")

    assert_gpu_and_cpu_are_equal_collect(do_it, conf=conf)

# Test the map stage submission handling for GpuShuffleExchangeExec
@ignore_order(local=True)
def test_aqe_struct_self_join(spark_tmp_table_factory):
    def do_join(spark):
        data = [
            (("Adam ", "", "Green"), "1", "M", 1000),
            (("Bob ", "Middle", "Green"), "2", "M", 2000),
            (("Cathy ", "", "Green"), "3", "F", 3000)
        ]
        schema = (StructType()
                  .add("name", StructType()
                       .add("firstname", StringType())
                       .add("middlename", StringType())
                       .add("lastname", StringType()))
                  .add("id", StringType())
                  .add("gender", StringType())
                  .add("salary", IntegerType()))
        df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
        df_name = spark_tmp_table_factory.get()
        df.createOrReplaceTempView(df_name)
        resultdf = spark.sql(
            "select struct(name, struct(name.firstname, name.lastname) as newname)" +
            " as col,name from " + df_name + " union" +
            " select struct(name, struct(name.firstname, name.lastname) as newname) as col,name" +
            " from " + df_name)
        resultdf_name = spark_tmp_table_factory.get()
        resultdf.createOrReplaceTempView(resultdf_name)
        return spark.sql("select a.* from {} a, {} b where a.name=b.name".format(
            resultdf_name, resultdf_name))
    assert_gpu_and_cpu_are_equal_collect(do_join, conf=_adaptive_conf)
14 changes: 14 additions & 0 deletions integration_tests/src/main/python/cache_test.py
@@ -333,3 +333,17 @@ def test_func(spark):
        df.cache().count()
        return df.selectExpr("b", "a")
    assert_gpu_and_cpu_are_equal_collect(test_func, enable_vectorized_conf)

# For AQE, test the computeStats(...) implementation in GpuInMemoryTableScanExec
# NOTE: this test is here because the necessary cache configuration is only
# available when this test file is used
@ignore_order(local=True)
@allow_non_gpu("ShuffleExchangeExec", "ColumnarToRowExec")
@pytest.mark.parametrize("data_gen", integral_gens, ids=idfn)
def test_aqe_cache_join(data_gen):
    conf = {'spark.sql.adaptive.enabled': 'true'}
    def do_it(spark):
        df1 = unary_op_df(spark, data_gen).orderBy('a').cache()
        df2 = df1.alias('df2')
        return df1.join(df2, df1.a == df2.a, 'Outer')
    assert_gpu_and_cpu_are_equal_collect(do_it, conf=conf)
6 changes: 0 additions & 6 deletions integration_tests/src/main/python/join_test.py
@@ -144,9 +144,6 @@ def do_join(spark):
    assert_gpu_and_cpu_are_equal_collect(do_join)

@ignore_order(local=True)
@pytest.mark.skipif(is_databricks_runtime(),
reason="Disabled for databricks because of lack of AQE support, and "
"differences in BroadcastMode.transform")
@pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti'], ids=idfn)
def test_right_broadcast_nested_loop_join_without_condition_empty_small_batch(join_type):
    def do_join(spark):
@@ -155,9 +152,6 @@ def do_join(spark):
    assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.sql.adaptive.enabled': 'true'})

@ignore_order(local=True)
@pytest.mark.skipif(is_databricks_runtime(),
reason="Disabled for databricks because of lack of AQE support, and "
"differences in BroadcastMode.transform")
@pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti'], ids=idfn)
def test_empty_broadcast_hash_join(join_type):
    def do_join(spark):
@@ -0,0 +1,43 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.shims

import org.apache.spark.sql.catalyst.plans.logical.Statistics
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanExecBase

trait ShimLeafExecNode extends LeafExecNode {
  // For AQE support in Databricks, all Exec nodes implement computeStats(). This is actually
  // a recursive call to traverse the entire physical plan to aggregate this number. For the
  // end of the computation, this means that all LeafExecNodes must implement this method to
  // avoid a stack overflow. For now, based on feedback from Databricks, Long.MaxValue is
  // sufficient to satisfy this computation.
  override def computeStats(): Statistics = {
    Statistics(
      sizeInBytes = Long.MaxValue
    )
  }
}

// DataSourceV2ScanExecBase actually extends LeafExecNode, so we extend that shim as well here.
trait ShimDataSourceV2ScanExecBase extends DataSourceV2ScanExecBase {
  override def computeStats(): Statistics = {
    Statistics(
      sizeInBytes = Long.MaxValue
    )
  }
}
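
For illustration only (this sketch is not part of the PR's diff): the comment above explains that Databricks AQE recursively aggregates statistics over the physical plan, so every leaf node must provide computeStats(). A hypothetical GPU leaf operator that consumes the shim could look like the following; the class name and the empty doExecute() body are assumptions made for the example.

package com.nvidia.spark.rapids.shims

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute

// Hypothetical leaf operator, shown only to illustrate the shim: mixing in
// ShimLeafExecNode inherits the Long.MaxValue computeStats() override, so the
// recursive statistics walk performed by Databricks AQE terminates at this leaf.
case class ExampleGpuLeafExec(output: Seq[Attribute]) extends ShimLeafExecNode {
  // A real GPU operator would produce columnar batches; an empty row RDD keeps
  // the sketch self-contained without pulling in the rest of the plugin.
  override protected def doExecute(): RDD[InternalRow] =
    sparkContext.emptyRDD[InternalRow]
}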
@@ -0,0 +1,24 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.shims

import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanExecBase

trait ShimLeafExecNode extends LeafExecNode

trait ShimDataSourceV2ScanExecBase extends DataSourceV2ScanExecBase
@@ -23,11 +23,10 @@ import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.connector.read._
import org.apache.spark.sql.execution.datasources.v2._

case class GpuBatchScanExec(
    output: Seq[AttributeReference],
    @transient scan: Scan) extends DataSourceV2ScanExecBase with GpuBatchScanExecMetrics {
    @transient scan: Scan) extends ShimDataSourceV2ScanExecBase with GpuBatchScanExecMetrics {
  @transient lazy val batch: Batch = scan.toBatch

  @transient override lazy val partitions: Seq[InputPartition] = batch.planInputPartitions()
@@ -28,6 +28,5 @@ object AQEUtils {
    ShuffleQueryStageExec(sqse.id, reusedExchange, sqse.originalPlan)
  }

  // currently we don't support AQE on Databricks
  def isAdaptiveExecutionSupportedInSparkVersion: Boolean = false
  def isAdaptiveExecutionSupportedInSparkVersion: Boolean = true
}
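
For orientation only (not code from this PR or the plugin): the flag flipped above is a per-shim capability switch, and a caller can combine it with the session's AQE setting before deciding whether adaptive planning can be relied on. The GpuAqeGate object, its method name, and the assumed com.nvidia.spark.rapids.shims package for AQEUtils are all assumptions made for this sketch.

import com.nvidia.spark.rapids.shims.AQEUtils // assumed location of the shim object

import org.apache.spark.sql.SparkSession

// Illustrative gate, not plugin code: treat AQE as usable only when the
// session enables it and the active shim reports support for it.
object GpuAqeGate {
  def adaptivePlanningUsable(session: SparkSession): Boolean = {
    val aqeEnabled =
      session.conf.get("spark.sql.adaptive.enabled", "false").toBoolean
    aqeEnabled && AQEUtils.isAdaptiveExecutionSupportedInSparkVersion
  }
}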
@@ -39,7 +39,8 @@ case class GpuShuffleExchangeExec(
  override val outputPartitioning: Partitioning = cpuOutputPartitioning

  // 'mapOutputStatisticsFuture' is only needed when enable AQE.
  override def doMapOutputStatisticsFuture: Future[MapOutputStatistics] = {
  @transient
  override lazy val doMapOutputStatisticsFuture: Future[MapOutputStatistics] = {
    if (inputBatchRDD.getNumPartitions == 0) {
      Future.successful(null)
    } else {
@@ -34,7 +34,7 @@ case class GpuBatchScanExec(
    output: Seq[AttributeReference],
    @transient scan: Scan,
    runtimeFilters: Seq[Expression] = Seq.empty)
  extends DataSourceV2ScanExecBase with GpuBatchScanExecMetrics {
  extends ShimDataSourceV2ScanExecBase with GpuBatchScanExecMetrics {
  @transient lazy val batch: Batch = scan.toBatch

  // All expressions are filter expressions used on the CPU.
@@ -28,6 +28,5 @@ object AQEUtils {
    ShuffleQueryStageExec(sqse.id, reusedExchange, sqse.originalPlan, sqse.isSparkExchange)
  }

  // currently we don't support AQE on Databricks
  def isAdaptiveExecutionSupportedInSparkVersion: Boolean = false
  def isAdaptiveExecutionSupportedInSparkVersion: Boolean = true
}
@@ -15,37 +15,27 @@
*/
package org.apache.spark.rapids.shims

import scala.concurrent.Future

import com.nvidia.spark.rapids.GpuPartitioning

import org.apache.spark.MapOutputStatistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.plans.logical.Statistics
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.{ShufflePartitionSpec, SparkPlan}
import org.apache.spark.sql.execution.exchange.{ShuffleExchangeLike, ShuffleOrigin}
import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, ShuffledBatchRDD}
import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBaseWithMetrics, ShuffledBatchRDD}

case class GpuShuffleExchangeExec(
    gpuOutputPartitioning: GpuPartitioning,
    child: SparkPlan,
    shuffleOrigin: ShuffleOrigin)(
    cpuOutputPartitioning: Partitioning)
  extends GpuShuffleExchangeExecBase(gpuOutputPartitioning, child) with ShuffleExchangeLike {
  extends GpuShuffleExchangeExecBaseWithMetrics(gpuOutputPartitioning, child)
    with ShuffleExchangeLike {

  override def otherCopyArgs: Seq[AnyRef] = cpuOutputPartitioning :: Nil

  override val outputPartitioning: Partitioning = cpuOutputPartitioning

  // 'mapOutputStatisticsFuture' is only needed when enable AQE.
  override def mapOutputStatisticsFuture: Future[MapOutputStatistics] =
    if (inputBatchRDD.getNumPartitions == 0) {
      Future.successful(null)
    } else {
      sparkContext.submitMapStage(shuffleDependencyColumnar)
    }

  override def numMappers: Int = shuffleDependencyColumnar.rdd.getNumPartitions

  override def numPartitions: Int = shuffleDependencyColumnar.partitioner.numPartitions
@@ -36,7 +36,7 @@ case class GpuBatchScanExec(
    @transient scan: Scan,
    runtimeFilters: Seq[Expression] = Seq.empty,
    keyGroupedPartitioning: Option[Seq[Expression]] = None)
  extends DataSourceV2ScanExecBase with GpuBatchScanExecMetrics {
  extends ShimDataSourceV2ScanExecBase with GpuBatchScanExecMetrics {
  @transient lazy val batch: Batch = scan.toBatch

  // All expressions are filter expressions used on the CPU.
@@ -23,15 +23,15 @@ import ai.rapids.cudf
import ai.rapids.cudf._
import com.nvidia.spark.rapids.GpuMetric._
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.shims.{ShimSparkPlan, ShimUnaryExecNode}
import com.nvidia.spark.rapids.shims.{ShimLeafExecNode, ShimSparkPlan, ShimUnaryExecNode}

import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, AttributeReference, Descending, Expression, NamedExpression, NullIntolerant, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, RangePartitioning, SinglePartition, UnknownPartitioning}
import org.apache.spark.sql.execution.{LeafExecNode, ProjectExec, SampleExec, SparkPlan}
import org.apache.spark.sql.execution.{ProjectExec, SampleExec, SparkPlan}
import org.apache.spark.sql.rapids.{GpuPartitionwiseSampledRDD, GpuPoissonSampler, GpuPredicateHelper}
import org.apache.spark.sql.rapids.execution.TrampolineUtil
import org.apache.spark.sql.types.{DataType, LongType}
@@ -591,7 +591,7 @@ case class GpuRangeExec(
    numSlices: Int,
    output: Seq[Attribute],
    targetSizeBytes: Long)
  extends LeafExecNode with GpuExec {
  extends ShimLeafExecNode with GpuExec {

  val numElements: BigInt = {
    val safeStart = BigInt(start)
@@ -17,18 +17,19 @@
package org.apache.spark.sql.rapids

import com.nvidia.spark.rapids.GpuExec
import com.nvidia.spark.rapids.shims.ShimLeafExecNode
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.fs.Path

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.catalyst.util.truncatedString
import org.apache.spark.sql.execution.{ExplainUtils, LeafExecNode}
import org.apache.spark.sql.execution.ExplainUtils
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

/** GPU implementation of Spark's `DataSourceScanExec` */
trait GpuDataSourceScanExec extends LeafExecNode with GpuExec {
trait GpuDataSourceScanExec extends ShimLeafExecNode with GpuExec {
  def relation: BaseRelation
  def tableIdentifier: Option[TableIdentifier]
