Support GpuSubqueryBroadcast for DPP [databricks] #4150

Merged: 17 commits, Dec 17, 2021
1 change: 1 addition & 0 deletions docs/configs.md
@@ -351,6 +351,7 @@ Name | Description | Default Value | Notes
<a name="sql.exec.RangeExec"></a>spark.rapids.sql.exec.RangeExec|The backend for range operator|true|None|
<a name="sql.exec.SampleExec"></a>spark.rapids.sql.exec.SampleExec|The backend for the sample operator|true|None|
<a name="sql.exec.SortExec"></a>spark.rapids.sql.exec.SortExec|The backend for the sort operator|true|None|
<a name="sql.exec.SubqueryBroadcastExec"></a>spark.rapids.sql.exec.SubqueryBroadcastExec|Plan to collect and transform the broadcast key values|true|None|
<a name="sql.exec.TakeOrderedAndProjectExec"></a>spark.rapids.sql.exec.TakeOrderedAndProjectExec|Take the first limit elements as defined by the sortOrder, and do projection if needed|true|None|
<a name="sql.exec.UnionExec"></a>spark.rapids.sql.exec.UnionExec|The backend for the union operator|true|None|
<a name="sql.exec.CustomShuffleReaderExec"></a>spark.rapids.sql.exec.CustomShuffleReaderExec|A wrapper of shuffle query stage|true|None|
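The new config row documents `spark.rapids.sql.exec.SubqueryBroadcastExec`, which defaults to on. A minimal sketch of building a conf map that turns it off (the key name comes from the table above; the surrounding entry is an illustrative assumption):

```python
# Sketch: disabling the new SubqueryBroadcastExec backend via the config key
# documented in docs/configs.md. base_conf contents are illustrative only.
base_conf = {'spark.rapids.sql.enabled': 'true'}
conf = dict(base_conf, **{'spark.rapids.sql.exec.SubqueryBroadcastExec': 'false'})
```

Passing such a dict as the per-test `conf` is how the integration tests below toggle individual operators.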
100 changes: 62 additions & 38 deletions docs/supported_ops.md
@@ -420,8 +420,8 @@ Accelerator supports are described below.
<td><b>NS</b></td>
</tr>
<tr>
<td rowspan="1">TakeOrderedAndProjectExec</td>
<td rowspan="1">Take the first limit elements as defined by the sortOrder, and do projection if needed</td>
<td rowspan="1">SubqueryBroadcastExec</td>
<td rowspan="1">Plan to collect and transform the broadcast key values</td>
<td rowspan="1">None</td>
<td>Input/Output</td>
<td>S</td>
@@ -436,16 +436,16 @@ Accelerator supports are described below.
<td>S</td>
<td>S</td>
<td>S</td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><b>NS</b></td>
<td>S</td>
<td>S</td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP</em></td>
<td>S</td>
</tr>
<tr>
<td rowspan="1">UnionExec</td>
<td rowspan="1">The backend for the union operator</td>
<td rowspan="1">TakeOrderedAndProjectExec</td>
<td rowspan="1">Take the first limit elements as defined by the sortOrder, and do projection if needed</td>
<td rowspan="1">None</td>
<td>Input/Output</td>
<td>S</td>
@@ -464,12 +464,12 @@ Accelerator supports are described below.
<td><b>NS</b></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><em>PS<br/>unionByName will not optionally impute nulls for missing struct fields when the column is a struct and there are non-overlapping fields;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><b>NS</b></td>
</tr>
<tr>
<td rowspan="1">CustomShuffleReaderExec</td>
<td rowspan="1">A wrapper of shuffle query stage</td>
<td rowspan="1">UnionExec</td>
<td rowspan="1">The backend for the union operator</td>
<td rowspan="1">None</td>
<td>Input/Output</td>
<td>S</td>
@@ -488,7 +488,7 @@ Accelerator supports are described below.
<td><b>NS</b></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><em>PS<br/>unionByName will not optionally impute nulls for missing struct fields when the column is a struct and there are non-overlapping fields;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><b>NS</b></td>
</tr>
<tr>
@@ -516,6 +516,30 @@ Accelerator supports are described below.
<th>UDT</th>
</tr>
<tr>
<td rowspan="1">CustomShuffleReaderExec</td>
<td rowspan="1">A wrapper of shuffle query stage</td>
<td rowspan="1">None</td>
<td>Input/Output</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td><em>PS<br/>UTC is only supported TZ for TIMESTAMP</em></td>
<td>S</td>
<td>S</td>
<td>S</td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><b>NS</b></td>
</tr>
<tr>
<td rowspan="1">HashAggregateExec</td>
<td rowspan="1">The backend for hash based aggregations</td>
<td rowspan="1">None</td>
@@ -840,6 +864,30 @@ Accelerator supports are described below.
<td><b>NS</b></td>
</tr>
<tr>
<th>Executor</th>
<th>Description</th>
<th>Notes</th>
<th>Param(s)</th>
<th>BOOLEAN</th>
<th>BYTE</th>
<th>SHORT</th>
<th>INT</th>
<th>LONG</th>
<th>FLOAT</th>
<th>DOUBLE</th>
<th>DATE</th>
<th>TIMESTAMP</th>
<th>STRING</th>
<th>DECIMAL</th>
<th>NULL</th>
<th>BINARY</th>
<th>CALENDAR</th>
<th>ARRAY</th>
<th>MAP</th>
<th>STRUCT</th>
<th>UDT</th>
</tr>
<tr>
<td rowspan="4">ShuffledHashJoinExec</td>
<td rowspan="4">Implementation of join using hashed shuffled data</td>
<td rowspan="4">None</td>
@@ -927,30 +975,6 @@ Accelerator supports are described below.
<td><b>NS</b></td>
</tr>
<tr>
<th>Executor</th>
<th>Description</th>
<th>Notes</th>
<th>Param(s)</th>
<th>BOOLEAN</th>
<th>BYTE</th>
<th>SHORT</th>
<th>INT</th>
<th>LONG</th>
<th>FLOAT</th>
<th>DOUBLE</th>
<th>DATE</th>
<th>TIMESTAMP</th>
<th>STRING</th>
<th>DECIMAL</th>
<th>NULL</th>
<th>BINARY</th>
<th>CALENDAR</th>
<th>ARRAY</th>
<th>MAP</th>
<th>STRUCT</th>
<th>UDT</th>
</tr>
<tr>
<td rowspan="4">SortMergeJoinExec</td>
<td rowspan="4">Sort merge join, replacing with shuffled hash join</td>
<td rowspan="4">None</td>
121 changes: 97 additions & 24 deletions integration_tests/src/main/python/dpp_test.py
@@ -15,7 +15,7 @@
import pytest

from asserts import assert_cpu_and_gpu_are_equal_collect_with_capture
from conftest import spark_tmp_table_factory
from conftest import spark_tmp_table_factory, is_databricks_runtime
from data_gen import *
from marks import ignore_order
from spark_session import is_before_spark_320, with_cpu_session
@@ -81,7 +81,7 @@ def fn(spark):
''',
'''
SELECT f.key, sum(f.value)
FROM (SELECT *, struct(key) AS keys FROM {0} fact) f
FROM (SELECT *, struct(key) AS keys FROM {0} fact) f
JOIN (SELECT *, struct(key) AS keys FROM {1} dim) d
ON f.keys = d.keys
WHERE d.filter = {2}
@@ -91,32 +91,64 @@ def fn(spark):


# When BroadcastExchangeExec is available on filtering side, and it can be reused:
# DynamicPruningExpression(InSubqueryExec(value, SubqueryBroadcastExec)))
# DynamicPruningExpression(InSubqueryExec(value, GpuSubqueryBroadcastExec)))
@ignore_order
@pytest.mark.parametrize('aqe_on', ['true', 'false'], ids=idfn)
@pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn)
@pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn)
@pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ AQE and DPP can be both enabled")
def test_dpp_reuse_broadcast_exchange(aqe_on, store_format, s_index, spark_tmp_table_factory):
@pytest.mark.skipif(is_databricks_runtime(), reason="DPP can not cooperate with rapids plugin on Databricks runtime")
def test_dpp_reuse_broadcast_exchange_aqe_off(store_format, s_index, spark_tmp_table_factory):
fact_table, dim_table = spark_tmp_table_factory.get(), spark_tmp_table_factory.get()
create_fact_table(fact_table, store_format, length=10000)
filter_val = create_dim_table(dim_table, store_format, length=2000)
statement = _statements[s_index].format(fact_table, dim_table, filter_val)
assert_cpu_and_gpu_are_equal_collect_with_capture(
lambda spark: spark.sql(statement),
# SubqueryBroadcastExec appears if we reuse broadcast exchange for DPP
exist_classes='DynamicPruningExpression,SubqueryBroadcastExec',
conf=dict(_exchange_reuse_conf + [('spark.sql.adaptive.enabled', aqe_on)]))
# The existence of GpuSubqueryBroadcastExec indicates the reuse works on the GPU
exist_classes='DynamicPruningExpression,GpuSubqueryBroadcastExec,ReusedExchangeExec',
conf=dict(_exchange_reuse_conf + [('spark.sql.adaptive.enabled', 'false')]))
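The `conf=dict(_exchange_reuse_conf + [...])` idiom above merges a shared list of `(key, value)` pairs with a per-test AQE toggle; with `dict()`, later pairs win on duplicate keys. A small sketch with assumed pair values (the real `_exchange_reuse_conf` is defined earlier in this file):

```python
# dict() over a concatenated list of (key, value) pairs, as the DPP tests do.
# The pair values below are assumptions for illustration only.
_exchange_reuse_conf = [
    ('spark.sql.optimizer.dynamicPartitionPruning.reuseBroadcastOnly', 'true'),
    ('spark.sql.exchange.reuse', 'true'),
]
conf = dict(_exchange_reuse_conf + [('spark.sql.adaptive.enabled', 'false')])
```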


# When BroadcastExchange is not available and non-broadcast DPPs are forbidden, Spark will bypass it:
# DynamicPruningExpression(Literal.TrueLiteral)
# The SubqueryBroadcast can work on the GPU even if the scan that holds it falls back to the CPU.
@ignore_order
@pytest.mark.allow_non_gpu('FileSourceScanExec')
@pytest.mark.skipif(is_databricks_runtime(), reason="DPP can not cooperate with rapids plugin on Databricks runtime")
def test_dpp_reuse_broadcast_exchange_cpu_scan(spark_tmp_table_factory):
fact_table, dim_table = spark_tmp_table_factory.get(), spark_tmp_table_factory.get()
create_fact_table(fact_table, 'parquet', length=10000)
filter_val = create_dim_table(dim_table, 'parquet', length=2000)
statement = _statements[0].format(fact_table, dim_table, filter_val)
assert_cpu_and_gpu_are_equal_collect_with_capture(
lambda spark: spark.sql(statement),
# The existence of GpuSubqueryBroadcastExec indicates the reuse works on the GPU
exist_classes='FileSourceScanExec,GpuSubqueryBroadcastExec,ReusedExchangeExec',
conf=dict(_exchange_reuse_conf + [
('spark.sql.adaptive.enabled', 'false'),
('spark.rapids.sql.format.parquet.read.enabled', 'false')]))


# When AQE is enabled, the broadcast exchange can not currently be reused, because spark-rapids
# plans GpuBroadcastToCpu for the exchange reuse while the original broadcast exchange is
# simply replaced by GpuBroadcastExchange. The reuse therefore fails, since GpuBroadcastToCpu
# is not semantically equal to GpuBroadcastExchange.
Collaborator: Is this something that we should fix? Should we combine the two classes so that they are the same thing, and it does not matter whether you are reading the data on the CPU or the GPU?

Collaborator (Author): I think so. IMO, with the help of the new method SerializeConcatHostBuffersDeserializeBatch.hostBatches, we can change the role of GpuBroadcastToCpu, making it a wrapper of GpuBroadcastExchangeExec, so the GPU broadcast can be reused in terms of serialized host buffers. I tried it in my local environment and it works. I would like to create a separate PR for this change.

Collaborator: Sounds like a good plan.
@ignore_order
@pytest.mark.parametrize('aqe_on', ['true', 'false'], ids=idfn)
@pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn)
@pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn)
@pytest.mark.skipif(is_databricks_runtime(), reason="DPP can not cooperate with rapids plugin on Databricks runtime")
@pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ AQE and DPP can be both enabled")
def test_dpp_bypass(aqe_on, store_format, s_index, spark_tmp_table_factory):
def test_dpp_reuse_broadcast_exchange_aqe_on(store_format, s_index, spark_tmp_table_factory):
Collaborator: Did we miss the corresponding test with AQE off, or is that covered in some other existing test and not really needed?

Collaborator (Author): I named the AQE-off test test_dpp_reuse_broadcast_exchange and appended the suffix _aqe_off to clarify the intention of the tests.

fact_table, dim_table = spark_tmp_table_factory.get(), spark_tmp_table_factory.get()
create_fact_table(fact_table, store_format, length=10000)
filter_val = create_dim_table(dim_table, store_format, length=2000)
statement = _statements[s_index].format(fact_table, dim_table, filter_val)
assert_cpu_and_gpu_are_equal_collect_with_capture(
lambda spark: spark.sql(statement),
exist_classes='DynamicPruningExpression,SubqueryBroadcastExec,GpuBroadcastToCpuExec',
conf=dict(_exchange_reuse_conf + [('spark.sql.adaptive.enabled', 'true')]))


# When BroadcastExchange is not available and non-broadcast DPPs are forbidden, Spark will bypass it:
# DynamicPruningExpression(Literal.TrueLiteral)
def __dpp_bypass(store_format, s_index, spark_tmp_table_factory, aqe_enabled):
fact_table, dim_table = spark_tmp_table_factory.get(), spark_tmp_table_factory.get()
create_fact_table(fact_table, store_format)
filter_val = create_dim_table(dim_table, store_format)
@@ -126,18 +158,30 @@ def test_dpp_bypass(aqe_on, store_format, s_index, spark_tmp_table_factory):
# Bypass with a true literal, if we can not reuse broadcast exchange.
exist_classes='DynamicPruningExpression',
non_exist_classes='SubqueryExec,SubqueryBroadcastExec',
conf=dict(_bypass_conf + [('spark.sql.adaptive.enabled', aqe_on)]))
conf=dict(_bypass_conf + [('spark.sql.adaptive.enabled', aqe_enabled)]))


@ignore_order
@pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn)
@pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn)
@pytest.mark.skipif(is_databricks_runtime(), reason="DPP can not cooperate with rapids plugin on Databricks runtime")
def test_dpp_bypass_aqe_off(store_format, s_index, spark_tmp_table_factory):
__dpp_bypass(store_format, s_index, spark_tmp_table_factory, 'false')


# When BroadcastExchange is not available, but it is still worthwhile to run DPP,
# then Spark will plan an extra Aggregate to collect filtering values:
# DynamicPruningExpression(InSubqueryExec(value, SubqueryExec(Aggregate(...))))
@ignore_order
@pytest.mark.parametrize('aqe_on', ['true', 'false'], ids=idfn)
@pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn)
@pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn)
@pytest.mark.skipif(is_databricks_runtime(), reason="DPP can not cooperate with rapids plugin on Databricks runtime")
@pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ AQE and DPP can be both enabled")
def test_dpp_via_aggregate_subquery(aqe_on, store_format, s_index, spark_tmp_table_factory):
def test_dpp_bypass_aqe_on(store_format, s_index, spark_tmp_table_factory):
__dpp_bypass(store_format, s_index, spark_tmp_table_factory, 'true')


# When BroadcastExchange is not available, but it is still worthwhile to run DPP,
# then Spark will plan an extra Aggregate to collect filtering values:
# DynamicPruningExpression(InSubqueryExec(value, SubqueryExec(Aggregate(...))))
def __dpp_via_aggregate_subquery(store_format, s_index, spark_tmp_table_factory, aqe_enabled):
fact_table, dim_table = spark_tmp_table_factory.get(), spark_tmp_table_factory.get()
create_fact_table(fact_table, store_format)
filter_val = create_dim_table(dim_table, store_format)
@@ -146,16 +190,28 @@ def test_dpp_via_aggregate_subquery(aqe_on, store_format, s_index, spark_tmp_tab
lambda spark: spark.sql(statement),
# SubqueryExec appears if we plan extra subquery for DPP
exist_classes='DynamicPruningExpression,SubqueryExec',
conf=dict(_no_exchange_reuse_conf + [('spark.sql.adaptive.enabled', aqe_on)]))
conf=dict(_no_exchange_reuse_conf + [('spark.sql.adaptive.enabled', aqe_enabled)]))


# When BroadcastExchange is not available, Spark will skip DPP if there is no potential benefit
@ignore_order
@pytest.mark.parametrize('aqe_on', ['true', 'false'], ids=idfn)
@pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn)
@pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn)
@pytest.mark.skipif(is_databricks_runtime(), reason="DPP can not cooperate with rapids plugin on Databricks runtime")
def test_dpp_via_aggregate_subquery_aqe_off(store_format, s_index, spark_tmp_table_factory):
__dpp_via_aggregate_subquery(store_format, s_index, spark_tmp_table_factory, 'false')


@ignore_order
@pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn)
@pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn)
@pytest.mark.skipif(is_databricks_runtime(), reason="DPP can not cooperate with rapids plugin on Databricks runtime")
@pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ AQE and DPP can be both enabled")
def test_dpp_skip(aqe_on, store_format, s_index, spark_tmp_table_factory):
def test_dpp_via_aggregate_subquery_aqe_on(store_format, s_index, spark_tmp_table_factory):
__dpp_via_aggregate_subquery(store_format, s_index, spark_tmp_table_factory, 'true')


# When BroadcastExchange is not available, Spark will skip DPP if there is no potential benefit
def __dpp_skip(store_format, s_index, spark_tmp_table_factory, aqe_enabled):
fact_table, dim_table = spark_tmp_table_factory.get(), spark_tmp_table_factory.get()
create_fact_table(fact_table, store_format)
filter_val = create_dim_table(dim_table, store_format)
@@ -164,4 +220,21 @@ def test_dpp_skip(aqe_on, store_format, s_index, spark_tmp_table_factory):
lambda spark: spark.sql(statement),
# SubqueryExec appears if we plan extra subquery for DPP
non_exist_classes='DynamicPruningExpression',
conf=dict(_dpp_fallback_conf + [('spark.sql.adaptive.enabled', aqe_on)]))
conf=dict(_dpp_fallback_conf + [('spark.sql.adaptive.enabled', aqe_enabled)]))


@ignore_order
@pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn)
@pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn)
@pytest.mark.skipif(is_databricks_runtime(), reason="DPP can not cooperate with rapids plugin on Databricks runtime")
def test_dpp_skip_aqe_off(store_format, s_index, spark_tmp_table_factory):
__dpp_skip(store_format, s_index, spark_tmp_table_factory, 'false')


@ignore_order
@pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn)
@pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn)
@pytest.mark.skipif(is_databricks_runtime(), reason="DPP can not cooperate with rapids plugin on Databricks runtime")
@pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ AQE and DPP can be both enabled")
def test_dpp_skip_aqe_on(store_format, s_index, spark_tmp_table_factory):
__dpp_skip(store_format, s_index, spark_tmp_table_factory, 'true')
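The exist_classes/non_exist_classes assertions used throughout this file boil down to membership checks against the captured plan. A hedged stand-in (the real helper lives in integration_tests/src/main/python/asserts.py; this is not its implementation):

```python
# Illustrative check mirroring what the capture assertions verify: every class
# named in exist_classes appears in the plan string, none from non_exist_classes.
def check_captured_plan(plan_str, exist_classes='', non_exist_classes=''):
    for cls in filter(None, exist_classes.split(',')):
        assert cls in plan_str, 'missing ' + cls
    for cls in filter(None, non_exist_classes.split(',')):
        assert cls not in plan_str, 'unexpected ' + cls
    return True

plan = 'DynamicPruningExpression GpuSubqueryBroadcastExec ReusedExchangeExec'
check_captured_plan(plan, exist_classes='DynamicPruningExpression,GpuSubqueryBroadcastExec')
```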