Implement ExistenceJoin Iterator using an auxiliary left semijoin #4796

Merged
34 commits
a080f9a
wip
gerashegalov Feb 9, 2022
cf135d7
Merge remote-tracking branch 'origin/branch-22.04' into gerashegalov/…
gerashegalov Feb 9, 2022
4ad4616
Merge remote-tracking branch 'origin/branch-22.04' into gerashegalov/…
gerashegalov Feb 15, 2022
deba9a1
wip
gerashegalov Feb 16, 2022
664ba4e
wip
gerashegalov Feb 16, 2022
a5da440
Use verboseStringWithSuffix(1000) for TreeNode match
gerashegalov Feb 16, 2022
b55a3e1
Use Scala Int.MinValue
gerashegalov Feb 16, 2022
6293dc1
config.md update for spark.rapids.sql.join.existence.enabled
gerashegalov Feb 16, 2022
8c9597b
Merge remote-tracking branch 'origin/branch-22.04' into gerashegalov/…
gerashegalov Feb 16, 2022
2517f73
undo buildall changes
gerashegalov Feb 16, 2022
193dbac
restore whitespace
gerashegalov Feb 16, 2022
5ed19db
existence join test with rhs duplicates
gerashegalov Feb 17, 2022
1c332ec
semijoin-based implementation
gerashegalov Feb 18, 2022
a02992e
undo cosmetic change
gerashegalov Feb 18, 2022
7e6bfcd
fix tagForGpu in GpuHashJoin
gerashegalov Feb 18, 2022
e333159
test updates
gerashegalov Feb 19, 2022
89c9a0e
draft
gerashegalov Feb 22, 2022
f4fe704
Merge remote-tracking branch 'origin/branch-22.04' into gerashegalov/…
gerashegalov Feb 23, 2022
465a957
undo gatherer changes
gerashegalov Feb 23, 2022
1a44866
mem leaks fixed
gerashegalov Feb 24, 2022
baea237
lhs dupes, conditional join
gerashegalov Feb 24, 2022
29704bf
mixedLeftSemiJoinGatherMap
gerashegalov Feb 25, 2022
b19c98f
mnemonic test ids
gerashegalov Feb 25, 2022
2948a66
refactoring
gerashegalov Feb 25, 2022
6d55b6a
comment fix
gerashegalov Feb 25, 2022
a700f33
wip
gerashegalov Feb 28, 2022
67d8616
broadcast hash join test
gerashegalov Mar 2, 2022
e4c0a40
undo import py
gerashegalov Mar 2, 2022
3a5a173
undo explain
gerashegalov Mar 2, 2022
2f88244
undo explain
gerashegalov Mar 2, 2022
5393301
fix bhj test id
gerashegalov Mar 2, 2022
02b037c
Update comment in join_test.py
gerashegalov Mar 2, 2022
9f2ed60
review comments
gerashegalov Mar 3, 2022
df4ad01
Merge remote-tracking branch 'origin/branch-22.04' into pr/gerashegal…
gerashegalov Mar 3, 2022
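For context: Spark plans an ExistenceJoin when an EXISTS (or IN) subquery appears inside a disjunction, where a plain left semi join cannot express the predicate; every left row is kept and a boolean `exists` column records whether it had a match. A minimal sketch of a query shape that triggers it (table names hypothetical, mirroring the join_test.py change below):

```scala
// Hypothetical tables lhs(_1, _2, _3) and rhs(_1, _2, _3). The OR branch keeps rows
// that fail the EXISTS check, so the optimizer cannot rewrite this as a LeftSemi
// join and plans ExistenceJoin(exists#N) instead.
spark.sql(
  """SELECT *
    |FROM lhs AS l
    |WHERE l._2 >= 80
    |   OR EXISTS (SELECT * FROM rhs AS r WHERE r._2 = l._2)
  """.stripMargin)
```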
1 change: 1 addition & 0 deletions docs/configs.md
@@ -98,6 +98,7 @@ Name | Description | Default Value
<a name="sql.incompatibleDateFormats.enabled"></a>spark.rapids.sql.incompatibleDateFormats.enabled|When parsing strings as dates and timestamps in functions like unix_timestamp, some formats are fully supported on the GPU and some are unsupported and will fall back to the CPU. Some formats behave differently on the GPU than the CPU. Spark on the CPU interprets date formats with unsupported trailing characters as nulls, while Spark on the GPU will parse the date with invalid trailing characters. More detail can be found at [parsing strings as dates or timestamps](compatibility.md#parsing-strings-as-dates-or-timestamps).|false
<a name="sql.incompatibleOps.enabled"></a>spark.rapids.sql.incompatibleOps.enabled|For operations that work, but are not 100% compatible with the Spark equivalent set if they should be enabled by default or disabled by default.|false
<a name="sql.join.cross.enabled"></a>spark.rapids.sql.join.cross.enabled|When set to true cross joins are enabled on the GPU|true
<a name="sql.join.existence.enabled"></a>spark.rapids.sql.join.existence.enabled|When set to true existence joins are enabled on the GPU|true
<a name="sql.join.fullOuter.enabled"></a>spark.rapids.sql.join.fullOuter.enabled|When set to true full outer joins are enabled on the GPU|true
<a name="sql.join.inner.enabled"></a>spark.rapids.sql.join.inner.enabled|When set to true inner joins are enabled on the GPU|true
<a name="sql.join.leftAnti.enabled"></a>spark.rapids.sql.join.leftAnti.enabled|When set to true left anti joins are enabled on the GPU|true
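The new flag defaults to on; a minimal sketch of turning it off for a session, assuming a live `SparkSession` named `spark`:

```scala
// Fall back to the CPU for existence joins while keeping other GPU joins enabled
// (configuration name taken from the configs.md row above).
spark.conf.set("spark.rapids.sql.join.existence.enabled", "false")
```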
60 changes: 44 additions & 16 deletions integration_tests/src/main/python/join_test.py
@@ -779,27 +779,55 @@ def do_join(spark):
# If the condition is something like an AND, it makes the result a subset of a SemiJoin, and
# the optimizer won't use ExistenceJoin.
@ignore_order(local=True)
-@pytest.mark.parametrize(
-    "allowFallback", [
-        pytest.param('true',
-            marks=pytest.mark.allow_non_gpu('SortMergeJoinExec')),
-        pytest.param('false',
-            marks=pytest.mark.xfail(reason="https://github.com/NVIDIA/spark-rapids/issues/589"))
-    ], ids=idfn
-)
-def test_existence_join(allowFallback, spark_tmp_table_factory):
@pytest.mark.parametrize('numComplementsToExists', [0, 1, 2], ids=(lambda val: f"complements:{val}"))
@pytest.mark.parametrize('aqeEnabled', [
    pytest.param(False, id='aqe:off'),
    # workaround: somehow AQE retains RDDScanExec, preventing the parent ShuffleExchangeExec
    # from being executed on the GPU
    pytest.param(True, marks=pytest.mark.allow_non_gpu('ShuffleExchangeExec'), id='aqe:on')
])
@pytest.mark.parametrize('conditionalJoin', [False, True], ids=['ast:off', 'ast:on'])
def test_existence_join(numComplementsToExists, aqeEnabled, conditionalJoin, spark_tmp_table_factory):
    leftTable = spark_tmp_table_factory.get()
    rightTable = spark_tmp_table_factory.get()
    def do_join(spark):
-        # create non-overlapping ranges to have a mix of exists=true and exists=false
-        spark.createDataFrame([v] for v in range(2, 10)).createOrReplaceTempView(leftTable)
-        spark.createDataFrame([v] for v in range(0, 8)).createOrReplaceTempView(rightTable)
        # left-hand side rows
        lhs_upper_bound = 10
        lhs_data = list((f"left_{v}", v * 10, v * 100) for v in range(2, lhs_upper_bound))
        # duplicate without a match
        lhs_data.append(('left_1', 10, 100))
        # duplicate with a match
        lhs_data.append(('left_2', 20, 200))
        lhs_data.append(('left_null', None, None))
        df_left = spark.createDataFrame(lhs_data)
        df_left.createOrReplaceTempView(leftTable)

        rhs_data = list((f"right_{v}", v * 10, v * 100) for v in range(0, 8))
        rhs_data.append(('right_null', None, None))
        # duplicate every row in the rhs to verify it does not affect
        # the number of output rows, which should equal the number of lhs rows
        rhs_data_with_dupes = []
        for dupe in rhs_data:
            rhs_data_with_dupes.extend([dupe, dupe])

        df_right = spark.createDataFrame(rhs_data_with_dupes)
        df_right.createOrReplaceTempView(rightTable)
        cond = "<=" if conditionalJoin else "="
        res = spark.sql((
            "select * "
            "from {} as l "
-            "where l._1 < 0 "
-            " OR l._1 in (select * from {} as r)"
-            ).format(leftTable, rightTable))
            f"where l._2 >= {10 * (lhs_upper_bound - numComplementsToExists)}"
            " or exists (select * from {} as r where r._2 = l._2 AND r._3 {} l._3)"
        ).format(leftTable, rightTable, cond))
        return res
-    assert_cpu_and_gpu_are_equal_collect_with_capture(do_join, r".+Join ExistenceJoin\(exists#[0-9]+\).+")

    if conditionalJoin:
        existenceJoinRegex = r"ExistenceJoin\(exists#[0-9]+\), \(.+ <= .+\)"
    else:
        existenceJoinRegex = r"ExistenceJoin\(exists#[0-9]+\)"

    assert_cpu_and_gpu_are_equal_collect_with_capture(do_join, existenceJoinRegex,
        conf={
            "spark.sql.adaptive.enabled": aqeEnabled,
        })
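For reference, a sketch of what the stricter conditional regex is intended to capture; the plan line is illustrative only, since the exact rendering varies by Spark version and join implementation:

```scala
// Both the exists#N attribute ID and the AST-compiled `<=` condition must appear
// in the captured plan string for the conditional variant to count as a match.
val conditionalRegex = """ExistenceJoin\(exists#[0-9]+\), \(.+ <= .+\)""".r
val planLine = "GpuShuffledHashJoin [_2#5], [_2#11], ExistenceJoin(exists#20), (_3#6 <= _3#12)"
assert(conditionalRegex.findFirstIn(planLine).nonEmpty)
```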
@@ -417,8 +417,9 @@ object ExecutionPlanCaptureCallback {
      case p if p.expressions.exists(containsExpression(_, className, regexMap)) =>
        true
      case p: SparkPlan =>
        val sparkPlanStringForRegex = p.verboseStringWithSuffix(1000)
        regexMap.getOrElseUpdate(className, className.r)
-          .findFirstIn(p.simpleStringWithNodeId())
          .findFirstIn(sparkPlanStringForRegex)
          .nonEmpty
    }.nonEmpty
  }
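The switch above matters because the node's simple string elides the join details that the test regexes need. A sketch of the motivation, with illustrative renderings (exact output depends on the Spark version):

```scala
import org.apache.spark.sql.execution.SparkPlan

// simpleStringWithNodeId() yields just the node name and ID, e.g.
// "GpuShuffledHashJoin (11)", while verboseStringWithSuffix(1000) also renders the
// join type and condition, e.g. "... ExistenceJoin(exists#20), (_3#6 <= _3#12) ...",
// which is the text the ExistenceJoin regexes in join_test.py match against.
def planStringForRegex(plan: SparkPlan): String = plan.verboseStringWithSuffix(1000)
```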
@@ -697,6 +697,11 @@ object RapidsConf {
    .booleanConf
    .createWithDefault(true)

  val ENABLE_EXISTENCE_JOIN = conf("spark.rapids.sql.join.existence.enabled")
    .doc("When set to true existence joins are enabled on the GPU")
    .booleanConf
    .createWithDefault(true)

  val ENABLE_PROJECT_AST = conf("spark.rapids.sql.projectAstEnabled")
    .doc("Enable project operations to use cudf AST expressions when possible.")
    .internal()
@@ -1562,6 +1567,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {

  lazy val areLeftAntiJoinsEnabled: Boolean = get(ENABLE_LEFT_ANTI_JOIN)

  lazy val areExistenceJoinsEnabled: Boolean = get(ENABLE_EXISTENCE_JOIN)

  lazy val isCastDecimalToFloatEnabled: Boolean = get(ENABLE_CAST_DECIMAL_TO_FLOAT)

  lazy val isCastFloatToDecimalEnabled: Boolean = get(ENABLE_CAST_FLOAT_TO_DECIMAL)
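Downstream, the new accessor is read off a `RapidsConf` built from plain string pairs; a minimal sketch, assuming an active `SparkSession` named `spark` (RapidsConf itself is plugin-internal):

```scala
// RapidsConf takes a Map[String, String] (see the class signature above), so the
// session configuration can be fed to it directly.
val rapidsConf = new RapidsConf(spark.conf.getAll)
val existenceOnGpu: Boolean = rapidsConf.areExistenceJoinsEnabled
```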
@@ -15,9 +15,12 @@
 */
package org.apache.spark.sql.rapids.execution

-import ai.rapids.cudf.{DType, GroupByAggregation, NullEquality, NullPolicy, NvtxColor, ReductionAggregation, Table}
import scala.collection.mutable.ArrayBuffer

import ai.rapids.cudf.{ColumnVector, DType, GatherMap, GroupByAggregation, NullEquality, NullPolicy, NvtxColor, ReductionAggregation, Scalar, Table}
import ai.rapids.cudf.ast.CompiledExpression
import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.RapidsPluginImplicits._

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, NamedExpression}
import org.apache.spark.sql.catalyst.plans.{Cross, ExistenceJoin, FullOuter, Inner, InnerLike, JoinType, LeftAnti, LeftExistence, LeftOuter, LeftSemi, RightOuter}

@@ -50,6 +53,9 @@ object JoinTypeChecks {
      case LeftAnti if !conf.areLeftAntiJoinsEnabled =>
        meta.willNotWorkOnGpu("left anti joins have been disabled. To enable set " +
          s"${RapidsConf.ENABLE_LEFT_ANTI_JOIN.key} to true")
      case ExistenceJoin(_) if !conf.areExistenceJoinsEnabled =>
        meta.willNotWorkOnGpu("existence joins have been disabled. To enable set " +
          s"${RapidsConf.ENABLE_EXISTENCE_JOIN.key} to true")
      case _ => // not disabled
    }
  }
@@ -107,7 +113,7 @@ object GpuHashJoin extends Arm {
    JoinTypeChecks.tagForGpu(joinType, meta)
    joinType match {
      case _: InnerLike =>
-      case RightOuter | LeftOuter | LeftSemi | LeftAnti =>
      case RightOuter | LeftOuter | LeftSemi | LeftAnti | ExistenceJoin(_) =>
        conditionMeta.foreach(meta.requireAstForGpuOn)
      case FullOuter =>
        conditionMeta.foreach(meta.requireAstForGpuOn)

@@ -622,7 +628,114 @@ trait GpuHashJoin extends GpuExec {
    (joinLeft.output.size, boundCondition)
  }

-  def doJoin(
  private def existenceJoinIterator(
      builtBatch: ColumnarBatch,
      stream: Iterator[ColumnarBatch]) = new Iterator[ColumnarBatch]()
    with AutoCloseable with Arm {

    var closed: Boolean = false

    // iteration-independent resources
    val resources = ArrayBuffer[AutoCloseable]()
    def use[T <: AutoCloseable](ac: T): T = {
      resources += ac
      ac
    }

    val compiledConditionRes: Option[(Table, CompiledExpression)] = boundCondition.map(gpuExpr => (
      use(GpuColumnVector.from(builtBatch)),
      use(gpuExpr.convertToAst(numFirstConditionTableColumns).compile())
    ))

    val rightKeysTab = use(
      withResource(GpuProjectExec.project(builtBatch, boundBuildKeys))(GpuColumnVector.from(_)))

    val falseScalar = use(Scalar.fromBool(false))
    val trueScalar = use(Scalar.fromBool(true))

    override def hasNext: Boolean = {
      val streamHasNext = stream.hasNext
      if (!streamHasNext) {
        close()
      }
      streamHasNext
    }

    override def next(): ColumnarBatch = {
      try {
        withResource(stream.next()) { leftColumnarBatch =>
          existenceJoinNextBatch(leftColumnarBatch)
        }
      } catch {
        case t: Throwable =>
          close()
          throw t
      }
    }

    override def close(): Unit = if (!closed) {
      closed = true
      val resourcesReversed = resources.reverse
      resourcesReversed.safeClose()
    }

    private def leftKeysTable(leftColumnarBatch: ColumnarBatch): Table = {
      withResource(GpuProjectExec.project(leftColumnarBatch, boundStreamKeys))(
        leftKeys => GpuColumnVector.from(leftKeys))
    }

    private def conditionalBatchLeftSemiJoin(
        leftColumnarBatch: ColumnarBatch,
        rightTab: Table,
        leftKeysTab: Table,
        compiledCondition: CompiledExpression): GatherMap = {
      withResource(GpuColumnVector.from(leftColumnarBatch))(leftTab =>
        Table.mixedLeftSemiJoinGatherMap(
          leftKeysTab,
          rightKeysTab,
          leftTab,
          rightTab,
          compiledCondition,
          if (compareNullsEqual) NullEquality.EQUAL else NullEquality.UNEQUAL))
    }

    // map of the stream-side (left) row indices that have at least one match on the build side
    private def existsScatterMap(leftColumnarBatch: ColumnarBatch): GatherMap = {
      withResource(leftKeysTable(leftColumnarBatch))(leftKeysTab =>
        compiledConditionRes.map { case (rightTab, compiledCondition) =>
          conditionalBatchLeftSemiJoin(leftColumnarBatch, rightTab, leftKeysTab, compiledCondition)
        }.getOrElse {
          leftKeysTab.leftSemiJoinGatherMap(rightKeysTab, compareNullsEqual)
        })
    }

    private def falseColumnTable(numLeftRows: Int): Table = {
      withResource(ColumnVector.fromScalar(falseScalar, numLeftRows))(
        new Table(_)
      )
    }

    // single-column table: false everywhere, then true scattered into the matched row indices
    private def existsTable(leftColumnarBatch: ColumnarBatch): Table = {
      withResource(existsScatterMap(leftColumnarBatch)) { existsScatterMap =>
        val numLeftRows = leftColumnarBatch.numRows
        withResource(falseColumnTable(numLeftRows)) { allFalseTable =>
          val numExistsTrueRows = existsScatterMap.getRowCount.toInt
          withResource(existsScatterMap.toColumnView(0, numExistsTrueRows)) { existsView =>
            Table.scatter(Array(trueScalar), existsView, allFalseTable, false)
          }
        }
      }
    }

    private def existenceJoinNextBatch(leftColumnarBatch: ColumnarBatch): ColumnarBatch = {
      // left columns with exists
      withResource(existsTable(leftColumnarBatch)) { existsTable =>
        val resCols = GpuColumnVector.extractBases(leftColumnarBatch) :+ existsTable.getColumn(0)
        val resTypes = GpuColumnVector.extractTypes(leftColumnarBatch) :+ BooleanType
        withResource(new Table(resCols: _*))(resTab => GpuColumnVector.from(resTab, resTypes))
      }
    }
  }

  private def hashJoinLikeIterator(
      builtBatch: ColumnarBatch,
      stream: Iterator[ColumnarBatch],
      targetSize: Long,
@@ -657,7 +770,7 @@

    // The HashJoinIterator takes ownership of the built keys and built data. It will close
    // them when it is done
-    val joinIterator = if (boundCondition.isDefined) {
    if (boundCondition.isDefined) {
      // ConditionalHashJoinIterator will close the compiled condition
      val compiledCondition =
        boundCondition.get.convertToAst(numFirstConditionTableColumns).compile()

@@ -669,6 +782,27 @@
        streamedPlan.output, realTarget, joinType, buildSide, compareNullsEqual, spillCallback,
        opTime, joinTime)
    }
  }

  def doJoin(
      builtBatch: ColumnarBatch,
      stream: Iterator[ColumnarBatch],
      targetSize: Long,
      spillCallback: SpillCallback,
      numOutputRows: GpuMetric,
      joinOutputRows: GpuMetric,
      numOutputBatches: GpuMetric,
      opTime: GpuMetric,
      joinTime: GpuMetric): Iterator[ColumnarBatch] = {

    // The HashJoinIterator takes ownership of the built keys and built data. It will close
    // them when it is done
    val joinIterator = if (joinType.isInstanceOf[ExistenceJoin]) {
      existenceJoinIterator(builtBatch, stream)
    } else {
      hashJoinLikeIterator(builtBatch, stream, targetSize, spillCallback, numOutputRows,
        joinOutputRows, numOutputBatches, opTime, joinTime)
    }

    joinIterator.map { cb =>
      joinOutputRows += cb.numRows()
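Taken together, the per-batch `exists` computation in the unconditional case reduces to three cudf calls. A condensed sketch using only the API already visited in this diff, assuming `leftKeysTab`, `rightKeysTab`, `numLeftRows`, and `compareNullsEqual` as defined in the iterator above, with resource management and the AST-conditioned `mixedLeftSemiJoinGatherMap` variant elided:

```scala
import ai.rapids.cudf.{ColumnVector, GatherMap, Scalar, Table}

// 1) Left semi join of stream-side (left) keys against build-side (right) keys:
//    the gather map lists the left row indices that found at least one match,
//    which is why rhs duplicates cannot inflate the output row count.
val matches: GatherMap = leftKeysTab.leftSemiJoinGatherMap(rightKeysTab, compareNullsEqual)

// 2) An all-false boolean column, one entry per left row.
val allFalse: Table = withResource(Scalar.fromBool(false)) { f =>
  withResource(ColumnVector.fromScalar(f, numLeftRows))(new Table(_))
}

// 3) Scatter `true` into the matched positions; the result becomes the `exists`
//    column that existenceJoinNextBatch appends to the left columns.
val exists: Table = withResource(Scalar.fromBool(true)) { t =>
  Table.scatter(Array(t), matches.toColumnView(0, matches.getRowCount.toInt), allFalse, false)
}
```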