spillable cache for GpuCartesianRDD #1784
The change caches the stream side (`rdd2`) of the cartesian join in a buffer of `SpillableColumnarBatch`es, so every build-side batch after the first reuses the cached data instead of recomputing `rdd2`:

```diff
@@ -18,8 +18,10 @@ package org.apache.spark.sql.rapids
 import java.io.{IOException, ObjectInputStream, ObjectOutputStream}
 
+import scala.collection.mutable
+
 import ai.rapids.cudf.{JCudfSerialization, NvtxColor, NvtxRange, Table}
-import com.nvidia.spark.rapids.{Arm, GpuBindReferences, GpuBuildLeft, GpuColumnVector, GpuExec, GpuExpression, GpuMetric, GpuSemaphore, MetricsLevel}
+import com.nvidia.spark.rapids.{Arm, GpuBindReferences, GpuBuildLeft, GpuColumnVector, GpuExec, GpuExpression, GpuMetric, GpuSemaphore, MetricsLevel, SpillableColumnarBatch, SpillPriorities}
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
 
 import org.apache.spark.{Dependency, NarrowDependency, Partition, SparkContext, TaskContext}
@@ -141,15 +143,32 @@ class GpuCartesianRDD(
   override def compute(split: Partition, context: TaskContext):
   Iterator[ColumnarBatch] = {
     val currSplit = split.asInstanceOf[GpuCartesianPartition]
-    rdd1.iterator(currSplit.s1, context).flatMap { lhs =>
+
+    // create a buffer to cache stream-side data in a spillable manner
+    val spillBatchBuffer = mutable.ArrayBuffer[SpillableColumnarBatch]()
+
+    rdd1.iterator(currSplit.s1, context).zipWithIndex.flatMap { case (lhs, index) =>
       val table = withResource(lhs) { lhs =>
         GpuColumnVector.from(lhs.getBatch)
       }
-      // Ideally instead of looping through and recomputing rdd2 for
-      // each batch in rdd1 we would instead cache rdd2 in a way that
-      // it could spill to disk so we can avoid re-computation
+
+      val streamIterator = if (index == 0) {
+        // lazily compute and cache stream-side data
+        rdd2.iterator(currSplit.s2, context).map { serializableBatch =>
+          closeOnExcept(spillBatchBuffer) { buffer =>
+            val batch = SpillableColumnarBatch(
+              serializableBatch.getBatch, SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
+            buffer += batch
+            batch.getColumnarBatch()
+          }
+        }
+      } else {
+        // fetch stream-side data directly if they are cached
+        spillBatchBuffer.toIterator.map(_.getColumnarBatch())
+      }
+
       val ret = GpuBroadcastNestedLoopJoinExecBase.innerLikeJoin(
-        rdd2.iterator(currSplit.s2, context).map(i => i.getBatch),
+        streamIterator,
         table,
         GpuBuildLeft,
         boundCondition,
@@ -161,7 +180,10 @@ class GpuCartesianRDD(
         filterTime,
         totalTime)
 
-      CompletionIterator[ColumnarBatch, Iterator[ColumnarBatch]](ret, table.close())
+      CompletionIterator[ColumnarBatch, Iterator[ColumnarBatch]](ret, {
+        table.close()
+        spillBatchBuffer.safeClose()
+      })
     }
   }
```

Review comments on the `SpillableColumnarBatch(...)` call:

- This is now missing the spillable callback argument that was added in #1719, which should be used to tie any spilling to the spill metrics added to this exec node.
- In the future a new PR isn't necessary when rebasing. You just need to retarget the PR base branch, as was already done for this PR, and then merge in the new base branch.
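The first comment concerns wiring spill events into the exec node's metrics. Below is a minimal, self-contained sketch of that pattern, assuming a callback-at-construction design; `SpillCallback`, `SpillableBatch`, and `spillBytesMetric` are illustrative stand-ins for this sketch, not the actual spark-rapids API added in #1719.

```scala
import java.util.concurrent.atomic.AtomicLong

// Illustrative stand-in for a spill callback: invoked whenever a cached
// batch is spilled, with the number of bytes that moved.
trait SpillCallback {
  def apply(bytesSpilled: Long): Unit
}

// Illustrative stand-in for a spillable wrapper that reports its spills.
final class SpillableBatch(sizeInBytes: Long, onSpill: SpillCallback) {
  // Real code would also move the underlying buffer out of GPU memory here.
  def spill(): Unit = onSpill(sizeInBytes)
}

object SpillMetricsExample extends App {
  // Stand-in for the exec node's "spill bytes" metric.
  val spillBytesMetric = new AtomicLong(0)
  val callback = new SpillCallback {
    def apply(bytesSpilled: Long): Unit = spillBytesMetric.addAndGet(bytesSpilled)
  }

  // Passing the callback at construction time is what ties any later spill
  // of this batch back to the exec node's metrics.
  val batch = new SpillableBatch(sizeInBytes = 1024, onSpill = callback)
  batch.spill()
  println(s"spilled bytes recorded: ${spillBytesMetric.get}") // prints 1024
}
```

The design point is that the wrapper, not the caller, decides when a spill happens, so the only way for the exec node to account for spilled bytes is to hand the wrapper a callback when the batch is made spillable.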
If we are doing this right we need to make
lhs
spillable too. Because we are going to do the join, and return multiple values while holding on to it. This means we will likely have to modify BroadcastNestedLoopJoinExecBase as well to be able to deal with this. We might need to make it a functor or something like that so we can keep the old behavior for broadcast nested loop join until we can update the broadcast tables to also be spillable, which is coming.