Fix auto merge conflict 5752 #5754

Merged (2 commits, Jun 6, 2022)
114 changes: 57 additions & 57 deletions scripts/rundiffspark2.sh

Large diffs are not rendered by default.

8 changes: 0 additions & 8 deletions scripts/spark2diffs/ArrowEvalPythonExec.diff

This file was deleted.

2 changes: 1 addition & 1 deletion scripts/spark2diffs/CastExprMeta.diff
@@ -10,7 +10,7 @@
< // 2.x doesn't have the SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING config, so set it to true
< val legacyCastToString: Boolean = true
---
> val legacyCastToString: Boolean = SparkShimImpl.getLegacyComplexTypeToString()
> val legacyCastToString: Boolean = SQLConf.get.getConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING)
46c45
< if (dt.precision > GpuOverrides.DECIMAL128_MAX_PRECISION) {
---
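
Context for the legacyCastToString hunk above: Spark 2.x has no SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING entry, so the explain-only copy pins the legacy default while the 3.x code reads the config directly. A minimal sketch of both sides; only the getConf call is taken from the diff, the enclosing object is scaffolding:

import org.apache.spark.sql.internal.SQLConf

object CastBehaviorSketch {
  // Spark 3.x: read spark.sql.legacy.castComplexTypesToString.enabled
  // from the active session's SQLConf.
  def legacyCastToString3x: Boolean =
    SQLConf.get.getConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING)

  // Spark 2.x shim equivalent: the config entry does not exist there,
  // so the value is hardcoded to the legacy default.
  def legacyCastToString2x: Boolean = true
}
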
32 changes: 10 additions & 22 deletions scripts/spark2diffs/GpuCSVScan.diff
@@ -1,13 +1,4 @@
27,34c27,34
< def dateFormatInRead(csvOpts: CSVOptions): Option[String] = {
< // spark 2.x uses FastDateFormat, use getPattern
< Option(csvOpts.dateFormat.getPattern)
< }
<
< def timestampFormatInRead(csvOpts: CSVOptions): Option[String] = {
< // spark 2.x uses FastDateFormat, use getPattern
< Option(csvOpts.timestampFormat.getPattern)
---
1a2,11
> def tagSupport(scanMeta: ScanMeta[CSVScan]) : Unit = {
> val scan = scanMeta.wrapped
> tagSupport(
@@ -16,35 +7,32 @@
> scan.readDataSchema,
> scan.options.asScala.toMap,
> scanMeta)
42c42
> }
>
7c17
< meta: RapidsMeta[_, _]): Unit = {
---
> meta: RapidsMeta[_, _, _]): Unit = {
67,68d66
32,33d41
< // 2.x only supports delimiter as char
< /*
72d69
37d44
< */
74,75c71
39,40c46
< // delimiter is char in 2.x
< if (parsedOptions.delimiter > 127) {
---
> if (parsedOptions.delimiter.codePointAt(0) > 127) {
105,109d100
70,74d75
< // 2.x doesn't have linSeparator config
< // CSV text with '\n', '\r' and '\r\n' as line separators.
< // Since I have no way to check in 2.x we will just assume it works for explain until
< // they move to 3.x
< /*
113d103
78d78
< */
154,156c144
<
119,120c119
< // Spark 2.x doesn't have zoneId, so use timeZone and then to id
< if (!TypeChecks.areTimestampsSupported(parsedOptions.timeZone.toZoneId)) {
---
> if (!TypeChecks.areTimestampsSupported(parsedOptions.zoneId)) {
159c147
< timestampFormatInRead(parsedOptions).foreach { tsFormat =>
---
> FileOptionsShims.timestampFormatInRead(parsedOptions).foreach { tsFormat =>
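
One recurring 2.x/3.x mismatch in the CSV scan diff is a plain type difference. A hedged sketch of the delimiter check, using standalone stand-ins rather than the plugin's classes:

object DelimiterCheckSketch {
  // Spark 2.x parses the CSV delimiter to a Char; Spark 3.x keeps the
  // String, so the same non-ASCII guard is spelled two ways.
  def unsupported2x(delimiter: Char): Boolean =
    delimiter > 127                      // Char widens to Int directly

  def unsupported3x(delimiter: String): Boolean =
    delimiter.codePointAt(0) > 127       // compare the first code point
}
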
4 changes: 3 additions & 1 deletion scripts/spark2diffs/GpuCsvUtils.diff
@@ -1,5 +1,7 @@
2,3c2
2,4c2,3
< // spark 2.x uses FastDateFormat, use getPattern
< def dateFormatInRead(options: CSVOptions): String = options.dateFormat.getPattern
< def timestampFormatInRead(options: CSVOptions): String = options.timestampFormat.getPattern
---
> def dateFormatInRead(options: CSVOptions): String = options.dateFormat
> def timestampFormatInRead(options: CSVOptions): String = options.timestampFormat
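
This change (and the identical GpuJsonUtils.diff below) exists because Spark 2.x stores the CSV date and timestamp formats as commons-lang3 FastDateFormat objects, while Spark 3.x stores the raw pattern String. A small sketch of the 2.x-side extraction, assuming commons-lang3 on the classpath (Spark ships it):

object FormatPatternSketch {
  import org.apache.commons.lang3.time.FastDateFormat

  // Spark 2.x: CSVOptions.dateFormat is a FastDateFormat, so the shim
  // recovers the raw pattern via getPattern.
  val fmt2x: FastDateFormat = FastDateFormat.getInstance("yyyy-MM-dd")
  val pattern2x: String = fmt2x.getPattern   // "yyyy-MM-dd"

  // Spark 3.x: dateFormat is already the String, returned unchanged.
  val pattern3x: String = "yyyy-MM-dd"
}
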
20 changes: 6 additions & 14 deletions scripts/spark2diffs/GpuFileSourceScanExec.diff
@@ -1,16 +1,8 @@
8,10c8,18
< // SPARK 2.x - We leave off Avro here since its a datasource v2 thing and off by default
< case f =>
< meta.willNotWorkOnGpu(s"unsupported file format: ${f.getClass.getCanonicalName}")
8,9c8,9
< // SPARK 2.x - We leave off Avro and external sources here since its a datasource v2
< // thing and off by default
---
> case _ => ExternalSource.tagSupportForGpuFileSourceScanExec(meta)
> }
> }
> case ef if ExternalSource.isSupportedFormat(ef) =>
> ExternalSource.tagSupportForGpuFileSourceScan(meta)
13a14
>
> def convertFileFormat(format: FileFormat): FileFormat = {
> format match {
> case _: CSVFileFormat => new GpuReadCSVFileFormat
> case f if GpuOrcFileFormat.isSparkOrcFormat(f) => new GpuReadOrcFileFormat
> case _: ParquetFileFormat => new GpuReadParquetFileFormat
> case _: JsonFileFormat => new GpuReadJsonFileFormat
> case _ => ExternalSource.convertFileFormatForGpuFileSourceScanExec(format)
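
The replacement hunk above routes unrecognized formats through a pattern guard (case ef if ExternalSource.isSupportedFormat(ef)) instead of a catch-all willNotWorkOnGpu. A self-contained sketch of that dispatch shape; every type and name here is a toy stand-in for the plugin's real ones:

sealed trait FileFormatSketch
case object CsvFormat extends FileFormatSketch
case object AvroFormat extends FileFormatSketch
case object OtherFormat extends FileFormatSketch

object ExternalSourceSketch {
  // Stand-in for ExternalSource.isSupportedFormat.
  def isSupportedFormat(f: FileFormatSketch): Boolean = f == AvroFormat
}

object ScanTaggingSketch {
  def tagFormat(format: FileFormatSketch): String = format match {
    case CsvFormat                                        => "handled inline"
    case ef if ExternalSourceSketch.isSupportedFormat(ef) => "delegated to external source"
    case other                                            => s"unsupported: $other"
  }
}
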
6 changes: 0 additions & 6 deletions scripts/spark2diffs/GpuGetArrayItemMeta.diff

This file was deleted.

6 changes: 0 additions & 6 deletions scripts/spark2diffs/GpuGetMapValueMeta.diff

This file was deleted.

22 changes: 3 additions & 19 deletions scripts/spark2diffs/GpuJoinUtils.diff
@@ -1,22 +1,6 @@
16,18d15
< package com.nvidia.spark.rapids.shims
<
< import com.nvidia.spark.rapids.shims._
20,26c17
15a16
>
18c19
< import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide}
<
< /**
< * Spark BuildSide, BuildRight, BuildLeft moved packages in Spark 3.1
< * so create GPU versions of these that can be agnostic to Spark version.
< */
< sealed abstract class GpuBuildSide
---
> package com.nvidia.spark.rapids.shims
28c19
< case object GpuBuildRight extends GpuBuildSide
---
> import com.nvidia.spark.rapids.{GpuBuildLeft, GpuBuildRight, GpuBuildSide}
30c21
< case object GpuBuildLeft extends GpuBuildSide
---
> import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide}
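
The removed lines above contain the version-agnostic build-side ADT verbatim; after this change the 2.x shim imports it from com.nvidia.spark.rapids instead of re-declaring it. For reference, the ADT itself (copied from the deleted lines) is just:

// Spark's BuildLeft/BuildRight/BuildSide moved packages in Spark 3.1, so
// the plugin defines GPU-side equivalents once, agnostic to Spark version.
sealed abstract class GpuBuildSide
case object GpuBuildRight extends GpuBuildSide
case object GpuBuildLeft extends GpuBuildSide
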
27 changes: 5 additions & 22 deletions scripts/spark2diffs/GpuJsonScan.diff
@@ -1,15 +1,4 @@
3,12d2
< // spark 2.x uses FastDateFormat, use getPattern
< def dateFormatInRead(options: JSONOptions): String = options.dateFormat.getPattern
<
< def timestampFormatInRead(fileOptions: Serializable): Option[String] = {
< fileOptions match {
< case jsonOpts: JSONOptions => Option(jsonOpts.timestampFormat.getPattern)
< case _ => throw new RuntimeException("Wrong file options.")
< }
< }
<
37a28,37
2a3,12
> def tagSupport(scanMeta: ScanMeta[JsonScan]) : Unit = {
> val scan = scanMeta.wrapped
> tagSupport(
@@ -20,20 +9,14 @@
> scanMeta)
> }
>
43c43
8c18
< meta: RapidsMeta[_, _]): Unit = {
---
> meta: RapidsMeta[_, _, _]): Unit = {
106c106
< dateFormatInRead(parsedOptions), parseString = true)
---
> GpuJsonUtils.dateFormatInRead(parsedOptions), parseString = true)
110,111c110
75,77c85,86
< if (types.contains(TimestampType)) {
< // Spark 2.x doesn't have zoneId, so use timeZone and then to id
< if (!TypeChecks.areTimestampsSupported(parsedOptions.timeZone.toZoneId)) {
---
> if (types.contains(TimestampType)) {
> if (!TypeChecks.areTimestampsSupported(parsedOptions.zoneId)) {
114c113
< timestampFormatInRead(parsedOptions).foreach { tsFormat =>
---
> FileOptionsShims.timestampFormatInRead(parsedOptions).foreach { tsFormat =>
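
As in the CSV scan diff above, the timestamp hunks differ only in where the time zone comes from: 2.x JSONOptions exposes a java.util.TimeZone, 3.x a java.time.ZoneId, hence timeZone.toZoneId on the 2.x side. A minimal sketch; areTimestampsSupported is a stand-in here, and requiring UTC is purely illustrative:

object ZoneCheckSketch {
  import java.time.ZoneId
  import java.util.TimeZone

  // Stand-in for TypeChecks.areTimestampsSupported.
  def areTimestampsSupported(zone: ZoneId): Boolean = zone == ZoneId.of("UTC")

  val ok2x = areTimestampsSupported(TimeZone.getTimeZone("UTC").toZoneId) // 2.x path
  val ok3x = areTimestampsSupported(ZoneId.of("UTC"))                     // 3.x path
}
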
7 changes: 7 additions & 0 deletions scripts/spark2diffs/GpuJsonUtils.diff
@@ -0,0 +1,7 @@
2,4c2,3
< // spark 2.x uses FastDateFormat, use getPattern
< def dateFormatInRead(options: JSONOptions): String = options.dateFormat.getPattern
< def timestampFormatInRead(options: JSONOptions): String = options.timestampFormat.getPattern
---
> def dateFormatInRead(options: JSONOptions): String = options.dateFormat
> def timestampFormatInRead(options: JSONOptions): String = options.timestampFormat
7 changes: 4 additions & 3 deletions scripts/spark2diffs/GpuOrcScanBase.diff
@@ -1,4 +1,6 @@
1a2,10
2c2,9
< // spark 2.x doesn't have data source v2 so not ScanMeta
---
> def tagSupport(scanMeta: ScanMeta[OrcScan]): Unit = {
> val scan = scanMeta.wrapped
> val schema = StructType(scan.readDataSchema ++ scan.readPartitionSchema)
@@ -7,8 +9,7 @@
> }
> tagSupport(scan.sparkSession, schema, scanMeta)
> }
>
5c14
7c14
< meta: RapidsMeta[_, _]): Unit = {
---
> meta: RapidsMeta[_, _, _]): Unit = {
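
The RapidsMeta[_, _] to RapidsMeta[_, _, _] rewrite in this hunk also appears in the CSV and JSON diffs above: the 2.x explain-only copy's RapidsMeta takes two type parameters while the 3.x one takes three, and existential wildcards must match that arity. A toy illustration with a stand-in class, not the plugin's definition:

object MetaAritySketch {
  // Stand-in only; the real RapidsMeta lives in the plugin.
  class RapidsMeta[INPUT, BASE, OUTPUT](val wrapped: INPUT)

  // One underscore per type parameter, so the 3.x signatures read:
  def tagSupport(meta: RapidsMeta[_, _, _]): Unit =
    println(s"tagging ${meta.wrapped}")
}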