Dynamically load Hive and Avro using reflection to avoid a potential class-not-found exception [databricks] #5723

Merged: 7 commits, Jun 24, 2022
2 changes: 2 additions & 0 deletions dist/unshimmed-from-each-spark3xx.txt
@@ -1,3 +1,5 @@
com/nvidia/spark/rapids/*/RapidsShuffleManager*
+com/nvidia/spark/rapids/AvroProvider.class
+com/nvidia/spark/rapids/HiveProvider.class
org/apache/spark/sql/rapids/shims/*/ProxyRapidsShuffleInternalManager*
spark-*-info.properties
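
For context, each entry in unshimmed-from-each-spark3xx.txt is a classpath glob; the dist build keeps matching classes in the unshimmed, always-visible portion of the jar, which is what lets the two provider traits added here load under any Spark shim (this reading of the file's purpose is an inference; the build logic is not part of this diff). A minimal sketch of the glob semantics:

import java.nio.file.{FileSystems, Paths}

// In glob syntax '*' does not cross '/' boundaries, so the pattern below pins
// exactly one shim directory segment between the package and the class name.
val matcher = FileSystems.getDefault
  .getPathMatcher("glob:com/nvidia/spark/rapids/*/RapidsShuffleManager*")

// true for any single shim segment such as "spark320"
println(matcher.matches(
  Paths.get("com/nvidia/spark/rapids/spark320/RapidsShuffleManager.class")))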
com/nvidia/spark/rapids/AvroProvider.scala (new file)
@@ -0,0 +1,52 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.connector.read.{PartitionReaderFactory, Scan}
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.datasources.FileFormat
import org.apache.spark.sql.rapids.GpuFileSourceScanExec
import org.apache.spark.sql.sources.Filter
import org.apache.spark.util.SerializableConfiguration

trait AvroProvider {
/** Returns true if the given file format is supported as an external source. */
def isSupportedFormat(format: FileFormat): Boolean

def isPerFileReadEnabledForFormat(format: FileFormat, conf: RapidsConf): Boolean

def tagSupportForGpuFileSourceScan(meta: SparkPlanMeta[FileSourceScanExec]): Unit

/**
* Get a read file format for the input format.
* Callers should first check that the format is supported by calling 'isSupportedFormat'.
*/
def getReadFileFormat(format: FileFormat): FileFormat

/**
* Create a multi-file reader factory for the input format.
* Callers should first check that the format is supported by calling 'isSupportedFormat'.
*/
def createMultiFileReaderFactory(
format: FileFormat,
broadcastedConf: Broadcast[SerializableConfiguration],
pushedFilters: Array[Filter],
fileScan: GpuFileSourceScanExec): PartitionReaderFactory

def getScans: Map[Class[_ <: Scan], ScanRule[_ <: Scan]]
}
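
A minimal usage sketch for this trait, assuming a provider obtained through the ShimLoader.newAvroProvider factory added later in this PR; the wrapper function below is hypothetical:

import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.datasources.FileFormat

// Hypothetical caller: probe the format first, as the scaladoc above advises,
// before asking the provider for a GPU-side read format.
def replacementReadFormat(scan: FileSourceScanExec): Option[FileFormat] = {
  val provider: AvroProvider = ShimLoader.newAvroProvider()
  val format = scan.relation.fileFormat
  if (provider.isSupportedFormat(format)) {
    Some(provider.getReadFileFormat(format))
  } else {
    None
  }
}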
com/nvidia/spark/rapids/HiveProvider.scala (new file)
@@ -0,0 +1,28 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids

import org.apache.spark.sql.catalyst.expressions.Expression

/**
* Subclasses of HiveProvider import spark-hive classes. This file must not import
* spark-hive classes itself, because a `ClassNotFoundException` may be thrown if
* spark-hive is missing at runtime. For details see:
* https://github.com/NVIDIA/spark-rapids/issues/5648
*/
trait HiveProvider {
def getExprs: Map[Class[_ <: Expression], ExprRule[_ <: Expression]]
}
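
What this indirection buys, as a minimal sketch; newInstanceOf below is a simplified stand-in for ShimLoader's helper, not its actual implementation:

// Resolving the implementation by name keeps this trait's classfile free of
// any compile-time reference to spark-hive.
def newInstanceOf[T](className: String): T =
  Class.forName(className).getDeclaredConstructor().newInstance().asInstanceOf[T]

// Only call after confirming spark-hive is on the classpath; if it is not,
// the failure is a contained ClassNotFoundException here rather than a
// linkage error while loading unrelated plugin classes.
val provider: HiveProvider =
  newInstanceOf[HiveProvider]("org.apache.spark.sql.hive.rapids.HiveProviderImpl")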
com/nvidia/spark/rapids/ShimLoader.scala
@@ -445,4 +445,12 @@ object ShimLoader extends Logging {
def newExplainPlan(): ExplainPlanBase = {
newInstanceOf[ExplainPlanBase]("com.nvidia.spark.rapids.ExplainPlanImpl")
}

+def newHiveProvider(): HiveProvider = {
+  newInstanceOf[HiveProvider]("org.apache.spark.sql.hive.rapids.HiveProviderImpl")
+}
+
+def newAvroProvider(): AvroProvider = ShimLoader.newInstanceOf[AvroProvider](
+  "org.apache.spark.sql.rapids.AvroProviderImpl")

}
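
A usage sketch for the two new factories; the guard mirrors the GpuHiveOverrides change below, and the merge site for the Avro scan rules is outside this diff:

// Resolve HiveProviderImpl only when spark-hive is actually present.
val hiveExprs =
  if (GpuHiveOverrides.isSparkHiveAvailable) ShimLoader.newHiveProvider().getExprs
  else Map.empty

// The Avro provider is resolved the same way; its scan rules are merged into
// the plugin's rule table elsewhere in the codebase.
val avroScans = ShimLoader.newAvroProvider().getScans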
org/apache/spark/sql/hive/rapids/GpuHiveOverrides.scala
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2021, NVIDIA CORPORATION.
+* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,12 +16,9 @@

package org.apache.spark.sql.hive.rapids

-import com.nvidia.spark.RapidsUDF
-import com.nvidia.spark.rapids.{ExprChecks, ExprMeta, ExprRule, GpuExpression, GpuOverrides, RapidsConf, RepeatingParamCheck, ShimLoader, TypeSig}
-import com.nvidia.spark.rapids.GpuUserDefinedFunction.udfTypeSig
+import com.nvidia.spark.rapids.{ExprRule, ShimLoader}

import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.hive.{HiveGenericUDF, HiveSimpleUDF}

object GpuHiveOverrides {
def isSparkHiveAvailable: Boolean = {
@@ -39,94 +36,10 @@ object GpuHiveOverrides {
* mapping if spark-hive is unavailable.
*/
def exprs: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = {
-if (!isSparkHiveAvailable) {
-  return Map.empty
+if (isSparkHiveAvailable) {
+  ShimLoader.newHiveProvider().getExprs
+} else {
+  Map.empty
}
[remaining removed lines collapsed: the Seq(...) UDF rule table deleted here is
identical, line for line, to the one added in HiveProviderImpl.getExprs below]
}
}
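
The body of isSparkHiveAvailable is collapsed in this diff view; a classpath probe of roughly this shape is the usual pattern (an illustrative sketch, not the actual body):

import scala.util.Try

def isSparkHiveAvailable: Boolean =
  Try(Class.forName("org.apache.spark.sql.hive.HiveSessionStateBuilder")).isSuccess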
org/apache/spark/sql/hive/rapids/HiveProviderImpl.scala (new file)
@@ -0,0 +1,119 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.hive.rapids

import com.nvidia.spark.RapidsUDF
import com.nvidia.spark.rapids.{ExprChecks, ExprMeta, ExprRule, GpuExpression, GpuOverrides, HiveProvider, RapidsConf, RepeatingParamCheck, TypeSig}
import com.nvidia.spark.rapids.GpuUserDefinedFunction.udfTypeSig

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.hive.{HiveGenericUDF, HiveSimpleUDF}

class HiveProviderImpl extends HiveProvider {

/**
* Builds the rules that are specific to spark-hive Catalyst nodes. This class is loaded
* reflectively (see ShimLoader.newHiveProvider) only after the caller has verified that
* spark-hive is available.
*/
override def getExprs: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = {
Seq(
GpuOverrides.expr[HiveSimpleUDF](
"Hive UDF, the UDF can choose to implement a RAPIDS accelerated interface to" +
" get better performance",
ExprChecks.projectOnly(
udfTypeSig,
TypeSig.all,
repeatingParamCheck = Some(RepeatingParamCheck("param", udfTypeSig, TypeSig.all))),
(a, conf, p, r) => new ExprMeta[HiveSimpleUDF](a, conf, p, r) {
private val opRapidsFunc = a.function match {
case rapidsUDF: RapidsUDF => Some(rapidsUDF)
case _ => None
}

override def tagExprForGpu(): Unit = {
if (opRapidsFunc.isEmpty && !conf.isCpuBasedUDFEnabled) {
willNotWorkOnGpu(s"Hive SimpleUDF ${a.name} implemented by " +
s"${a.funcWrapper.functionClassName} does not provide a GPU implementation " +
s"and CPU-based UDFs are not enabled by `${RapidsConf.ENABLE_CPU_BASED_UDF.key}`")
}
}

override def convertToGpu(): GpuExpression = {
opRapidsFunc.map { _ =>
// We use the original HiveGenericUDF `deterministic` method as a proxy
// for simplicity.
GpuHiveSimpleUDF(
a.name,
a.funcWrapper,
childExprs.map(_.convertToGpu()),
a.dataType,
a.deterministic)
}.getOrElse {
// This `require` is just a sanity check.
require(conf.isCpuBasedUDFEnabled)
GpuRowBasedHiveSimpleUDF(
a.name,
a.funcWrapper,
childExprs.map(_.convertToGpu()))
}
}
}),
GpuOverrides.expr[HiveGenericUDF](
"Hive Generic UDF, the UDF can choose to implement a RAPIDS accelerated interface to" +
" get better performance",
ExprChecks.projectOnly(
udfTypeSig,
TypeSig.all,
repeatingParamCheck = Some(RepeatingParamCheck("param", udfTypeSig, TypeSig.all))),
(a, conf, p, r) => new ExprMeta[HiveGenericUDF](a, conf, p, r) {
private val opRapidsFunc = a.function match {
case rapidsUDF: RapidsUDF => Some(rapidsUDF)
case _ => None
}

override def tagExprForGpu(): Unit = {
if (opRapidsFunc.isEmpty && !conf.isCpuBasedUDFEnabled) {
willNotWorkOnGpu(s"Hive GenericUDF ${a.name} implemented by " +
s"${a.funcWrapper.functionClassName} does not provide a GPU implementation " +
s"and CPU-based UDFs are not enabled by `${RapidsConf.ENABLE_CPU_BASED_UDF.key}`")
}
}

override def convertToGpu(): GpuExpression = {
opRapidsFunc.map { _ =>
// We use the original HiveGenericUDF `deterministic` method as a proxy
// for simplicity.
GpuHiveGenericUDF(
a.name,
a.funcWrapper,
childExprs.map(_.convertToGpu()),
a.dataType,
a.deterministic,
a.foldable)
}.getOrElse {
// This `require` is just a sanity check.
require(conf.isCpuBasedUDFEnabled)
GpuRowBasedHiveGenericUDF(
a.name,
a.funcWrapper,
childExprs.map(_.convertToGpu()))
}
}
})
).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap
}
}
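
Restating the dispatch that both rules above implement, as a standalone sketch with stand-in names:

import com.nvidia.spark.RapidsUDF

// A Hive UDF is replaced by a columnar GPU expression only when its function
// object implements RapidsUDF; otherwise it can still run row-by-row on the
// GPU when the CPU-based-UDF fallback is enabled in RapidsConf.
def canReplaceOnGpu(function: AnyRef, cpuBasedUdfEnabled: Boolean): Boolean =
  function match {
    case _: RapidsUDF => true
    case _            => cpuBasedUdfEnabled
  }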