Skip to content

Commit

Permalink
Sanity checks for cudf jar mismatch (NVIDIA#1047)
Browse files Browse the repository at this point in the history
* Sanity checks for cudf jar mismatch

Signed-off-by: Niranjan Artal <nartal@nvidia.com>

* addressed review comments

Signed-off-by: Niranjan Artal <nartal@nvidia.com>

* addressed review comments

Signed-off-by: Niranjan Artal <nartal@nvidia.com>

* log warnings if the config is set but versions mismatch

Signed-off-by: Niranjan Artal <nartal@nvidia.com>

* addressed review comments

Signed-off-by: Niranjan Artal <nartal@nvidia.com>

* addressed review comments

Signed-off-by: Niranjan Artal <nartal@nvidia.com>

* refactored code and addressed review comments

Signed-off-by: Niranjan Artal <nartal@nvidia.com>

* remove unwanted comment

Signed-off-by: Niranjan Artal <nartal@nvidia.com>

* addressed review comments

Signed-off-by: Niranjan Artal <nartal@nvidia.com>
  • Loading branch information
nartal1 authored Nov 4, 2020
1 parent 0f1f589 commit 7af65a0
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 1 deletion.
3 changes: 2 additions & 1 deletion build/build-info
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@

# Prints build metadata as key=value lines, one property per line, on stdout.
#   $1 - version recorded under the `version` key
#   $2 - cudf version recorded under the `cudf_version` key
# The remaining values come from $USER, git and date. Every expansion is quoted so a
# value containing whitespace or glob characters is emitted verbatim instead of being
# word-split or pathname-expanded before echo sees it.
echo_build_properties() {
  echo "version=$1"
  echo "cudf_version=$2"
  echo "user=$USER"
  echo "revision=$(git rev-parse HEAD)"
  echo "branch=$(git rev-parse --abbrev-ref HEAD)"
  echo "date=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo "url=$(git config --get remote.origin.url)"
}

# NOTE(review): these two calls look like the removed/added pair of a diff hunk rendered
# without +/- markers; only the two-argument form supplies the cudf_version value that
# echo_build_properties prints from $2 -- confirm against the actual script.
echo_build_properties $1
echo_build_properties $1 $2
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,7 @@
<!-- Runs the build/build-info script through bash, passing the project version and the
     cudf version as its two arguments; the output is written to
     extra-resources/rapids4spark-version-info.properties under the build directory. -->
<exec executable="bash" output="${project.build.directory}/extra-resources/rapids4spark-version-info.properties">
<arg value="${project.basedir}/../build/build-info"/>
<arg value="${project.version}"/>
<arg value="${cudf.version}"/>
</exec>
</target>
</configuration>
Expand Down
56 changes: 56 additions & 0 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.nvidia.spark.rapids

import java.util
import java.util.Properties
import java.util.concurrent.atomic.{AtomicBoolean, AtomicReference}

import scala.collection.JavaConverters._
Expand All @@ -35,6 +36,7 @@ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
import org.apache.spark.sql.internal.StaticSQLConf
import org.apache.spark.sql.util.QueryExecutionListener


case class ColumnarOverrideRules() extends ColumnarRule with Logging {
val overrides: Rule[SparkPlan] = GpuOverrides()
val overrideTransitions: Rule[SparkPlan] = new GpuTransitionOverrides()
Expand Down Expand Up @@ -128,6 +130,11 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
ShimLoader.setSparkShimProviderClass(conf.shimsProviderOverride.get)
}

// Check that the cudf version found on the classpath matches the version this plugin
// expects, and throw an error on a mismatch. Setting spark.rapids.cudfVersionOverride=true
// downgrades that error to a logged warning.
checkCudfVersion(conf)

// we rely on the Rapids Plugin being run with 1 GPU per executor so we can initialize
// on executor startup.
if (!GpuDeviceManager.rmmTaskInitEnabled) {
Expand All @@ -146,6 +153,55 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
}
}

/**
 * Verifies that the cudf version found on the classpath matches the cudf version recorded
 * in the plugin's own build properties.
 *
 * The classpath version is read from the `version` key of
 * `cudf-java-version-info.properties`; the expected version is read from the `cudf_version`
 * key of `rapids4spark-version-info.properties`. A missing file, a missing key, or unequal
 * versions raise a [[CudfVersionMismatchException]].
 *
 * @param conf plugin configuration; when `conf.cudfVersionOverride` is true the exception
 *             is caught and its message is logged as a warning instead of propagating
 * @throws CudfVersionMismatchException if verification fails and the override is not set
 */
private def checkCudfVersion(conf: RapidsConf): Unit = {
  try {
    val cudfPropertiesFileName = "cudf-java-version-info.properties"
    val pluginPropertiesFileName = "rapids4spark-version-info.properties"

    // Each file is loaded into its own Properties instance so that a key present in one
    // file can never satisfy a lookup intended for the other.
    val cudfProps = loadVersionProperties(cudfPropertiesFileName, "the cudf jar")
    val cudfVersion = requiredVersionProperty(cudfProps, "version", cudfPropertiesFileName)

    val pluginProps =
      loadVersionProperties(pluginPropertiesFileName, "the RAPIDS Accelerator jar")
    val expectedCudfVersion =
      requiredVersionProperty(pluginProps, "cudf_version", pluginPropertiesFileName)

    // compare cudf version in the classpath with the cudf version expected by plugin
    if (cudfVersion != expectedCudfVersion) {
      throw CudfVersionMismatchException(s"Cudf version in the classpath is different. " +
        s"Found $cudfVersion, RAPIDS Accelerator expects $expectedCudfVersion")
    }
  } catch {
    // Only the version-check failure is downgraded; any other exception still propagates.
    case x: CudfVersionMismatchException if conf.cudfVersionOverride =>
      logWarning(s"${x.errorMsg}")
  }
}

/**
 * Loads a properties resource through this class's class loader and closes the stream
 * afterwards, whether or not loading succeeds.
 */
private def loadVersionProperties(fileName: String, jarDescription: String): Properties = {
  val stream = classOf[RapidsExecutorPlugin].getClassLoader.getResourceAsStream(fileName)
  if (stream == null) {
    throw CudfVersionMismatchException(s"Could not find properties file " +
      s"$fileName in $jarDescription. Cannot verify cudf " +
      s"version compatibility with RAPIDS Accelerator version.")
  }
  try {
    val loaded = new Properties
    loaded.load(stream)
    loaded
  } finally {
    stream.close()
  }
}

/** Returns the value stored under `key`, or fails the version check if the key is absent. */
private def requiredVersionProperty(
    props: Properties,
    key: String,
    fileName: String): String = {
  Option(props.getProperty(key)).getOrElse {
    throw CudfVersionMismatchException(s"Property name `$key` not found in " +
      s"$fileName file.")
  }
}

/**
 * Thrown by the cudf version check when the cudf version on the classpath cannot be
 * verified or differs from the version the plugin expects.
 *
 * @param errorMsg description of the failure; also used as the exception message
 */
case class CudfVersionMismatchException(
    errorMsg: String)
  extends RuntimeException(errorMsg)

override def shutdown(): Unit = {
GpuSemaphore.shutdown()
PythonWorkerSemaphore.shutdown()
Expand Down
10 changes: 10 additions & 0 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,14 @@ object RapidsConf {
.stringConf
.createOptional

// Internal boolean config, false by default. RapidsExecutorPlugin.checkCudfVersion reads it
// through RapidsConf.cudfVersionOverride: when true, a cudf version check failure is logged
// as a warning instead of being thrown.
val CUDF_VERSION_OVERRIDE = conf("spark.rapids.cudfVersionOverride")
.internal()
.doc("Overrides the cudf version compatibility check between cudf jar and RAPIDS Accelerator " +
"jar. If you are sure that the cudf jar which is mentioned in the classpath is compatible " +
"with the RAPIDS Accelerator version, then set this to true.")
.booleanConf
.createWithDefault(false)

private def printSectionHeader(category: String): Unit =
println(s"\n### $category")

Expand Down Expand Up @@ -1040,6 +1048,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {

lazy val shimsProviderOverride: Option[String] = get(SHIMS_PROVIDER_OVERRIDE)

// Value of spark.rapids.cudfVersionOverride (CUDF_VERSION_OVERRIDE).
lazy val cudfVersionOverride: Boolean = get(CUDF_VERSION_OVERRIDE)

lazy val getCloudSchemes: Option[Seq[String]] = get(CLOUD_SCHEMES)

def isOperatorEnabled(key: String, incompat: Boolean, isDisabledByDefault: Boolean): Boolean = {
Expand Down

0 comments on commit 7af65a0

Please sign in to comment.