From 9afb81b45f4eadc1c3c72d8cd0b68e654697d8c3 Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Thu, 12 Nov 2020 23:01:32 +0800 Subject: [PATCH 1/3] Auto-register UDF extension when main plugin is set Signed-off-by: Allen Xu --- .../scala/com/nvidia/spark/rapids/Plugin.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala index 337d9dc2bb5..06a6aabcd9a 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala @@ -60,6 +60,7 @@ class SQLExecPlugin extends (SparkSessionExtensions => Unit) with Logging { object RapidsPluginUtils extends Logging { private val SQL_PLUGIN_NAME = classOf[SQLExecPlugin].getName + private val UDF_PLUGIN_NAME = "com.nvidia.spark.udf.Plugin" private val SQL_PLUGIN_CONF_KEY = StaticSQLConf.SPARK_SESSION_EXTENSIONS.key private val SERIALIZER_CONF_KEY = "spark.serializer" private val JAVA_SERIALIZER_NAME = classOf[JavaSerializer].getName @@ -70,14 +71,16 @@ object RapidsPluginUtils extends Logging { def fixupConfigs(conf: SparkConf): Unit = { // First add in the SQL executor plugin because that is what we need at a minimum if (conf.contains(SQL_PLUGIN_CONF_KEY)) { - val previousValue = conf.get(SQL_PLUGIN_CONF_KEY).split(",").map(_.trim) - if (!previousValue.contains(SQL_PLUGIN_NAME)) { - conf.set(SQL_PLUGIN_CONF_KEY, (previousValue :+ SQL_PLUGIN_NAME).mkString(",")) - } else { - conf.set(SQL_PLUGIN_CONF_KEY, previousValue.mkString(",")) + for (pluginName <- Array(SQL_PLUGIN_NAME, UDF_PLUGIN_NAME)){ + val previousValue = conf.get(SQL_PLUGIN_CONF_KEY).split(",").map(_.trim) + if (!previousValue.contains(pluginName)) { + conf.set(SQL_PLUGIN_CONF_KEY, (previousValue :+ pluginName).mkString(",")) + } else { + conf.set(SQL_PLUGIN_CONF_KEY, previousValue.mkString(",")) + } } } else { - 
conf.set(SQL_PLUGIN_CONF_KEY, SQL_PLUGIN_NAME) + conf.set(SQL_PLUGIN_CONF_KEY, Array(SQL_PLUGIN_NAME,UDF_PLUGIN_NAME).mkString(",")) } val serializer = conf.get(SERIALIZER_CONF_KEY, JAVA_SERIALIZER_NAME) From 78e7fcc1e8dc94bf8940d53e2a198986ebd32b2f Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Fri, 13 Nov 2020 22:29:11 +0800 Subject: [PATCH 2/3] Update udf-compiler descriptions in related docs Signed-off-by: Allen Xu --- docs/compatibility.md | 4 ++-- udf-compiler/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 80720517904..eae7cd3344a 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -290,10 +290,10 @@ Casting from string to timestamp currently has the following limitations. Only timezone 'Z' (UTC) is supported. Casting unsupported formats will result in null values. ## UDF to Catalyst Expressions -To speedup the process of UDF, spark-rapids introduces a udf-compiler extension to translate UDFs to Catalyst expressions. +To speedup the process of UDF, spark-rapids introduces a udf-compiler extension to translate UDFs to Catalyst expressions. This compiler will be injected automatically to spark extensions by setting `spark.plugins=com.nvidia.spark.SQLPlugin` and is disabled by default. To enable this operation on the GPU, set -[`spark.rapids.sql.udfCompiler.enabled`](configs.md#sql.udfCompiler.enabled) to `true`, and `spark.sql.extensions=com.nvidia.spark.udf.Plugin`. +[`spark.rapids.sql.udfCompiler.enabled`](configs.md#sql.udfCompiler.enabled) to `true`. However, Spark may produce different results for a compiled udf and the non-compiled. 
For example: a udf of `x/y` where `y` happens to be `0`, the compiled catalyst expressions will return `NULL` while the original udf would fail the entire job with a `java.lang.ArithmeticException: / by zero` diff --git a/udf-compiler/README.md b/udf-compiler/README.md index fe76bd273e2..0a8ca4e9ba7 100644 --- a/udf-compiler/README.md +++ b/udf-compiler/README.md @@ -14,6 +14,6 @@ How to run ---------- The UDF compiler is included in the rapids-4-spark jar that is produced by the `dist` maven project. Set up your cluster to run the RAPIDS Accelerator for Apache Spark -and set the spark config `spark.sql.extensions` to include `com.nvidia.spark.udf.Plugin`. +and this udf plugin will be automatically injected to spark extensions when `com.nvidia.spark.SQLPlugin` is set. The plugin is still disabled by default and you will need to set `spark.rapids.sql.udfCompiler.enabled` to `true` to enable it. From c80d1a298018be32e1a2489550716dbf1e7e92b0 Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Fri, 13 Nov 2020 22:40:44 +0800 Subject: [PATCH 3/3] Remove unnecessary lines to make more user-friendly Signed-off-by: Allen Xu --- docs/compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index eae7cd3344a..718be38b14e 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -290,7 +290,7 @@ Casting from string to timestamp currently has the following limitations. Only timezone 'Z' (UTC) is supported. Casting unsupported formats will result in null values. ## UDF to Catalyst Expressions -To speedup the process of UDF, spark-rapids introduces a udf-compiler extension to translate UDFs to Catalyst expressions. This compiler will be injected automatically to spark extensions by setting `spark.plugins=com.nvidia.spark.SQLPlugin` and is disabled by default. +To speedup the process of UDF, spark-rapids introduces a udf-compiler extension to translate UDFs to Catalyst expressions. 
To enable this operation on the GPU, set [`spark.rapids.sql.udfCompiler.enabled`](configs.md#sql.udfCompiler.enabled) to `true`.