From dc6fbcf84f150fab565edfca2645344290bb8455 Mon Sep 17 00:00:00 2001
From: Jason Lowe
Date: Mon, 14 Dec 2020 17:44:30 -0600
Subject: [PATCH] Update cudf version to 0.17 (#1387)

Signed-off-by: Jason Lowe
---
 docs/configs.md                                     |  2 +-
 docs/get-started/Dockerfile.cuda                    |  2 +-
 docs/get-started/getting-started-on-prem.md         |  4 ++--
 integration_tests/README.md                         | 14 ++++++++------
 jenkins/version-def.sh                              |  2 +-
 pom.xml                                             |  2 +-
 .../scala/com/nvidia/spark/rapids/RapidsConf.scala  |  2 +-
 7 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/docs/configs.md b/docs/configs.md
index 71a0701a5af..640c21e74ba 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports.
 On startup use: `--conf [conf key]=[conf value]`. For example:

 ```
-${SPARK_HOME}/bin/spark --jars 'rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.17-SNAPSHOT-cuda10-1.jar' \
+${SPARK_HOME}/bin/spark --jars 'rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.17-cuda10-1.jar' \
 --conf spark.plugins=com.nvidia.spark.SQLPlugin \
 --conf spark.rapids.sql.incompatibleOps.enabled=true
 ```
diff --git a/docs/get-started/Dockerfile.cuda b/docs/get-started/Dockerfile.cuda
index 84a42b68a66..c9bc0cba035 100644
--- a/docs/get-started/Dockerfile.cuda
+++ b/docs/get-started/Dockerfile.cuda
@@ -53,7 +53,7 @@ COPY spark-3.0.1-bin-hadoop3.2/examples /opt/spark/examples
 COPY spark-3.0.1-bin-hadoop3.2/kubernetes/tests /opt/spark/tests
 COPY spark-3.0.1-bin-hadoop3.2/data /opt/spark/data

-COPY cudf-0.17-SNAPSHOT-cuda10-1.jar /opt/sparkRapidsPlugin
+COPY cudf-0.17-cuda10-1.jar /opt/sparkRapidsPlugin
 COPY rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar /opt/sparkRapidsPlugin
 COPY getGpusResources.sh /opt/sparkRapidsPlugin

diff --git a/docs/get-started/getting-started-on-prem.md b/docs/get-started/getting-started-on-prem.md
index 427cd1fb42f..d66aff78e4a 100644
--- a/docs/get-started/getting-started-on-prem.md
+++ b/docs/get-started/getting-started-on-prem.md
@@ -55,7 +55,7 @@ CUDA and will not run on other versions. The jars use a maven classifier to keep
 - CUDA 11.0 => classifier cuda11

 For example, here is a sample version of the jars and cudf with CUDA 10.1 support:
-- cudf-0.17-SNAPSHOT-cuda10-1.jar
+- cudf-0.17-cuda10-1.jar
 - rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar


@@ -63,7 +63,7 @@ For simplicity export the location to these jars. This example assumes the sampl
 been placed in the `/opt/sparkRapidsPlugin` directory:
 ```shell
 export SPARK_RAPIDS_DIR=/opt/sparkRapidsPlugin
-export SPARK_CUDF_JAR=${SPARK_RAPIDS_DIR}/cudf-0.17-SNAPSHOT-cuda10-1.jar
+export SPARK_CUDF_JAR=${SPARK_RAPIDS_DIR}/cudf-0.17-cuda10-1.jar
 export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar
 ```
diff --git a/integration_tests/README.md b/integration_tests/README.md
index bb347d2ed68..c57ab672534 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -125,10 +125,12 @@ durations.run(new com.nvidia.spark.rapids.JoinsSuite)
 ```

 Most clusters probably will not have the RAPIDS plugin installed in the cluster yet.
-If you just want to verify the SQL replacement is working you will need to add the `rapids-4-spark` and `cudf` jars to your `spark-submit` command.
+If you just want to verify the SQL replacement is working, you will need to add the
+`rapids-4-spark` and `cudf` jars to your `spark-submit` command. Note the following
+example assumes CUDA 10.1 is being used.

 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.17-SNAPSHOT.jar" ./runtests.py
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.17-cuda10-1.jar" ./runtests.py
 ```

 You don't have to enable the plugin for this to work, the test framework will do that for you.
@@ -177,10 +179,10 @@ The TPCxBB, TPCH, TPCDS, and Mortgage tests in this framework can be enabled by
 * TPCDS `tpcds-format` (optional, defaults to "parquet"), and `tpcds-path` (required, path to the TPCDS data).
 * Mortgage `mortgage-format` (optional, defaults to "parquet"), and `mortgage-path` (required, path to the Mortgage data).

-As an example, here is the `spark-submit` command with the TPCxBB parameters:
+As an example, here is the `spark-submit` command with the TPCxBB parameters on CUDA 10.1:

 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.17-SNAPSHOT.jar,rapids-4-spark-tests_2.12-0.3.0-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv"
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.17-cuda10-1.jar,rapids-4-spark-tests_2.12-0.3.0-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv"
 ```

 Be aware that running these tests with real data requires at least an entire GPU, and preferably several GPUs/executors
@@ -206,10 +208,10 @@ To run cudf_udf tests, need following configuration changes:
 * Decrease `spark.rapids.memory.gpu.allocFraction` to reserve enough GPU memory for Python processes in case of out-of-memory.
 * Add `spark.rapids.python.concurrentPythonWorkers` and `spark.rapids.python.memory.gpu.allocFraction` to reserve enough GPU memory for Python processes in case of out-of-memory.

-As an example, here is the `spark-submit` command with the cudf_udf parameter:
+As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 10.1:

 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.17-SNAPSHOT.jar,rapids-4-spark-tests_2.12-0.3.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-0.2.0-SNAPSHOT.jar" ./runtests.py --cudf_udf
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.17-cuda10-1.jar,rapids-4-spark-tests_2.12-0.3.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar" ./runtests.py --cudf_udf
 ```

 ## Writing tests
diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh
index d61315feaa9..c0c153dd2ae 100755
--- a/jenkins/version-def.sh
+++ b/jenkins/version-def.sh
@@ -26,7 +26,7 @@ for VAR in $OVERWRITE_PARAMS;do
 done
 IFS=$PRE_IFS

-CUDF_VER=${CUDF_VER:-"0.17-SNAPSHOT"}
+CUDF_VER=${CUDF_VER:-"0.17"}
 CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda10-1"}
 PROJECT_VER=${PROJECT_VER:-"0.3.0-SNAPSHOT"}
 SPARK_VER=${SPARK_VER:-"3.0.0"}
diff --git a/pom.xml b/pom.xml
index 5e18b8fa219..6bdcc70920e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -143,7 +143,7 @@
 1.8
 3.0.0
 cuda10-1
-0.17-SNAPSHOT
+0.17
 2.12
 2.12.8
 1.5.8
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
index fa2fc2f72e8..0fc5c4e7fac 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -824,7 +824,7 @@ object RapidsConf {
 |On startup use: `--conf [conf key]=[conf value]`. For example:
 |
 |```
-|${SPARK_HOME}/bin/spark --jars 'rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.17-SNAPSHOT-cuda10-1.jar' \
+|${SPARK_HOME}/bin/spark --jars 'rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.17-cuda10-1.jar' \
 |--conf spark.plugins=com.nvidia.spark.SQLPlugin \
 |--conf spark.rapids.sql.incompatibleOps.enabled=true
 |```
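
For anyone picking up this change, a minimal smoke test of the renamed cudf jar is sketched below. It reuses the exports from `getting-started-on-prem.md` above and assumes CUDA 10.1; launching via `spark-shell` with `spark.rapids.sql.enabled` is an assumption for a quick local check, not something this patch prescribes.

```shell
# A minimal post-upgrade smoke-test sketch (not part of this patch).
# Assumes the release jars were placed as documented above.
export SPARK_RAPIDS_DIR=/opt/sparkRapidsPlugin
export SPARK_CUDF_JAR=${SPARK_RAPIDS_DIR}/cudf-0.17-cuda10-1.jar
export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar

# If the release cudf jar resolves correctly, the plugin loads and
# GPU operators appear in physical query plans.
${SPARK_HOME}/bin/spark-shell \
  --jars ${SPARK_CUDF_JAR},${SPARK_RAPIDS_PLUGIN_JAR} \
  --conf spark.plugins=com.nvidia.spark.SQLPlugin \
  --conf spark.rapids.sql.enabled=true
```

Inside the shell, something like `spark.range(100).selectExpr("id % 3 as k").groupBy("k").count().explain()` should show `Gpu`-prefixed operators in the physical plan when the plugin is active.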