From 36dc282c7c36295b5abf1b241f492e5566afb3dd Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 15 Sep 2020 16:36:08 -0500 Subject: [PATCH] Update cudf dependency to 0.16-SNAPSHOT (#727) * Update cudf dependency to 0.16-SNAPSHOT Signed-off-by: Jason Lowe * Update docs to reflect new artifact versions * Exclude rmm_log.txt --- docs/configs.md | 2 +- docs/get-started/Dockerfile.cuda | 4 ++-- docs/get-started/getting-started-on-prem.md | 10 +++++----- docs/testing.md | 4 ++-- integration_tests/README.md | 4 ++-- integration_tests/pom.xml | 1 + jenkins/Jenkinsfile.databricksnightly | 2 +- jenkins/databricks/run-tests.py | 2 +- jenkins/spark-tests.sh | 2 +- jenkins/version-def.sh | 2 +- pom.xml | 2 +- .../scala/com/nvidia/spark/rapids/RapidsConf.scala | 2 +- tests/pom.xml | 1 + 13 files changed, 20 insertions(+), 18 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index 924e1f0b50e..2e09a5aabf0 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports. On startup use: `--conf [conf key]=[conf value]`. For example: ``` -${SPARK_HOME}/bin/spark --jars 'rapids-4-spark_2.12-0.2.0.jar,cudf-0.15-cuda10-1.jar' \ +${SPARK_HOME}/bin/spark --jars 'rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.16-SNAPSHOT-cuda10-1.jar' \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.incompatibleOps.enabled=true ``` diff --git a/docs/get-started/Dockerfile.cuda b/docs/get-started/Dockerfile.cuda index 09631faf928..68a4830a464 100644 --- a/docs/get-started/Dockerfile.cuda +++ b/docs/get-started/Dockerfile.cuda @@ -53,8 +53,8 @@ COPY spark-3.0.1-bin-hadoop3.2/examples /opt/spark/examples COPY spark-3.0.1-bin-hadoop3.2/kubernetes/tests /opt/spark/tests COPY spark-3.0.1-bin-hadoop3.2/data /opt/spark/data -COPY cudf-0.15-cuda10-1.jar /opt/sparkRapidsPlugin -COPY rapids-4-spark_2.12-0.2.0.jar /opt/sparkRapidsPlugin +COPY cudf-0.16-SNAPSHOT-cuda10-1.jar /opt/sparkRapidsPlugin +COPY rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar /opt/sparkRapidsPlugin COPY getGpusResources.sh /opt/sparkRapidsPlugin RUN mkdir /opt/spark/python diff --git a/docs/get-started/getting-started-on-prem.md b/docs/get-started/getting-started-on-prem.md index bf03efd3c9d..41e4e4bdec3 100644 --- a/docs/get-started/getting-started-on-prem.md +++ b/docs/get-started/getting-started-on-prem.md @@ -55,16 +55,16 @@ CUDA and will not run on other versions. The jars use a maven classifier to keep - CUDA 11.0 => classifier cuda11 For example, here is a sample version of the jars and cudf with CUDA 10.1 support: -- cudf-0.15-cuda10-1.jar -- rapids-4-spark_2.12-0.2.0.jar +- cudf-0.16-SNAPSHOT-cuda10-1.jar +- rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar For simplicity export the location to these jars. This example assumes the sample jars above have been placed in the `/opt/sparkRapidsPlugin` directory: ```shell export SPARK_RAPIDS_DIR=/opt/sparkRapidsPlugin -export SPARK_CUDF_JAR=${SPARK_RAPIDS_DIR}/cudf-0.15-cuda10-1.jar -export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-0.2.0.jar +export SPARK_CUDF_JAR=${SPARK_RAPIDS_DIR}/cudf-0.16-SNAPSHOT-cuda10-1.jar +export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar ``` ## Install the GPU Discovery Script @@ -512,7 +512,7 @@ To enable _GPU Scheduling for Pandas UDF_, you need to configure your spark job On Standalone, you need to add ```shell ... 
- --conf spark.executorEnv.PYTHONPATH=rapids-4-spark_2.12-0.2.0.jar \ + --conf spark.executorEnv.PYTHONPATH=rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar \ --py-files ${SPARK_RAPIDS_PLUGIN_JAR} ``` diff --git a/docs/testing.md b/docs/testing.md index acce116ba67..fc720e071a2 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -20,7 +20,7 @@ we typically run with the default options and only increase the scale factor dep dbgen -b dists.dss -s 10 ``` -You can include the test jar `rapids-4-spark-integration-tests_2.12-0.2.0.jar` with the +You can include the test jar `rapids-4-spark-integration-tests_2.12-0.3.0-SNAPSHOT.jar` with the Spark --jars option to get the TPCH tests. To setup for the queries you can run `TpchLikeSpark.setupAllCSV` for CSV formatted data or `TpchLikeSpark.setupAllParquet` for parquet formatted data. Both of those take the Spark session, and a path to the dbgen @@ -83,7 +83,7 @@ individually, so you don't risk running unit tests along with the integration te http://www.scalatest.org/user_guide/using_the_scalatest_shell ```shell -spark-shell --jars rapids-4-spark-tests_2.12-0.2.0-tests.jar,rapids-4-spark-integration-tests_2.12-0.2.0-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar +spark-shell --jars rapids-4-spark-tests_2.12-0.3.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-0.3.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar ``` First you import the `scalatest_shell` and tell the tests where they can find the test files you diff --git a/integration_tests/README.md b/integration_tests/README.md index b2e01f2ce59..05364fd573c 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -49,7 +49,7 @@ Most clusters probably will not have the RAPIDS plugin installed in the cluster If just want to verify the SQL replacement is working you will need to add the `rapids-4-spark` and `cudf` jars to your `spark-submit` command. ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.2.0-SNAPSHOT.jar,cudf-0.15.jar" ./runtests.py +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.16-SNAPSHOT.jar" ./runtests.py ``` You don't have to enable the plugin for this to work, the test framework will do that for you. 
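The jar names in the `spark-submit` line above match the artifacts exported in `getting-started-on-prem.md`, so the same environment variables can be reused when launching the integration tests. A minimal sketch, assuming the jars were placed under `/opt/sparkRapidsPlugin` as in that guide:

```shell
# Jar locations as exported in getting-started-on-prem.md
export SPARK_RAPIDS_DIR=/opt/sparkRapidsPlugin
export SPARK_CUDF_JAR=${SPARK_RAPIDS_DIR}/cudf-0.16-SNAPSHOT-cuda10-1.jar
export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar

# Hand both jars to spark-submit; the test framework enables the SQL plugin itself,
# so no spark.plugins setting is needed here.
$SPARK_HOME/bin/spark-submit \
  --jars "${SPARK_RAPIDS_PLUGIN_JAR},${SPARK_CUDF_JAR}" \
  ./runtests.py
```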
@@ -80,7 +80,7 @@ The TPCxBB, TPCH, and Mortgage tests in this framework can be enabled by providi As an example, here is the `spark-submit` command with the TPCxBB parameters: ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.2.0-SNAPSHOT.jar,cudf-0.15.jar,rapids-4-spark-tests_2.12-0.2.0-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv" +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.16-SNAPSHOT.jar,rapids-4-spark-tests_2.12-0.3.0-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv" ``` ## Writing tests diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index 897e891c1f2..182fd9dd8c2 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -172,6 +172,7 @@ src/test/resources/** **/*.md .pytest_cache/** + rmm_log.txt diff --git a/jenkins/Jenkinsfile.databricksnightly b/jenkins/Jenkinsfile.databricksnightly index 82d903c49e1..a5810a38bfa 100644 --- a/jenkins/Jenkinsfile.databricksnightly +++ b/jenkins/Jenkinsfile.databricksnightly @@ -46,7 +46,7 @@ pipeline { string(name: 'DATABRICKS_VERSION', defaultValue: '0.3.0-SNAPSHOT', description: 'Version to set') string(name: 'CUDF_VERSION', - defaultValue: '0.15', description: 'Cudf version to use') + defaultValue: '0.16-SNAPSHOT', description: 'Cudf version to use') string(name: 'CUDA_VERSION', defaultValue: 'cuda10-1', description: 'cuda version to use') string(name: 'CLUSTER_ID', diff --git a/jenkins/databricks/run-tests.py b/jenkins/databricks/run-tests.py index 24ae2b22624..8b4bd4ba753 100644 --- a/jenkins/databricks/run-tests.py +++ b/jenkins/databricks/run-tests.py @@ -49,7 +49,7 @@ def main(): db_version = '0.1-databricks-SNAPSHOT' scala_version = '2.12' spark_version = '3.0.0' - cudf_version = '0.15' + cudf_version = '0.16-SNAPSHOT' cuda_version = 'cuda10-1' ci_cudf_jar = 'cudf-0.14-cuda10-1.jar' base_spark_pom_version = '3.0.0' diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh index 7c02d0cdcfd..677e5919c8b 100755 --- a/jenkins/spark-tests.sh +++ b/jenkins/spark-tests.sh @@ -73,7 +73,7 @@ MORTGAGE_SPARK_SUBMIT_ARGS=" --conf spark.plugins=com.nvidia.spark.SQLPlugin \ # need to disable pooling for udf test to prevent cudaErrorMemoryAllocation CUDF_UDF_TEST_ARGS="--conf spark.rapids.python.memory.gpu.pooling.enabled=false \ --conf spark.rapids.memory.gpu.pooling.enabled=false \ - --conf spark.executorEnv.PYTHONPATH=rapids-4-spark_2.12-0.2.0-SNAPSHOT.jar \ + --conf spark.executorEnv.PYTHONPATH=rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar \ --py-files ${RAPIDS_PLUGIN_JAR}" TEST_PARAMS="$SPARK_VER $PARQUET_PERF $PARQUET_ACQ $OUTPUT" diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh index d2160d0d317..ea83ecd22d0 100755 --- a/jenkins/version-def.sh +++ b/jenkins/version-def.sh @@ -26,7 +26,7 @@ for VAR in $OVERWRITE_PARAMS;do done IFS=$PRE_IFS -CUDF_VER=${CUDF_VER:-"0.15"} +CUDF_VER=${CUDF_VER:-"0.16-SNAPSHOT"} CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda10-1"} PROJECT_VER=${PROJECT_VER:-"0.3.0-SNAPSHOT"} SPARK_VER=${SPARK_VER:-"3.0.0"} diff --git a/pom.xml b/pom.xml index 7a7f1e97473..6befd97a2f4 100644 --- a/pom.xml +++ b/pom.xml @@ -149,7 +149,7 @@ 1.8 3.0.0 cuda10-1 - 0.15 + 0.16-SNAPSHOT 2.12 2.12.8 1.5.8 diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 4c825a32307..b5a8d079cd8 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ 
b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -729,7 +729,7 @@ object RapidsConf { |On startup use: `--conf [conf key]=[conf value]`. For example: | |``` - |${SPARK_HOME}/bin/spark --jars 'rapids-4-spark_2.12-0.2.0.jar,cudf-0.15-cuda10-1.jar' \ + |${SPARK_HOME}/bin/spark --jars 'rapids-4-spark_2.12-0.3.0-SNAPSHOT.jar,cudf-0.16-SNAPSHOT-cuda10-1.jar' \ |--conf spark.plugins=com.nvidia.spark.SQLPlugin \ |--conf spark.rapids.sql.incompatibleOps.enabled=true |``` diff --git a/tests/pom.xml b/tests/pom.xml index 30c7829306d..a853114b912 100644 --- a/tests/pom.xml +++ b/tests/pom.xml @@ -134,6 +134,7 @@ src/test/resources/** + rmm_log.txt
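Because `0.16-SNAPSHOT` is a snapshot coordinate, the build now resolves cudf from a snapshot repository (or a local `mvn install` of cudf) rather than a released artifact. One way to confirm which cudf version the reactor will actually use is to evaluate the property directly; a sketch, assuming a reasonably recent maven-help-plugin and maven-dependency-plugin:

```shell
# Print the resolved value of the cudf.version property set in the parent pom.xml
mvn -q -DforceStdout help:evaluate -Dexpression=cudf.version

# Force a refresh of cached snapshots if a stale 0.16-SNAPSHOT is sitting in ~/.m2
mvn -U dependency:resolve
```

The new `rmm_log.txt` entries in `integration_tests/pom.xml` and `tests/pom.xml` presumably keep the log file that RMM writes during test runs out of the build's file checks.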