From 3d9f74889a8e3768d4822bd66ae23cc6b2938e37 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Thu, 1 Apr 2021 15:43:52 +0800 Subject: [PATCH 1/2] Run the pandas udf using cudf on Databricks Issue: 2026 Add the init script to set up environment for the cudf_udf tests on Databricks Run cudf-udf test cases nightly Signed-off-by: Tim Liu --- jenkins/databricks/clusterutils.py | 9 ++++++++- jenkins/databricks/init_cudf_udf.sh | 30 +++++++++++++++++++++++++++++ jenkins/databricks/test.sh | 17 +++++++++++++++- 3 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 jenkins/databricks/init_cudf_udf.sh diff --git a/jenkins/databricks/clusterutils.py b/jenkins/databricks/clusterutils.py index fdb937014ef..a48e7cc3f23 100644 --- a/jenkins/databricks/clusterutils.py +++ b/jenkins/databricks/clusterutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,6 +46,13 @@ def generate_create_templ(sshKey, cluster_name, runtime, idle_timeout, templ['driver_node_type_id'] = driver_node_type templ['ssh_public_keys'] = [ sshKey ] templ['num_workers'] = num_workers + templ['init_scripts'] = [ + { + "dbfs": { + "destination": "dbfs:/databricks/init_scripts/init_cudf_udf.sh" + } + } + ] return templ diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh new file mode 100644 index 00000000000..70758a7de91 --- /dev/null +++ b/jenkins/databricks/init_cudf_udf.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# The init script to set up environment for the cudf_udf tests on Databricks +# Will be automatically pushed into the dbfs:/databricks/init_scripts once it is updated. + +CUDF_VER=${CUDF_VER:-0.19} + +# Use mamba to install cudf-udf packages to speed up conda resolve time +base=$(conda info --base) +conda create -y -n mamba -c conda-forge mamba +pip uninstall -y pyarrow +${base}/envs/mamba/bin/mamba remove -y c-ares zstd libprotobuf pandas +${base}/envs/mamba/bin/mamba install -y pyarrow=1.0.1 -c conda-forge +${base}/envs/mamba/bin/mamba install -y -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults cudf=$CUDF_VER cudatoolkit=10.1 +conda env remove -n mamba diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index baefe8b6da7..c65dd915110 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -33,10 +33,25 @@ sudo chmod 777 /databricks/data/logs/ sudo chmod 777 /databricks/data/logs/* echo { \"port\":\"15002\" } > ~/.databricks-connect +SPARK_SUBMIT_FLAGS="--conf spark.python.daemon.module=rapids.daemon_databricks \ + --conf spark.rapids.memory.gpu.allocFraction=0.1 \ + --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \ + --conf spark.rapids.python.concurrentPythonWorkers=2" + if [ -d "$LOCAL_JAR_PATH" ]; then ## Run tests with jars in the LOCAL_JAR_PATH dir downloading from the denpedency repo - LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash 
$LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" + + ## Run cudf-udf tests + SPARK_SUBMIT_FLAGS="$SPARK_SUBMIT_FLAGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`" + LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS=$SPARK_SUBMIT_FLAGS TEST_PARALLEL=1 \ + bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf else ## Run tests with jars building from the spark-rapids source code bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" + + ## Run cudf-udf tests + SPARK_SUBMIT_FLAGS="$SPARK_SUBMIT_FLAGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`" + SPARK_SUBMIT_FLAGS=$SPARK_SUBMIT_FLAGS TEST_PARALLEL=1 \ + bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf fi From 7b96dd63348b508bbbed782310f4058a5ce870b7 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Fri, 2 Apr 2021 16:16:06 +0800 Subject: [PATCH 2/2] Update, use 'CUDF_UDF_TEST_ARGS' --- jenkins/databricks/test.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index c65dd915110..95e196def39 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -33,7 +33,7 @@ sudo chmod 777 /databricks/data/logs/ sudo chmod 777 /databricks/data/logs/* echo { \"port\":\"15002\" } > ~/.databricks-connect -SPARK_SUBMIT_FLAGS="--conf spark.python.daemon.module=rapids.daemon_databricks \ +CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \ --conf spark.rapids.memory.gpu.allocFraction=0.1 \ --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \ --conf spark.rapids.python.concurrentPythonWorkers=2" @@ -43,15 +43,15 @@ if [ -d "$LOCAL_JAR_PATH" ]; then 
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" ## Run cudf-udf tests - SPARK_SUBMIT_FLAGS="$SPARK_SUBMIT_FLAGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`" - LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS=$SPARK_SUBMIT_FLAGS TEST_PARALLEL=1 \ + CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`" + LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS=$CUDF_UDF_TEST_ARGS TEST_PARALLEL=1 \ bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf else ## Run tests with jars building from the spark-rapids source code bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" ## Run cudf-udf tests - SPARK_SUBMIT_FLAGS="$SPARK_SUBMIT_FLAGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`" - SPARK_SUBMIT_FLAGS=$SPARK_SUBMIT_FLAGS TEST_PARALLEL=1 \ + CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`" + SPARK_SUBMIT_FLAGS=$CUDF_UDF_TEST_ARGS TEST_PARALLEL=1 \ bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf fi