diff --git a/jenkins/databricks/clusterutils.py b/jenkins/databricks/clusterutils.py index fdb937014ef..a48e7cc3f23 100644 --- a/jenkins/databricks/clusterutils.py +++ b/jenkins/databricks/clusterutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,6 +46,13 @@ def generate_create_templ(sshKey, cluster_name, runtime, idle_timeout, templ['driver_node_type_id'] = driver_node_type templ['ssh_public_keys'] = [ sshKey ] templ['num_workers'] = num_workers + templ['init_scripts'] = [ + { + "dbfs": { + "destination": "dbfs:/databricks/init_scripts/init_cudf_udf.sh" + } + } + ] return templ diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh new file mode 100644 index 00000000000..70758a7de91 --- /dev/null +++ b/jenkins/databricks/init_cudf_udf.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# The init script to set up the environment for the cudf_udf tests on Databricks +# Will be automatically pushed into the dbfs:/databricks/init_scripts once it is updated.
+ +CUDF_VER=${CUDF_VER:-0.19} + +# Use mamba to install cudf-udf packages to speed up conda resolve time +base=$(conda info --base) +conda create -y -n mamba -c conda-forge mamba +pip uninstall -y pyarrow +${base}/envs/mamba/bin/mamba remove -y c-ares zstd libprotobuf pandas +${base}/envs/mamba/bin/mamba install -y pyarrow=1.0.1 -c conda-forge +${base}/envs/mamba/bin/mamba install -y -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults cudf=$CUDF_VER cudatoolkit=10.1 +conda env remove -n mamba diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index baefe8b6da7..95e196def39 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -33,10 +33,25 @@ sudo chmod 777 /databricks/data/logs/ sudo chmod 777 /databricks/data/logs/* echo { \"port\":\"15002\" } > ~/.databricks-connect +CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \ + --conf spark.rapids.memory.gpu.allocFraction=0.1 \ + --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \ + --conf spark.rapids.python.concurrentPythonWorkers=2" + if [ -d "$LOCAL_JAR_PATH" ]; then ## Run tests with jars in the LOCAL_JAR_PATH dir downloading from the denpedency repo - LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" + LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" + + ## Run cudf-udf tests + CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`" + LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS=$CUDF_UDF_TEST_ARGS TEST_PARALLEL=1 \ + bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf else ## Run tests with jars building from the spark-rapids source code bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh 
--runtime_env="databricks" + + ## Run cudf-udf tests + CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`" + SPARK_SUBMIT_FLAGS=$CUDF_UDF_TEST_ARGS TEST_PARALLEL=1 \ + bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf fi