From 3d9f74889a8e3768d4822bd66ae23cc6b2938e37 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Thu, 1 Apr 2021 15:43:52 +0800 Subject: [PATCH 1/2] Run the pandas udf using cudf on Databricks Issue: 2026 Add the init script to set up environment for the cudf_udf tests on Databricks Run cudf-udf test cases nightly Signed-off-by: Tim Liu --- jenkins/databricks/clusterutils.py | 9 ++++++++- jenkins/databricks/init_cudf_udf.sh | 30 +++++++++++++++++++++++++++++ jenkins/databricks/test.sh | 17 +++++++++++++++- 3 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 jenkins/databricks/init_cudf_udf.sh diff --git a/jenkins/databricks/clusterutils.py b/jenkins/databricks/clusterutils.py index fdb937014ef..a48e7cc3f23 100644 --- a/jenkins/databricks/clusterutils.py +++ b/jenkins/databricks/clusterutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,6 +46,13 @@ def generate_create_templ(sshKey, cluster_name, runtime, idle_timeout, templ['driver_node_type_id'] = driver_node_type templ['ssh_public_keys'] = [ sshKey ] templ['num_workers'] = num_workers + templ['init_scripts'] = [ + { + "dbfs": { + "destination": "dbfs:/databricks/init_scripts/init_cudf_udf.sh" + } + } + ] return templ diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh new file mode 100644 index 00000000000..70758a7de91 --- /dev/null +++ b/jenkins/databricks/init_cudf_udf.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# The init script to set up environment for the cudf_udf tests on Databricks +# Will be automatically pushed into the dbfs:/databricks/init_scripts once it is updated. + +CUDF_VER=${CUDF_VER:-0.19} + +# Use mamba to install cudf-udf packages to speed up conda resolve time +base=$(conda info --base) +conda create -y -n mamba -c conda-forge mamba +pip uninstall -y pyarrow +${base}/envs/mamba/bin/mamba remove -y c-ares zstd libprotobuf pandas +${base}/envs/mamba/bin/mamba install -y pyarrow=1.0.1 -c conda-forge +${base}/envs/mamba/bin/mamba install -y -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults cudf=$CUDF_VER cudatoolkit=10.1 +conda env remove -n mamba diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index baefe8b6da7..c65dd915110 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -33,10 +33,25 @@ sudo chmod 777 /databricks/data/logs/ sudo chmod 777 /databricks/data/logs/* echo { \"port\":\"15002\" } > ~/.databricks-connect +SPARK_SUBMIT_FLAGS="--conf spark.python.daemon.module=rapids.daemon_databricks \ + --conf spark.rapids.memory.gpu.allocFraction=0.1 \ + --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \ + --conf spark.rapids.python.concurrentPythonWorkers=2" + if [ -d "$LOCAL_JAR_PATH" ]; then ## Run tests with jars in the LOCAL_JAR_PATH dir downloading from the denpedency repo - LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash 
$LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" + + ## Run cudf-udf tests + SPARK_SUBMIT_FLAGS="$SPARK_SUBMIT_FLAGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`" + LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS=$SPARK_SUBMIT_FLAGS TEST_PARALLEL=1 \ + bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf else ## Run tests with jars building from the spark-rapids source code bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" + + ## Run cudf-udf tests + SPARK_SUBMIT_FLAGS="$SPARK_SUBMIT_FLAGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`" + SPARK_SUBMIT_FLAGS=$SPARK_SUBMIT_FLAGS TEST_PARALLEL=1 \ + bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf fi From 7b96dd63348b508bbbed782310f4058a5ce870b7 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Fri, 2 Apr 2021 16:16:06 +0800 Subject: [PATCH 2/2] Update, use 'CUDF_UDF_TEST_ARGS' --- jenkins/databricks/test.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index c65dd915110..95e196def39 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -33,7 +33,7 @@ sudo chmod 777 /databricks/data/logs/ sudo chmod 777 /databricks/data/logs/* echo { \"port\":\"15002\" } > ~/.databricks-connect -SPARK_SUBMIT_FLAGS="--conf spark.python.daemon.module=rapids.daemon_databricks \ +CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \ --conf spark.rapids.memory.gpu.allocFraction=0.1 \ --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \ --conf spark.rapids.python.concurrentPythonWorkers=2" @@ -43,15 +43,15 @@ if [ -d "$LOCAL_JAR_PATH" ]; then 
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" ## Run cudf-udf tests - SPARK_SUBMIT_FLAGS="$SPARK_SUBMIT_FLAGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`" - LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS=$SPARK_SUBMIT_FLAGS TEST_PARALLEL=1 \ + CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`" + LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS=$CUDF_UDF_TEST_ARGS TEST_PARALLEL=1 \ bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf else ## Run tests with jars building from the spark-rapids source code bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" ## Run cudf-udf tests - SPARK_SUBMIT_FLAGS="$SPARK_SUBMIT_FLAGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`" - SPARK_SUBMIT_FLAGS=$SPARK_SUBMIT_FLAGS TEST_PARALLEL=1 \ + CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`" + SPARK_SUBMIT_FLAGS=$CUDF_UDF_TEST_ARGS TEST_PARALLEL=1 \ bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf fi