From 6aeb15ae58fa030fc064373043f74e5692e5eaaf Mon Sep 17 00:00:00 2001
From: Thomas Graves
Date: Mon, 29 Jun 2020 10:07:36 -0400
Subject: [PATCH] Databricks CI improvements and support runtime env parameter
 to xfail certain tests (#297)

* Add a profile to build for Databricks

* Change xfail to be annotation

Co-authored-by: Thomas Graves
---
 integration_tests/conftest.py                 |  3 +
 integration_tests/src/main/python/conftest.py | 14 ++++
 .../src/main/python/parquet_test.py           |  3 +
 .../src/main/python/qa_nightly_sql.py         | 19 +++--
 jenkins/Jenkinsfile.databricksnightly         | 62 ++++++--------
 jenkins/Jenkinsfile.databricksrelease         | 16 +++-
 jenkins/databricks/build.sh                   | 83 +++++++++++--------
 jenkins/databricks/dbimports.patch            | 14 ++--
 jenkins/databricks/deploy.sh                  |  2 +-
 jenkins/databricks/run-tests.py               | 57 ++++++++++---
 jenkins/databricks/shutdown.py                |  2 +-
 pom.xml                                       |  8 ++
 12 files changed, 178 insertions(+), 105 deletions(-)

diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py
index 3c30b368d99..d8882de2b72 100644
--- a/integration_tests/conftest.py
+++ b/integration_tests/conftest.py
@@ -41,3 +41,6 @@ def pytest_addoption(parser):
     parser.addoption(
         "--debug_tmp_path", action='store_true', default=False, help="if true don't delete tmp_path contents for debugging"
     )
+    parser.addoption(
+        "--runtime_env", action='store', default="Apache", help="the runtime environment for the tests - apache or databricks"
+    )
diff --git a/integration_tests/src/main/python/conftest.py b/integration_tests/src/main/python/conftest.py
index 0f60e283004..506e8be8be5 100644
--- a/integration_tests/src/main/python/conftest.py
+++ b/integration_tests/src/main/python/conftest.py
@@ -48,6 +48,17 @@ def is_allowing_any_non_gpu():
 def get_non_gpu_allowed():
     return _non_gpu_allowed
 
+_runtime_env = "apache"
+
+def runtime_env():
+    return _runtime_env.lower()
+
+def is_apache_runtime():
+    return runtime_env() == "apache"
+
+def is_databricks_runtime():
+    return runtime_env() == "databricks"
+
 _limit = -1
 
 def get_limit():
@@ -112,6 +123,9 @@ def pytest_runtest_setup(item):
     else:
         _limit = -1
 
+def pytest_configure(config):
+    global _runtime_env
+    _runtime_env = config.getoption('runtime_env')
 
 def pytest_collection_modifyitems(config, items):
     for item in items:
diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py
index ab14204e0b2..7e6509ffd09 100644
--- a/integration_tests/src/main/python/parquet_test.py
+++ b/integration_tests/src/main/python/parquet_test.py
@@ -15,6 +15,7 @@
 import pytest
 
 from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_fallback_collect
+from conftest import is_databricks_runtime
 from datetime import date, datetime, timezone
 from data_gen import *
 from marks import *
@@ -145,6 +146,8 @@ def test_simple_partitioned_read(spark_tmp_path):
     assert_gpu_and_cpu_are_equal_collect(
             lambda spark : spark.read.parquet(data_path))
 
+@pytest.mark.xfail(condition=is_databricks_runtime(),
+    reason='https://github.com/NVIDIA/spark-rapids/issues/192')
 def test_read_merge_schema(spark_tmp_path):
     # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed
     # we should go with a more standard set of generators
diff --git a/integration_tests/src/main/python/qa_nightly_sql.py b/integration_tests/src/main/python/qa_nightly_sql.py
index f4ca0e7cdc9..f2e143e7551 100644
--- a/integration_tests/src/main/python/qa_nightly_sql.py
+++ b/integration_tests/src/main/python/qa_nightly_sql.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from conftest import is_databricks_runtime
 import pytest
 
 SELECT_SQL = [
@@ -665,15 +666,15 @@
 ("SELECT COUNT(byteF) as count, (AVG(intF) * 5.0) as avg, (SUM(intF) + MAX(shortF * 3)) as summax FROM test_table GROUP BY intF*3", "COUNT(byteF), AVG(intF) * 5.0, SUM(intF) + MAX(shortF * 3) GROUP BY intF*3"),
 ("SELECT COUNT(*) as count, (AVG(intF) * 5.0) as avg, (SUM(intF) + MAX(shortF * 3)) as summax FROM test_table GROUP BY intF*3", "COUNT(*), AVG(intF) * 5.0, SUM(intF) + MAX(shortF * 3) GROUP BY intF*3"),
 # ("SELECT SUM(intF) OVER (PARTITION BY byteF ORDER BY shortF) as sum_total FROM test_table", "SUM(intF) OVER (PARTITION BY byteF ORDER BY shortF) as sum_total"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY byteF ORDER BY byteF) row_num, byteF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY byteF ORDER BY byteF) row_num, byteF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY shortF ORDER BY shortF) row_num, shortF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY shortF ORDER BY shortF) row_num, shortF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY intF ORDER BY intF) row_num, intF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY intF ORDER BY intF) row_num, intF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY longF ORDER BY longF) row_num, longF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY longF ORDER BY longF) row_num, longF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY floatF ORDER BY floatF) row_num, floatF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY floatF ORDER BY floatF) row_num, floatF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY booleanF ORDER BY booleanF) row_num, booleanF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY booleanF ORDER BY booleanF) row_num, booleanF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY strF ORDER BY strF) row_num, strF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY strF ORDER BY strF) row_num, strF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY dateF ORDER BY dateF) row_num, dateF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY dateF ORDER BY dateF) row_num, dateF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY timestampF ORDER BY timestampF) row_num, timestampF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY timestampF ORDER BY timestampF) row_num, timestampF"),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY byteF ORDER BY byteF) row_num, byteF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY byteF ORDER BY byteF) row_num, byteF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY shortF ORDER BY shortF) row_num, shortF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY shortF ORDER BY shortF) row_num, shortF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY intF ORDER BY intF) row_num, intF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY intF ORDER BY intF) row_num, intF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY longF ORDER BY longF) row_num, longF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY longF ORDER BY longF) row_num, longF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY floatF ORDER BY floatF) row_num, floatF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY floatF ORDER BY floatF) row_num, floatF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY booleanF ORDER BY booleanF) row_num, booleanF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY booleanF ORDER BY booleanF) row_num, booleanF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY strF ORDER BY strF) row_num, strF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY strF ORDER BY strF) row_num, strF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY dateF ORDER BY dateF) row_num, dateF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY dateF ORDER BY dateF) row_num, dateF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY timestampF ORDER BY timestampF) row_num, timestampF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY timestampF ORDER BY timestampF) row_num, timestampF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
 # ("window/row/range (need change)", "window/row/range (need change)"),
 #("SELECT byteF, SUM(byteF) OVER (PARTITION BY byteF ORDER BY byteF RANGE BETWEEN 20 PRECEDING AND 10 FOLLOWING ) as sum_total FROM test_table", "byteF, SUM(byteF) OVER (PARTITION BY byteF ORDER BY byteF RANGE BETWEEN 20 PRECEDING AND 10 FOLLOWING ) as sum_total"),
 #("SELECT SUM(intF) OVER (PARTITION BY byteF ORDER BY byteF RANGE BETWEEN 20 PRECEDING AND 10 FOLLOWING ) as sum_total FROM test_table", "SUM(intF) OVER (PARTITION BY byteF ORDER BY byteF RANGE BETWEEN 20 PRECEDING AND 10 FOLLOWING ) as sum_total"),
diff --git a/jenkins/Jenkinsfile.databricksnightly b/jenkins/Jenkinsfile.databricksnightly
index d3855535a6d..d45098d6af7 100644
--- a/jenkins/Jenkinsfile.databricksnightly
+++ b/jenkins/Jenkinsfile.databricksnightly
@@ -22,7 +22,15 @@
  */
 
 pipeline {
-    agent { label 'vanilla' }
+    agent {
+        docker {
+            label 'docker-gpu'
+            image 'urm.nvidia.com/sw-spark-docker/plugin:dev-ubuntu16-cuda10.1'
+            args '--runtime=nvidia -v ${HOME}/.m2:${HOME}/.m2:rw \
+                -v ${HOME}/.zinc:${HOME}/.zinc:rw \
+                -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group'
+        }
+    }
 
     options {
         ansiColor('xterm')
@@ -33,6 +41,14 @@ pipeline {
     parameters {
         choice(name: 'DEPLOY_TO', choices: ['Urm', 'Local'],
             description: 'Where to deploy artifacts to')
+        string(name: 'DATABRICKS_VERSION',
+            defaultValue: '0.2-databricks-SNAPSHOT', description: 'Version to set')
+        string(name: 'CUDF_VERSION',
+            defaultValue: '0.15-SNAPSHOT', description: 'Cudf version to use')
+        string(name: 'CUDA_VERSION',
+            defaultValue: 'cuda10-1', description: 'cuda version to use')
+        string(name: 'CLUSTER_ID',
+            defaultValue: '0617-140138-umiak14', description: 'databricks cluster id')
         string(name: 'REF', defaultValue: 'branch-0.2',
             description: 'Commit to build')
     }
@@ -42,6 +58,10 @@
         LIBCUDF_KERNEL_CACHE_PATH='/tmp'
         URM_CREDS = credentials("svcngcc_artifactory")
         DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
+        SCALA_VERSION = '2.12'
+        SPARK_VERSION = '3.0.0'
+        CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
+        CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
     }
 
     triggers {
@@ -50,54 +70,24 @@
 
     stages {
         stage('Ubuntu16 CUDA10.1') {
-            agent {
-                dockerfile {
-                    label 'docker-gpu'
-                    filename 'Dockerfile.ubuntu16'
-                    dir "jenkins"
-                    args '--runtime=nvidia -v ${HOME}/.m2:${HOME}/.m2:rw \
-                        -v ${HOME}/.zinc:${HOME}/.zinc:rw \
-                        -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group'
-                }
-            }
             steps {
                 script {
                     sshagent(credentials : ['svcngcc_pubpriv']) {
-                        sh "mvn versions:set -DnewVersion=0.2.0-databricks-SNAPSHOT && git clean -d -f"
+                        sh "mvn versions:set -DnewVersion=$DATABRICKS_VERSION && git clean -d -f"
+                        sh "mvn dependency:get -Dartifact=ai.rapids:cudf:$CUDF_VERSION -Ddest=./"
                         sh "patch -p1 < ./jenkins/databricks/dbimports.patch"
-                        sh "tar -zcvf spark-rapids-ci.tgz * || true"
-                        sh "python3.6 ./jenkins/databricks/run-tests.py -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh"
+                        sh "tar -zcvf spark-rapids-ci.tgz *"
+                        sh "python3.6 ./jenkins/databricks/run-tests.py -c $CLUSTER_ID -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
                         sh "./jenkins/databricks/deploy.sh"
                     }
                 }
            }
        }
-        stage('cleanup') {
-            agent {
-                dockerfile {
-                    label 'docker-gpu'
-                    filename 'Dockerfile.ubuntu16'
-                    dir "jenkins"
-                    args '--runtime=nvidia -v ${HOME}/.m2:${HOME}/.m2:rw \
-                        -v ${HOME}/.zinc:${HOME}/.zinc:rw \
-                        -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group'
-                }
-            }
-            steps {
-                script {
-                    sh "python3.6 ./jenkins/databricks/shutdown.py -t $DATABRICKS_TOKEN"
-                }
-            }
-        }
     } // end of stages
     post {
         always {
             script {
-                if (currentBuild.currentResult == "SUCCESS") {
-                    slack("#rapidsai-spark-cicd", "Success", color: "#33CC33")
-                } else {
-                    slack("#rapidsai-spark-cicd", "Failed", color: "#FF0000")
-                }
+                sh "python3.6 ./jenkins/databricks/shutdown.py -c $CLUSTER_ID -t $DATABRICKS_TOKEN || true"
             }
         }
     }
diff --git a/jenkins/Jenkinsfile.databricksrelease b/jenkins/Jenkinsfile.databricksrelease
index 74d1ddeb840..c021f2e0c87 100644
--- a/jenkins/Jenkinsfile.databricksrelease
+++ b/jenkins/Jenkinsfile.databricksrelease
@@ -28,7 +28,7 @@ def SERVERS_MAP = [
 
 def SEC_IDS = [
     Local: ['local-gpg-passphrase', 'local-gpg-private-key', 'local-username-password'],
-    Sonatype: ['rapids-gpg-passphrase', 'rapids-gpg-private-key', 'sonatype-username-password']
+    Sonatype: ['SPARK_RAPIDS_GPG_PASSPHRASE', 'SPARK_RAPIDS_GPG_PRIVATE_KEY', 'SPARK_SONATYPE_USERPASS']
 ]
 
 pipeline {
@@ -52,6 +52,12 @@
     parameters {
         choice(name: 'DEPLOY_TO', choices: ['Sonatype'],
            description: 'Where to deploy artifacts to')
+        string(name: 'DATABRICKS_VERSION',
+            defaultValue: '0.2-databricks-SNAPSHOT', description: 'Version to set')
+        string(name: 'CUDF_VERSION',
+            defaultValue: '0.15-SNAPSHOT', description: 'Cudf version to use')
+        string(name: 'CUDA_VERSION',
+            defaultValue: 'cuda10-1', description: 'cuda version to use')
         string(name: 'REF', defaultValue: 'branch-0.2',
             description: 'Commit to build')
     }
@@ -64,6 +70,10 @@
         DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
         DIST_PL='dist'
         SQL_PL='sql-plugin'
+        SCALA_VERSION = '2.12'
+        SPARK_VERSION = '3.0.0'
+        CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
+        CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
     }
 
     stages {
@@ -71,10 +81,10 @@ pipeline {
             steps {
                 script {
                     sshagent(credentials : ['svcngcc_pubpriv']) {
-                        sh "mvn versions:set -DnewVersion=0.1.0-databricks && git clean -d -f"
+                        sh "mvn versions:set -DnewVersion=0.2.0-databricks && git clean -d -f"
                         sh "patch -p1 < ./jenkins/databricks/dbimports.patch"
                         sh "tar -zcvf spark-rapids-ci.tgz * || true"
-                        sh "python3.6 ./jenkins/databricks/run-tests.py -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh"
+                        sh "python3.6 ./jenkins/databricks/run-tests.py -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
                     }
                 }
             }
diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh
index 5f89c25f411..0038557e83d 100755
--- a/jenkins/databricks/build.sh
+++ b/jenkins/databricks/build.sh
@@ -17,85 +17,98 @@
 
 set -e
 
-SPARKTGZ=/home/ubuntu/spark-rapids-ci.tgz
-if [ "$1" != "" ]; then
-    SPARKTGZ=$1
-fi
+SPARKTGZ=$1
+DATABRICKS_VERSION=$2
+SCALA_VERSION=$3
+CI_RAPIDS_JAR=$4
+SPARK_VERSION=$5
+CUDF_VERSION=$6
+CUDA_VERSION=$7
+CI_CUDF_JAR=$8
+
+echo "Spark version is $SPARK_VERSION"
+echo "scala version is: $SCALA_VERSION"
+
+# this has to match the Databricks init script
+DB_JAR_LOC=/databricks/jars
+DB_RAPIDS_JAR_LOC=$DB_JAR_LOC/$CI_RAPIDS_JAR
+DB_CUDF_JAR_LOC=$DB_JAR_LOC/$CI_CUDF_JAR
+RAPIDS_BUILT_JAR=rapids-4-spark_$SCALA_VERSION-$DATABRICKS_VERSION.jar
 
 sudo apt install -y maven
 rm -rf spark-rapids
 mkdir spark-rapids
 tar -zxvf $SPARKTGZ -C spark-rapids
 cd spark-rapids
-# pull 3.0.0 artifacts and ignore errors then install databricks jars, then build again
 mvn clean package || true
 M2DIR=/home/ubuntu/.m2/repository
+CUDF_JAR=./cudf-${CUDF_VERSION}.jar
+mvn install:install-file \
+    -Dmaven.repo.local=$M2DIR \
+    -Dfile=./$CUDF_JAR \
+    -DgroupId=ai.rapids \
+    -DartifactId=cudf \
+    -Dversion=$CUDF_VERSION \
+    -Dclassifier=$CUDA_VERSION \
+    -Dpackaging=jar
+
+# pull normal Spark artifacts and ignore errors then install databricks jars, then build again
 JARDIR=/databricks/jars
-SQLJAR=----workspace_spark_3_0--sql--core--core-hive-2.3__hadoop-2.7_2.12_deploy.jar
-CATALYSTJAR=----workspace_spark_3_0--sql--catalyst--catalyst-hive-2.3__hadoop-2.7_2.12_deploy.jar
-ANNOTJAR=----workspace_spark_3_0--common--tags--tags-hive-2.3__hadoop-2.7_2.12_deploy.jar
-COREJAR=----workspace_spark_3_0--core--core-hive-2.3__hadoop-2.7_2.12_deploy.jar
-VERSIONJAR=----workspace_spark_3_0--core--libcore_generated_resources.jar
-VERSION=3.0.0
+SQLJAR=----workspace_spark_3_0--sql--core--core-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
+CATALYSTJAR=----workspace_spark_3_0--sql--catalyst--catalyst-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
+ANNOTJAR=----workspace_spark_3_0--common--tags--tags-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
+COREJAR=----workspace_spark_3_0--core--core-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
 mvn install:install-file \
     -Dmaven.repo.local=$M2DIR \
     -Dfile=$JARDIR/$COREJAR \
     -DgroupId=org.apache.spark \
-    -DartifactId=spark-core_2.12 \
-    -Dversion=$VERSION \
+    -DartifactId=spark-core_$SCALA_VERSION \
+    -Dversion=$SPARK_VERSION \
     -Dpackaging=jar
 
 mvn install:install-file \
     -Dmaven.repo.local=$M2DIR \
     -Dfile=$JARDIR/$CATALYSTJAR \
     -DgroupId=org.apache.spark \
-    -DartifactId=spark-catalyst_2.12 \
-    -Dversion=$VERSION \
+    -DartifactId=spark-catalyst_$SCALA_VERSION \
+    -Dversion=$SPARK_VERSION \
     -Dpackaging=jar
 
 mvn install:install-file \
     -Dmaven.repo.local=$M2DIR \
     -Dfile=$JARDIR/$SQLJAR \
     -DgroupId=org.apache.spark \
-    -DartifactId=spark-sql_2.12 \
-    -Dversion=$VERSION \
+    -DartifactId=spark-sql_$SCALA_VERSION \
+    -Dversion=$SPARK_VERSION \
     -Dpackaging=jar
 
 mvn install:install-file \
     -Dmaven.repo.local=$M2DIR \
     -Dfile=$JARDIR/$ANNOTJAR \
     -DgroupId=org.apache.spark \
-    -DartifactId=spark-annotation_2.12 \
-    -Dversion=$VERSION \
+    -DartifactId=spark-annotation_$SCALA_VERSION \
+    -Dversion=$SPARK_VERSION \
     -Dpackaging=jar
 
-mvn install:install-file \
-    -Dmaven.repo.local=$M2DIR \
-    -Dfile=$JARDIR/$VERSIONJAR \
-    -DgroupId=org.apache.spark \
-    -DartifactId=spark-version_2.12 \
-    -Dversion=$VERSION \
-    -Dpackaging=jar
-
-mvn -Pdatabricks clean verify -DskipTests
+mvn -Pdatabricks clean package -DskipTests
 
-# Copy so we pick up new built jar. Note that the jar name rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar has to be
-# exactly that because its based on the staticly setup Databricks cluster we use. That cluster specifically
-# installs the jar with the name rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar. Do not change that name
-# without changing the Databricks cluster setup.
-sudo cp dist/target/rapids-4-spark_2.12-*-SNAPSHOT.jar /databricks/jars/rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar
+# Copy so we pick up the newly built jar and the latest CuDF jar. Note that the jar names have to be
+# exactly what is in the statically setup Databricks cluster we use.
+sudo cp dist/target/$RAPIDS_BUILT_JAR $DB_RAPIDS_JAR_LOC
+sudo cp ./$CUDF_JAR $DB_CUDF_JAR_LOC
 
 # tests
 export PATH=/databricks/conda/envs/databricks-ml-gpu/bin:/databricks/conda/condabin:$PATH
 sudo /databricks/conda/envs/databricks-ml-gpu/bin/pip install pytest sre_yield
 cd /home/ubuntu/spark-rapids/integration_tests
 export SPARK_HOME=/databricks/spark
+# change to not point at databricks confs so we don't conflict with their settings
+export SPARK_CONF_DIR=$PWD
 export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip
 sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
 sudo chmod 777 /databricks/data/logs/
 sudo chmod 777 /databricks/data/logs/*
 echo { \"port\":\"15002\" } > ~/.databricks-connect
-$SPARK_HOME/bin/spark-submit ./runtests.py 2>&1 | tee out
-
+$SPARK_HOME/bin/spark-submit ./runtests.py --runtime_env="databricks"
 cd /home/ubuntu
 tar -zcvf spark-rapids-built.tgz spark-rapids
diff --git a/jenkins/databricks/dbimports.patch b/jenkins/databricks/dbimports.patch
index e1b179f49bb..d112e85ec7b 100644
--- a/jenkins/databricks/dbimports.patch
+++ b/jenkins/databricks/dbimports.patch
@@ -1,20 +1,20 @@
 diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
-index e6290d1..6ceb47d 100644
+index e6c3e37..ddd8ca4 100644
 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
 +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
-@@ -18,8 +18,9 @@ package com.nvidia.spark.rapids
- import ai.rapids.cudf.{NvtxColor, Table}
+@@ -19,8 +19,9 @@ import ai.rapids.cudf.{NvtxColor, Table}
+ import org.apache.spark.TaskContext
  import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
 +import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight}
- import org.apache.spark.sql.catalyst.plans.{Inner, JoinType, LeftAnti, LeftOuter, LeftSemi}
+ import org.apache.spark.sql.catalyst.plans.{ExistenceJoin, FullOuter, Inner, InnerLike, JoinType, LeftAnti, LeftExistence, LeftOuter, LeftSemi, RightOuter}
 -import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, HashJoin}
 +import org.apache.spark.sql.execution.joins.HashJoin
  import org.apache.spark.sql.execution.metric.SQLMetric
  import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
 diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
-index 5be8d1a..6b4a58e 100644
+index 7ae310b..3ebde77 100644
 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
 +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
 @@ -22,10 +22,11 @@ import org.apache.spark.TaskContext
@@ -31,7 +31,7 @@ index 5be8d1a..6b4a58e 100644
  import org.apache.spark.sql.vectorized.ColumnarBatch
 diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
-index 3b2b447..e9e4051 100644
+index 29ba63d..78febd4 100644
 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
 +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
 @@ -17,8 +17,9 @@
@@ -65,7 +65,7 @@ index b02182a..1ed13d2 100644
  trait ConfKeysAndIncompat {
 diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
-index 7be40e8..13c8500 100644
+index ac444d1..14a8c6e 100644
 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
 +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
 @@ -22,6 +22,7 @@ import com.nvidia.spark.rapids.GpuMetricNames._
diff --git a/jenkins/databricks/deploy.sh b/jenkins/databricks/deploy.sh
index c783565ffab..04fec64456a 100755
--- a/jenkins/databricks/deploy.sh
+++ b/jenkins/databricks/deploy.sh
@@ -24,6 +24,6 @@ cd spark-rapids
 echo "Maven mirror is $MVN_URM_MIRROR"
 SERVER_ID='snapshots'
 SERVER_URL='https://urm.nvidia.com:443/artifactory/sw-spark-maven-local'
-FPATH=./dist/target/rapids-4-spark_2.12-0.2.0-databricks-SNAPSHOT.jar
+FPATH=./dist/target/rapids-4-spark_$SCALA_VERSION-$DATABRICKS_VERSION.jar
 mvn -B deploy:deploy-file $MVN_URM_MIRROR -Durl=$SERVER_URL -DrepositoryId=$SERVER_ID \
     -Dfile=$FPATH -DpomFile=dist/pom.xml
diff --git a/jenkins/databricks/run-tests.py b/jenkins/databricks/run-tests.py
index 14ab038c420..975cbe8601d 100644
--- a/jenkins/databricks/run-tests.py
+++ b/jenkins/databricks/run-tests.py
@@ -17,6 +17,7 @@
 import getopt
 import time
 import os
+import subprocess
 
 def cluster_state(workspace, clusterid, token):
     clusterresp = requests.get(workspace + "/api/2.0/clusters/get?cluster_id=%s" % clusterid, headers={'Authorization': 'Bearer %s' % token})
@@ -40,23 +41,30 @@ def main():
     clusterid = '0617-140138-umiak14'
     private_key_file = "~/.ssh/id_rsa"
     skip_start = None
-    local_script = "build.sh"
-    script_dest = "/home/ubuntu/build.sh"
-    source_tgz = "spark-rapids-ci.tgz"
-    tgz_dest = "/home/ubuntu/spark-rapids-ci.tgz"
+    local_script = 'build.sh'
+    script_dest = '/home/ubuntu/build.sh'
+    source_tgz = 'spark-rapids-ci.tgz'
+    tgz_dest = '/home/ubuntu/spark-rapids-ci.tgz'
+    ci_rapids_jar = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
+    db_version = '0.1-databricks-SNAPSHOT'
+    scala_version = '2.12'
+    spark_version = '3.0.0'
+    cudf_version = '0.15-SNAPSHOT'
+    cuda_version = 'cuda10-1'
+    ci_cudf_jar = 'cudf-0.14-cuda10-1.jar'
 
     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:p:l:nd:z:',
-                ['workspace=', 'token=', 'clusterid=', 'private=', 'nostart=', 'localscript=', 'dest=', 'sparktgz='])
+        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:p:l:nd:z:j:b:k:a:f:u:m:',
+                ['workspace=', 'token=', 'clusterid=', 'private=', 'nostart=', 'localscript=', 'dest=', 'sparktgz=', 'cirapidsjar=', 'databricksversion=', 'sparkversion=', 'scalaversion=', 'cudfversion=', 'cudaversion=', 'cicudfjar='])
     except getopt.GetoptError:
         print(
-            'run-tests.py -s -t -c -p -n -l -z -z ')
+            'run-tests.py -s -t -c -p -n -l -d -z -j -b -k -a -f -u -m ')
         sys.exit(2)
 
     for opt, arg in opts:
         if opt == '-h':
             print(
-                'run-tests.py -s -t -c -p -n -l -d , -z ')
+                'run-tests.py -s -t -c -p -n -l -d , -z -j -b -k -a -f -u -m ')
             sys.exit()
         elif opt in ('-s', '--workspace'):
             workspace = arg
@@ -74,6 +82,20 @@ def main():
             script_dest = arg
         elif opt in ('-z', '--sparktgz'):
             source_tgz = arg
+        elif opt in ('-j', '--cirapidsjar'):
+            ci_rapids_jar = arg
+        elif opt in ('-b', '--databricksversion'):
+            db_version = arg
+        elif opt in ('-k', '--sparkversion'):
+            spark_version = arg
+        elif opt in ('-a', '--scalaversion'):
+            scala_version = arg
+        elif opt in ('-f', '--cudfversion'):
+            cudf_version = arg
+        elif opt in ('-u', '--cudaversion'):
+            cuda_version = arg
+        elif opt in ('-m', '--cicudfjar'):
+            ci_cudf_jar = arg
 
     print('-s is ' + workspace)
     print('-c is ' + clusterid)
@@ -84,6 +106,14 @@ def main():
     print("-n: don't skip start")
     print('-l is ' + local_script)
     print('-d is ' + script_dest)
+    print('-z is ' + source_tgz)
+    print('-j is ' + ci_rapids_jar)
+    print('-b is ' + db_version)
+    print('-k is ' + spark_version)
+    print('-a is ' + scala_version)
+    print('-f is ' + cudf_version)
+    print('-u is ' + cuda_version)
+    print('-m is ' + ci_cudf_jar)
 
     if skip_start is None:
         jsonout = cluster_state(workspace, clusterid, token)
@@ -124,20 +154,21 @@ def main():
         print("Copying script")
         rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (private_key_file, local_script, master_addr, script_dest)
         print("rsync command: %s" % rsync_command)
-        os.system(rsync_command)
+        subprocess.check_call(rsync_command, shell = True)
 
         print("Copying source")
         rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (private_key_file, source_tgz, master_addr, tgz_dest)
         print("rsync command: %s" % rsync_command)
-        os.system(rsync_command)
-        ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s 2>&1 | tee buildout" % (master_addr, private_key_file, script_dest, tgz_dest)
+        subprocess.check_call(rsync_command, shell = True)
+
+        ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, private_key_file, script_dest, tgz_dest, db_version, scala_version, ci_rapids_jar, spark_version, cudf_version, cuda_version, ci_cudf_jar)
         print("ssh command: %s" % ssh_command)
-        os.system(ssh_command)
+        subprocess.check_call(ssh_command, shell = True)
 
         print("Copying built tarball back")
         rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (private_key_file, master_addr)
         print("rsync command to get built tarball: %s" % rsync_command)
-        os.system(rsync_command)
+        subprocess.check_call(rsync_command, shell = True)
 
 if __name__ == '__main__':
     main()
diff --git a/jenkins/databricks/shutdown.py b/jenkins/databricks/shutdown.py
index 330ecbe9107..1ee91af16ed 100644
--- a/jenkins/databricks/shutdown.py
+++ b/jenkins/databricks/shutdown.py
@@ -31,7 +31,7 @@ def main():
     clusterid = '0617-140138-umiak14'
 
     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c',
+        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:',
                ['workspace=', 'token=', 'clusterid='])
     except getopt.GetoptError:
         print(
diff --git a/pom.xml b/pom.xml
index 5949a616f94..1cdb786df50 100644
--- a/pom.xml
+++ b/pom.xml
@@ -124,6 +124,9 @@
             databricks
+
+                true
+
@@ -148,6 +151,7 @@
     UTF-8
     UTF-8
     not qarun
+    false
@@ -420,6 +424,9 @@
         org.apache.rat
         apache-rat-plugin
         0.13
+
+          ${rat.consoleOutput}
+
           verify
@@ -473,6 +480,7 @@
         .gnupg/**
         pom.xml.asc
         jenkins/databricks/*.patch
+        *.jar