
[FEA] Support Databricks 7.3 LTS Runtime #1076

Merged: 46 commits, Nov 7, 2020
Changes shown are from 44 of the 46 commits.

Commits
6108480  Add databricks 301 shim layer (tgravescs, Oct 8, 2020)
284b3dd  copy rest of 300db files (tgravescs, Oct 8, 2020)
5c7a324  update to build databricks 300 and 301 separately (tgravescs, Oct 8, 2020)
c5d62f8  update profile name (tgravescs, Oct 8, 2020)
e9445cd  Fix missing include (tgravescs, Oct 8, 2020)
e0aa165  fixes (tgravescs, Oct 8, 2020)
b967ea7  changes (tgravescs, Oct 9, 2020)
b34bc07  Merge remote-tracking branch 'origin/branch-0.3' into databricks301 (tgravescs, Oct 30, 2020)
8d90862  Fixes (tgravescs, Oct 30, 2020)
4b691dc  databricks changes 3.0.1 (tgravescs, Nov 2, 2020)
3eb8cad  Fix broadcast like (tgravescs, Nov 3, 2020)
600ae7f  Merge remote-tracking branch 'upstream/branch-0.3' into databricks301 (tgravescs, Nov 3, 2020)
7a00c36  Fix broadcast hash join to be a "like" one (tgravescs, Nov 3, 2020)
ddfe59d  remove functions in the 301 base class (tgravescs, Nov 4, 2020)
dc35edb  Update build scripts to use more from mvn properties (tgravescs, Nov 4, 2020)
075bf7a  Fix order of variables (tgravescs, Nov 4, 2020)
53d8a19  comment out slck for now (tgravescs, Nov 4, 2020)
3eed0ab  Fix missing variables (tgravescs, Nov 4, 2020)
5710184  update docs and build args (tgravescs, Nov 4, 2020)
57437d6  use quotes for profeiles (tgravescs, Nov 4, 2020)
64e7141  python escape (tgravescs, Nov 4, 2020)
45e3a53  remove %q (tgravescs, Nov 4, 2020)
eaf758d  debug (tgravescs, Nov 4, 2020)
75df29e  Fix maven location install (tgravescs, Nov 4, 2020)
532a1e0  rearrange (tgravescs, Nov 4, 2020)
9d7b590  Fix version parameter (tgravescs, Nov 4, 2020)
83759a2  add bck shutdown (tgravescs, Nov 4, 2020)
99a3ec6  increase timeout (tgravescs, Nov 5, 2020)
01ab248  update deploy script (tgravescs, Nov 5, 2020)
4af41ba  fix comment (tgravescs, Nov 5, 2020)
dd22c44  fix deploy dir (tgravescs, Nov 5, 2020)
222709c  update (tgravescs, Nov 5, 2020)
53c563b  add back in tests (tgravescs, Nov 5, 2020)
9104771  minor updates (tgravescs, Nov 5, 2020)
9fd606c  put slack back (tgravescs, Nov 5, 2020)
c19c307  Merge remote-tracking branch 'origin/branch-0.3' into databricks301 (tgravescs, Nov 6, 2020)
dac240c  Upmerge, fix copyright, and remove extra import (tgravescs, Nov 6, 2020)
a06a6b3  cleanup imports (tgravescs, Nov 6, 2020)
72509a8  remove a couple more imports (tgravescs, Nov 6, 2020)
f813cf8  remove ZoneId import (tgravescs, Nov 6, 2020)
b67615f  Merge remote-tracking branch 'origin/branch-0.3' into databricks301 (tgravescs, Nov 6, 2020)
62cd45d  upmerge to the latest changes to GpuHashJoin (tgravescs, Nov 6, 2020)
c2126b3  Add fail on error for version file generation (tgravescs, Nov 6, 2020)
41c4b23  Merge remote-tracking branch 'origin/branch-0.3' into databricks301 (tgravescs, Nov 6, 2020)
c6568fe  revert fail on error in pom file (tgravescs, Nov 6, 2020)
2189bee  Merge remote-tracking branch 'origin/branch-0.3' into databricks301 (tgravescs, Nov 6, 2020)
1 change: 1 addition & 0 deletions docs/FAQ.md
@@ -29,6 +29,7 @@ top of these changes and release updates as quickly as possible.
The RAPIDS Accelerator for Apache Spark officially supports
[Apache Spark](get-started/getting-started-on-prem.md),
[Databricks Runtime 7.0](get-started/getting-started-databricks.md),
[Databricks Runtime 7.3](get-started/getting-started-databricks.md)
and [Google Cloud Dataproc](get-started/getting-started-gcp.md).
Most distributions based off of Apache Spark 3.0.0 should work, but because the plugin replaces
parts of the physical plan that Apache Spark considers to be internal the code for those plans
8 changes: 4 additions & 4 deletions docs/get-started/getting-started-databricks.md
@@ -9,15 +9,15 @@ parent: Getting-Started
This guide will run through how to set up the RAPIDS Accelerator for Apache Spark 3.0 on Databricks. At the end of this guide, the reader will be able to run a sample Apache Spark application that runs on NVIDIA GPUs on Databricks.

## Prerequisites
* Apache Spark 3.0 running in DataBricks Runtime 7.0 ML with GPU
* AWS: 7.0 ML (includes Apache Spark 3.0.0, GPU, Scala 2.12)
* Azure: 7.0 ML (GPU, Scala 2.12, Spark 3.0.0)
* Apache Spark 3.0 running in Databricks Runtime 7.0 ML with GPU or Runtime 7.3 ML with GPU
* AWS: 7.0 ML (includes Apache Spark 3.0.0, GPU, Scala 2.12) or 7.3 LTS ML (includes Apache Spark 3.0.1, GPU, Scala 2.12)
* Azure: 7.0 ML (GPU, Scala 2.12, Spark 3.0.0) or 7.3 LTS ML (GPU, Scala 2.12, Spark 3.0.1)

The number of GPUs per node dictates the number of Spark executors that can run in that node.

## Start a Databricks Cluster
Create a Databricks cluster by going to Clusters, then clicking “+ Create Cluster”. Ensure the cluster meets the prerequisites above by configuring it as follows:
1. On AWS, make sure to use 7.0 ML (GPU, Scala 2.12, Spark 3.0.0), or for Azure, choose 7.0 ML (GPU, Scala 2.12, Spark 3.0.0).
1. Select the Databricks Runtime Version from one of the supported runtimes specified in the Prerequisites section.
2. Under Autopilot Options, disable auto scaling.
3. Choose the number of workers that matches the number of GPUs you want to use.
4. Select a worker type. On AWS, use nodes with 1 GPU each such as `p3.xlarge` or `g4dn.xlarge`. p2 nodes do not meet the architecture requirements for the Spark worker (although they can be used for the driver node). For Azure, choose GPU nodes such as Standard_NC6s_v3.
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/spark_init_internal.py
@@ -20,9 +20,12 @@ def _spark__init():
# due to bugs in pyspark/pytest it looks like any configs set here
# can be reset in the middle of a test if specific operations are done (some types of cast etc)
# enableHiveSupport() is needed for parquet bucket tests
# disable adaptive query execution by default because some CSPs have it on by default and we
# don't support it everywhere
_s = SparkSession.builder \
.config('spark.plugins', 'com.nvidia.spark.SQLPlugin') \
.config('spark.sql.queryExecutionListeners', 'com.nvidia.spark.rapids.ExecutionPlanCaptureCallback')\
.config("spark.sql.adaptive.enabled", "false") \
.enableHiveSupport() \
.appName('rapids spark plugin integration tests (python)').getOrCreate()
#TODO catch the ClassNotFound error that happens if the classpath is not set up properly and
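For reference, a minimal standalone sketch of the session bootstrap this change produces, assuming the RAPIDS plugin and cuDF jars are already on the Spark classpath; the config keys and app name mirror the diff above, and the sanity-check assert at the end is added only for illustration:

```python
from pyspark.sql import SparkSession

# Minimal sketch of the integration-test session bootstrap, with adaptive
# query execution explicitly disabled so a CSP default cannot re-enable it.
_s = SparkSession.builder \
    .config('spark.plugins', 'com.nvidia.spark.SQLPlugin') \
    .config('spark.sql.queryExecutionListeners',
            'com.nvidia.spark.rapids.ExecutionPlanCaptureCallback') \
    .config('spark.sql.adaptive.enabled', 'false') \
    .enableHiveSupport() \
    .appName('rapids spark plugin integration tests (python)') \
    .getOrCreate()

# Quick sanity check that the CSP default did not override the setting.
assert _s.conf.get('spark.sql.adaptive.enabled') == 'false'
```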
116 changes: 116 additions & 0 deletions jenkins/Jenkinsfile.databricks301nightly
@@ -0,0 +1,116 @@
#!/usr/bin/env groovy
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
*
* Jenkinsfile for building rapids-plugin on Databricks based on Spark 3.0.1
*
*/
@Library(['shared-libs', 'spark-jenkins-shared-lib']) _

def urmUrl="https://${ArtifactoryConstants.ARTIFACTORY_NAME}/artifactory/sw-spark-maven"

pipeline {
agent {
docker {
label 'docker-gpu'
image "${ArtifactoryConstants.ARTIFACTORY_NAME}/sw-spark-docker/plugin:dev-ubuntu16-cuda10.1"
args '--runtime=nvidia -v ${HOME}/.m2:${HOME}/.m2:rw \
-v ${HOME}/.zinc:${HOME}/.zinc:rw'
}
}

options {
ansiColor('xterm')
// timeout doesn't seem to work with environment variables, so make sure to update the
// IDLE_TIMEOUT config below as well
timeout(time: 240, unit: 'MINUTES')
buildDiscarder(logRotator(numToKeepStr: '10'))
}

parameters {
choice(name: 'DEPLOY_TO', choices: ['Urm', 'Local'],
description: 'Where to deploy artifacts to')
string(name: 'RUNTIME',
defaultValue: '7.3.x-gpu-ml-scala2.12', description: 'databricks runtime')
string(name: 'PUBLIC_KEY',
defaultValue: '\"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDB+ValakyoKn7w+iBRoAi1KlLVH4yVmRXhLCZs1qUECBAhbck2o8Lgjp5wJ+epdT3+EAP2+t/zlK1mU9tTylidapR4fZuIk9ApoQSLOUEXcUtHkPpZulIoGAq78HoyiEs1sKovc6ULvpymjdnQ3ogCZlTlP9uqmL2E4kbtrNCNL0SVj/w10AqzrJ5lqQgO5dIdDRMHW2sv88JI1VLlfiSsofa9RdI7hDRuCnfZ0+dv2URJGGzGt2BkdEmk9t5F1BMpuXvZ8HzOYdACzw0U+THBOk9d4CACUYMyO1XqlXwoYweNKRnigXDCRaTWGFBzTkugRuW/BZBccTR1ON430uRB svcngcc@nvidia.com\"', description: 'public key')
string(name: 'REF', defaultValue: 'branch-0.3', description: 'Commit to build')
string(name: 'BASE_SPARK_VERSION',
defaultValue: '3.0.1', description: 'Databricks base Spark version')
string(name: 'BUILD_PROFILES',
defaultValue: 'databricks301,!snapshot-shims', description: 'the mvn build profiles to use when building Databricks')
}

environment {
IDLE_TIMEOUT = 240
JENKINS_ROOT = 'jenkins'
MVN_URM_MIRROR='-s jenkins/settings.xml -P mirror-apache-to-urm'
LIBCUDF_KERNEL_CACHE_PATH='/tmp'
URM_CREDS = credentials("svcngcc_artifactory")
DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
URM_URL = "${urmUrl}"
}

triggers {
cron('H 5 * * *')
}

stages {
stage('Ubuntu16 CUDA10.1') {
steps {
script {
sshagent(credentials : ['svcngcc_pubpriv']) {
sh "rm -rf spark-rapids-ci.tgz"
sh "tar -zcvf spark-rapids-ci.tgz *"
env.CLUSTERID = sh (
script: "python3.6 ./jenkins/databricks/create.py -t $DATABRICKS_TOKEN -k $PUBLIC_KEY -r $RUNTIME -i $IDLE_TIMEOUT -n CI-GPU-databricks-${BASE_SPARK_VERSION}",
returnStdout: true
).trim()
sh "python3.6 ./jenkins/databricks/run-tests.py -c ${env.CLUSTERID} -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -v $BASE_SPARK_VERSION -b $BUILD_PROFILES"
sh "./jenkins/databricks/deploy.sh"
}
}
}
}
} // end of stages
post {
always {
script {
sh "python3.6 ./jenkins/databricks/shutdown.py -c ${env.CLUSTERID} -t $DATABRICKS_TOKEN -d || true"
if (currentBuild.currentResult == "SUCCESS") {
slack("#swrapids-spark-cicd", "Success", color: "#33CC33")
} else {
slack("#swrapids-spark-cicd", "Failed", color: "#FF0000")
}
}
}
}
} // end of pipeline

void slack(Map params = [:], String channel, String message) {
Map defaultParams = [
color: "#000000",
baseUrl: "${SparkConstants.SLACK_API_ENDPOINT}",
tokenCredentialId: "slack_token"
]

params["channel"] = channel
params["message"] = "${BUILD_URL}\n" + message

slackSend(defaultParams << params)
}
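The create/test/deploy/shutdown sequence this pipeline drives can also be sketched outside Jenkins. The helper script names and flags below are taken from the pipeline above; the `run` and `run_nightly` wrappers and their argument handling are hypothetical:

```python
import subprocess

def run(cmd, capture=False):
    """Run a shell command and fail loudly, roughly like a Jenkins `sh` step."""
    result = subprocess.run(cmd, shell=True, check=True,
                            capture_output=capture, text=True)
    return result.stdout.strip() if capture else None

def run_nightly(token, pubkey, runtime, idle_timeout, base_spark_version, build_profiles):
    # Package the sources the same way the pipeline does.
    run("rm -rf spark-rapids-ci.tgz && tar -zcvf spark-rapids-ci.tgz *")
    # Create a Databricks cluster and remember its id.
    cluster_id = run(
        f"python3.6 ./jenkins/databricks/create.py -t {token} -k {pubkey} "
        f"-r {runtime} -i {idle_timeout} -n CI-GPU-databricks-{base_spark_version}",
        capture=True)
    try:
        # Build and test on the cluster, then deploy the resulting jar.
        run(f"python3.6 ./jenkins/databricks/run-tests.py -c {cluster_id} "
            f"-z ./spark-rapids-ci.tgz -t {token} -p /home/svcngcc/.ssh/id_rsa "
            f"-l ./jenkins/databricks/build.sh -v {base_spark_version} -b {build_profiles}")
        run("./jenkins/databricks/deploy.sh")
    finally:
        # Always shut the cluster down, mirroring the pipeline's post { always } block.
        run(f"python3.6 ./jenkins/databricks/shutdown.py -c {cluster_id} -t {token} -d || true")
```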
19 changes: 6 additions & 13 deletions jenkins/Jenkinsfile.databricksnightly
@@ -45,17 +45,15 @@ pipeline {
parameters {
choice(name: 'DEPLOY_TO', choices: ['Urm', 'Local'],
description: 'Where to deploy artifacts to')
string(name: 'DATABRICKS_VERSION',
defaultValue: '0.3.0-SNAPSHOT', description: 'Version to use for databricks jar produced')
string(name: 'CUDF_VERSION',
defaultValue: '0.17-SNAPSHOT', description: 'Cudf version to use')
string(name: 'CUDA_VERSION',
defaultValue: 'cuda10-1', description: 'cuda version to use')
string(name: 'RUNTIME',
defaultValue: '7.0.x-gpu-ml-scala2.12', description: 'databricks runtime')
string(name: 'PUBLIC_KEY',
defaultValue: '\"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDB+ValakyoKn7w+iBRoAi1KlLVH4yVmRXhLCZs1qUECBAhbck2o8Lgjp5wJ+epdT3+EAP2+t/zlK1mU9tTylidapR4fZuIk9ApoQSLOUEXcUtHkPpZulIoGAq78HoyiEs1sKovc6ULvpymjdnQ3ogCZlTlP9uqmL2E4kbtrNCNL0SVj/w10AqzrJ5lqQgO5dIdDRMHW2sv88JI1VLlfiSsofa9RdI7hDRuCnfZ0+dv2URJGGzGt2BkdEmk9t5F1BMpuXvZ8HzOYdACzw0U+THBOk9d4CACUYMyO1XqlXwoYweNKRnigXDCRaTWGFBzTkugRuW/BZBccTR1ON430uRB svcngcc@nvidia.com\"', description: 'public key')
string(name: 'REF', defaultValue: 'branch-0.3', description: 'Commit to build')
string(name: 'BASE_SPARK_VERSION',
defaultValue: '3.0.0', description: 'Databricks base Spark version')
string(name: 'BUILD_PROFILES',
defaultValue: 'databricks,!snapshot-shims', description: 'the mvn build profiles to use when building Databricks')
}

environment {
@@ -65,11 +63,6 @@
LIBCUDF_KERNEL_CACHE_PATH='/tmp'
URM_CREDS = credentials("svcngcc_artifactory")
DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
SCALA_VERSION = '2.12'
// the spark version used when we install databricks jars into .m2 directory
SPARK_VERSION_TO_INSTALL_JARS = '3.0.0-databricks'
CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
URM_URL = "${urmUrl}"
}

@@ -85,10 +78,10 @@
sh "rm -rf spark-rapids-ci.tgz"
sh "tar -zcvf spark-rapids-ci.tgz *"
env.CLUSTERID = sh (
script: "python3.6 ./jenkins/databricks/create.py -t $DATABRICKS_TOKEN -k $PUBLIC_KEY -r $RUNTIME -i $IDLE_TIMEOUT -n CI-GPU-databricks-${DATABRICKS_VERSION}",
script: "python3.6 ./jenkins/databricks/create.py -t $DATABRICKS_TOKEN -k $PUBLIC_KEY -r $RUNTIME -i $IDLE_TIMEOUT -n CI-GPU-databricks-${BASE_SPARK_VERSION}",
returnStdout: true
).trim()
sh "python3.6 ./jenkins/databricks/run-tests.py -c ${env.CLUSTERID} -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION_TO_INSTALL_JARS -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
sh "python3.6 ./jenkins/databricks/run-tests.py -c ${env.CLUSTERID} -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -v $BASE_SPARK_VERSION -b $BUILD_PROFILES"
sh "./jenkins/databricks/deploy.sh"
}
}
42 changes: 23 additions & 19 deletions jenkins/databricks/build.sh
@@ -18,32 +18,37 @@
set -e

SPARKSRCTGZ=$1
# this should match whatever is in the pom files for the version
SPARK_PLUGIN_JAR_VERSION=$2
SCALA_VERSION=$3
CI_RAPIDS_JAR=$4
# the version of spark used when we install the databricks jars in .m2
SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=$5
CUDF_VERSION=$6
CUDA_VERSION=$7
CI_CUDF_JAR=$8
# version of Apache Spark we are building against
BASE_SPARK_POM_VERSION=$9
BASE_SPARK_VERSION=$2
BUILD_PROFILES=$3

echo "tgz is $SPARKSRCTGZ"
echo "Base Spark version is $BASE_SPARK_VERSION"
echo "build profiles $BUILD_PROFILES"

echo "Spark version is $SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS"
echo "scala version is: $SCALA_VERSION"
sudo apt install -y maven

# this has to match the Databricks init script
DB_JAR_LOC=/databricks/jars/
RAPIDS_BUILT_JAR=rapids-4-spark_$SCALA_VERSION-$SPARK_PLUGIN_JAR_VERSION.jar

sudo apt install -y maven
rm -rf spark-rapids
mkdir spark-rapids
echo "tar -zxvf $SPARKSRCTGZ -C spark-rapids"
tar -zxvf $SPARKSRCTGZ -C spark-rapids
cd spark-rapids
export WORKSPACE=`pwd`
mvn -B '-Pdatabricks,!snapshot-shims' clean package -DskipTests || true

SPARK_PLUGIN_JAR_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=project.version -DforceStdout`
CUDF_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=cudf.version -DforceStdout`
SCALA_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=scala.binary.version -DforceStdout`
CUDA_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=cuda.version -DforceStdout`

# the version of spark used when we install the databricks jars in .m2
SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=$BASE_SPARK_VERSION-databricks
RAPIDS_BUILT_JAR=rapids-4-spark_$SCALA_VERSION-$SPARK_PLUGIN_JAR_VERSION.jar

echo "Scala version is: $SCALA_VERSION"
mvn -B -P${BUILD_PROFILES} clean package -DskipTests || true
# export 'M2DIR' so that shims can get the correct cudf/spark dependency info
export M2DIR=/home/ubuntu/.m2/repository
CUDF_JAR=${M2DIR}/ai/rapids/cudf/${CUDF_VERSION}/cudf-${CUDF_VERSION}-${CUDA_VERSION}.jar
@@ -55,8 +60,8 @@ CATALYSTJAR=----workspace_spark_3_0--sql--catalyst--catalyst-hive-2.3__hadoop-2.
ANNOTJAR=----workspace_spark_3_0--common--tags--tags-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
COREJAR=----workspace_spark_3_0--core--core-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
# install the 3.0.0 pom file so we get dependencies
COREPOM=spark-core_${SCALA_VERSION}-${BASE_SPARK_POM_VERSION}.pom
COREPOMPATH=$M2DIR/org/apache/spark/spark-core_${SCALA_VERSION}/${BASE_SPARK_POM_VERSION}
COREPOM=spark-core_${SCALA_VERSION}-${BASE_SPARK_VERSION}.pom
COREPOMPATH=$M2DIR/org/apache/spark/spark-core_${SCALA_VERSION}/${BASE_SPARK_VERSION}
mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$COREJAR \
@@ -90,8 +95,7 @@ mvn -B install:install-file \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B '-Pdatabricks,!snapshot-shims' clean package -DskipTests

mvn -B -P${BUILD_PROFILES} clean package -DskipTests

# Copy so we pick up the newly built jar and latest cuDF jar. Note that the jar names have to be
# exactly what is in the statically set up Databricks cluster we use.
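The main change above is that build.sh now derives the plugin, cuDF, Scala, and CUDA versions from Maven properties via `mvn help:evaluate` instead of taking them as positional arguments. A rough sketch of that lookup, with the Maven flags copied from the diff and the small Python wrapper being hypothetical:

```python
import subprocess

def mvn_property(expression, module="dist"):
    """Evaluate a Maven property the same way build.sh now does."""
    cmd = ["mvn", "help:evaluate", "-q", "-pl", module,
           f"-Dexpression={expression}", "-DforceStdout"]
    return subprocess.run(cmd, check=True, capture_output=True, text=True).stdout.strip()

# Run from the spark-rapids checkout; these mirror the four lookups in build.sh.
plugin_version = mvn_property("project.version")
cudf_version = mvn_property("cudf.version")
scala_version = mvn_property("scala.binary.version")
cuda_version = mvn_property("cuda.version")

cudf_jar = f"cudf-{cudf_version}-{cuda_version}.jar"
rapids_built_jar = f"rapids-4-spark_{scala_version}-{plugin_version}.jar"
```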
11 changes: 9 additions & 2 deletions jenkins/databricks/deploy.sh
@@ -24,6 +24,13 @@ cd spark-rapids
echo "Maven mirror is $MVN_URM_MIRROR"
SERVER_ID='snapshots'
SERVER_URL="$URM_URL-local"
DBJARFPATH=./shims/spark300db/target/rapids-4-spark-shims-spark300-databricks_$SCALA_VERSION-$DATABRICKS_VERSION.jar
SCALA_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=scala.binary.version -DforceStdout`
# remove the periods, changing something like 3.0.0 to 300
VERSION_NUM=${BASE_SPARK_VERSION//.}
SPARK_VERSION_STR=spark$VERSION_NUM
SPARK_PLUGIN_JAR_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=project.version -DforceStdout`
DB_SHIM_DIRECTORY=${SPARK_VERSION_STR}db
DBJARFPATH=./shims/${DB_SHIM_DIRECTORY}/target/rapids-4-spark-shims-$SPARK_VERSION_STR-databricks_$SCALA_VERSION-$SPARK_PLUGIN_JAR_VERSION.jar
echo "Databricks jar is: $DBJARFPATH"
mvn -B deploy:deploy-file $MVN_URM_MIRROR '-P!snapshot-shims' -Durl=$SERVER_URL -DrepositoryId=$SERVER_ID \
-Dfile=$DBJARFPATH -DpomFile=shims/spark300db/pom.xml
-Dfile=$DBJARFPATH -DpomFile=shims/${DB_SHIM_DIRECTORY}/pom.xml
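deploy.sh now derives the shim jar path from `BASE_SPARK_VERSION` instead of hard-coding `spark300db`. A sketch of the same string manipulation, with the `shim_jar_path` helper being hypothetical and the naming scheme taken from the script above:

```python
def shim_jar_path(base_spark_version, scala_version, plugin_version):
    """Mirror deploy.sh: 3.0.1 -> 301 -> shims/spark301db/... jar path."""
    version_num = base_spark_version.replace(".", "")      # remove the periods
    spark_version_str = f"spark{version_num}"
    db_shim_directory = f"{spark_version_str}db"
    return (f"./shims/{db_shim_directory}/target/"
            f"rapids-4-spark-shims-{spark_version_str}-databricks_"
            f"{scala_version}-{plugin_version}.jar")

# Example (version values are illustrative):
# shim_jar_path("3.0.1", "2.12", "0.3.0-SNAPSHOT")
#   -> ./shims/spark301db/target/rapids-4-spark-shims-spark301-databricks_2.12-0.3.0-SNAPSHOT.jar
```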