Change databricks build to dynamically create a cluster (NVIDIA#981)
* Add some more checks to databricks build scripts

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* remove extra newline

* use the right -gt for bash

* Add new python file for databricks cluster utils

* Fix up scripts

* databricks scripts working

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* Pass in sshkey

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* cluster creation script mods

* fix

* fix pub key

* fix missing quote

* fix $

* update public key to be param

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* Add public key value

* cleanup

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* modify permissions

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* change loc cluster id file

* fix extra /

* quote public key

* try different setting cluster id

* debug

* try again

* try readfile

* try again

* try quotes

* cleanup

* Add option to control number of partitions when converting from CSV to Parquet (NVIDIA#915)

* Add command-line arguments for applying coalesce and repartition on a per-table basis

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Move command-line validation logic and address other feedback

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Update copyright years and fix import order

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Update docs/benchmarks.md

Co-authored-by: Jason Lowe <jlowe@nvidia.com>

* Remove withPartitioning option from TPC-H and TPC-xBB file conversion

Signed-off-by: Andy Grove <andygrove@nvidia.com>

Co-authored-by: Jason Lowe <jlowe@nvidia.com>

* Benchmark runner script (NVIDIA#918)

* Benchmark runner script

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Add argument for number of iterations

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Fix docs

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* add license

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* improve documentation for the configuration files

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Add missing line-continuation symbol in example

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Remove hard-coded spark-submit-template.txt and add --template argument. Also make all arguments required.

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Update benchmarking guide to link to the benchmark python script

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Add --template to example and fix markdown header

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Add legacy config to clear active Spark 3.1.0 session in tests (NVIDIA#970)

Signed-off-by: Jason Lowe <jlowe@nvidia.com>

* XFail tests until final fix can be put in (NVIDIA#968)

Signed-off-by: Robert (Bobby) Evans <bobby@apache.org>

* Stop reporting totalTime metric for GpuShuffleExchangeExec (NVIDIA#973)

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* Add some more checks to databricks build scripts

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* Pass in sshkey

* Add create script, add more parameters, etc

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* add create script

* rework some scripts

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* fix is_cluster_running

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* put slack back in

* update text

* cleanup

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* remove datetime

* send output to stderr

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

Co-authored-by: Andy Grove <andygrove@users.noreply.github.com>
Co-authored-by: Jason Lowe <jlowe@nvidia.com>
Co-authored-by: Robert (Bobby) Evans <bobby@apache.org>
4 people authored and sperlingxx committed Nov 20, 2020
1 parent 6626ff1 commit 9304dc6
Showing 6 changed files with 311 additions and 107 deletions.
22 changes: 16 additions & 6 deletions jenkins/Jenkinsfile.databricksnightly
@@ -36,6 +36,8 @@ pipeline {

options {
ansiColor('xterm')
// timeout doesn't seem to work with environment variables, so make sure to update the
// IDLE_TIMEOUT config below as well
timeout(time: 180, unit: 'MINUTES')
buildDiscarder(logRotator(numToKeepStr: '10'))
}
@@ -44,24 +46,28 @@
choice(name: 'DEPLOY_TO', choices: ['Urm', 'Local'],
description: 'Where to deploy artifacts to')
string(name: 'DATABRICKS_VERSION',
defaultValue: '0.3.0-SNAPSHOT', description: 'Version to set')
defaultValue: '0.3.0-SNAPSHOT', description: 'Version to use for databricks jar produced')
string(name: 'CUDF_VERSION',
defaultValue: '0.16-SNAPSHOT', description: 'Cudf version to use')
string(name: 'CUDA_VERSION',
defaultValue: 'cuda10-1', description: 'cuda version to use')
string(name: 'CLUSTER_ID',
defaultValue: '0909-141326-pawl52', description: 'databricks cluster id')
string(name: 'RUNTIME',
defaultValue: '7.0.x-gpu-ml-scala2.12', description: 'databricks runtime')
string(name: 'PUBLIC_KEY',
defaultValue: '\"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDB+ValakyoKn7w+iBRoAi1KlLVH4yVmRXhLCZs1qUECBAhbck2o8Lgjp5wJ+epdT3+EAP2+t/zlK1mU9tTylidapR4fZuIk9ApoQSLOUEXcUtHkPpZulIoGAq78HoyiEs1sKovc6ULvpymjdnQ3ogCZlTlP9uqmL2E4kbtrNCNL0SVj/w10AqzrJ5lqQgO5dIdDRMHW2sv88JI1VLlfiSsofa9RdI7hDRuCnfZ0+dv2URJGGzGt2BkdEmk9t5F1BMpuXvZ8HzOYdACzw0U+THBOk9d4CACUYMyO1XqlXwoYweNKRnigXDCRaTWGFBzTkugRuW/BZBccTR1ON430uRB svcngcc@nvidia.com\"', description: 'public key')
string(name: 'REF', defaultValue: 'branch-0.3', description: 'Commit to build')
}

environment {
IDLE_TIMEOUT = 180
JENKINS_ROOT = 'jenkins'
MVN_URM_MIRROR='-s jenkins/settings.xml -P mirror-apache-to-urm'
LIBCUDF_KERNEL_CACHE_PATH='/tmp'
URM_CREDS = credentials("svcngcc_artifactory")
DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
SCALA_VERSION = '2.12'
SPARK_VERSION = '3.0.0-databricks'
// the spark version used when we install databricks jars into .m2 directory
SPARK_VERSION_TO_INSTALL_JARS = '3.0.0-databricks'
CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
URM_URL = "${urmUrl}"
@@ -78,7 +84,11 @@ pipeline {
sshagent(credentials : ['svcngcc_pubpriv']) {
sh "rm -rf spark-rapids-ci.tgz"
sh "tar -zcvf spark-rapids-ci.tgz *"
sh "python3.6 ./jenkins/databricks/run-tests.py -c $CLUSTER_ID -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
env.CLUSTERID = sh (
script: "python3.6 ./jenkins/databricks/create.py -t $DATABRICKS_TOKEN -k $PUBLIC_KEY -r $RUNTIME -i $IDLE_TIMEOUT -n CI-GPU-databricks-${DATABRICKS_VERSION}",
returnStdout: true
).trim()
sh "python3.6 ./jenkins/databricks/run-tests.py -c ${env.CLUSTERID} -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION_TO_INSTALL_JARS -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
sh "./jenkins/databricks/deploy.sh"
}
}
@@ -88,7 +98,7 @@
post {
always {
script {
sh "python3.6 ./jenkins/databricks/shutdown.py -c $CLUSTER_ID -t $DATABRICKS_TOKEN || true"
sh "python3.6 ./jenkins/databricks/shutdown.py -c ${env.CLUSTERID} -t $DATABRICKS_TOKEN -d || true"
if (currentBuild.currentResult == "SUCCESS") {
slack("#swrapids-spark-cicd", "Success", color: "#33CC33")
} else {
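Note on the change above: the Build stage now creates the cluster with jenkins/databricks/create.py and captures that script's standard output as env.CLUSTERID, so create.py has to keep stdout limited to the cluster id and send all diagnostics to stderr (the "send output to stderr" commit in the message above). create.py is one of the six changed files but is not in the portion of the diff loaded here, so the following is only a rough sketch of such a driver built on the new ClusterUtils class shown further down; the -w option, the placeholder workspace URL, the node types, and the worker count are assumptions, while -t, -k, -r, -i and -n mirror the Jenkinsfile invocation.

# Hypothetical sketch of jenkins/databricks/create.py -- not the actual file from this
# commit. Flags -t/-k/-r/-i/-n come from the Jenkinsfile; everything else is assumed.
import getopt
import sys

from clusterutils import ClusterUtils


def main():
    workspace = 'https://dbc-00000000-0000.cloud.databricks.com'  # assumed placeholder
    token = ''
    sshkey = ''
    runtime = '7.0.x-gpu-ml-scala2.12'
    idletime = 180
    cluster_name = 'CI-GPU-databricks'
    num_workers = 1                      # assumed
    driver_node_type = 'g4dn.xlarge'     # assumed
    worker_node_type = 'g4dn.xlarge'     # assumed

    opts, _ = getopt.getopt(sys.argv[1:], 'w:t:k:r:i:n:')
    for opt, arg in opts:
        if opt == '-w':
            workspace = arg
        elif opt == '-t':
            token = arg
        elif opt == '-k':
            sshkey = arg
        elif opt == '-r':
            runtime = arg
        elif opt == '-i':
            idletime = int(arg)
        elif opt == '-n':
            cluster_name = arg

    # Diagnostics go to stderr so stdout carries only the cluster id, which the
    # Jenkinsfile captures via returnStdout: true.
    templ = ClusterUtils.generate_create_templ(sshkey, cluster_name, runtime, idletime,
                                               num_workers, driver_node_type,
                                               worker_node_type, printLoc=sys.stderr)
    clusterid = ClusterUtils.create_cluster(workspace, templ, token, printLoc=sys.stderr)
    ClusterUtils.wait_for_cluster_start(workspace, clusterid, token, printLoc=sys.stderr)
    print(clusterid)


if __name__ == '__main__':
    main()

Keeping stdout clean is also why the ClusterUtils helpers take a printLoc argument instead of printing unconditionally to stdout.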
36 changes: 19 additions & 17 deletions jenkins/databricks/build.sh
@@ -17,29 +17,30 @@

set -e

SPARKTGZ=$1
DATABRICKS_VERSION=$2
SPARKSRCTGZ=$1
# this should match whatever is in the pom files for the version
SPARK_PLUGIN_JAR_VERSION=$2
SCALA_VERSION=$3
CI_RAPIDS_JAR=$4
SPARK_VERSION=$5
# the version of spark used when we install the databricks jars in .m2
SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=$5
CUDF_VERSION=$6
CUDA_VERSION=$7
CI_CUDF_JAR=$8
# version of Apache Spark we are building against
BASE_SPARK_POM_VERSION=$9

echo "Spark version is $SPARK_VERSION"
echo "Spark version is $SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS"
echo "scala version is: $SCALA_VERSION"

# this has to match the Databricks init script
DB_JAR_LOC=/databricks/jars
DB_RAPIDS_JAR_LOC=$DB_JAR_LOC/$CI_RAPIDS_JAR
DB_CUDF_JAR_LOC=$DB_JAR_LOC/$CI_CUDF_JAR
RAPIDS_BUILT_JAR=rapids-4-spark_$SCALA_VERSION-$DATABRICKS_VERSION.jar
DB_JAR_LOC=/databricks/jars/
RAPIDS_BUILT_JAR=rapids-4-spark_$SCALA_VERSION-$SPARK_PLUGIN_JAR_VERSION.jar

sudo apt install -y maven
rm -rf spark-rapids
mkdir spark-rapids
tar -zxvf $SPARKTGZ -C spark-rapids
tar -zxvf $SPARKSRCTGZ -C spark-rapids
cd spark-rapids
export WORKSPACE=`pwd`
mvn -B '-Pdatabricks,!snapshot-shims' clean package -DskipTests || true
@@ -60,7 +61,7 @@ mvn -B install:install-file \
-Dfile=$JARDIR/$COREJAR \
-DgroupId=org.apache.spark \
-DartifactId=spark-core_$SCALA_VERSION \
-Dversion=$SPARK_VERSION \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar \
-DpomFile=$COREPOMPATH/$COREPOM

@@ -69,33 +70,34 @@ mvn -B install:install-file \
-Dfile=$JARDIR/$CATALYSTJAR \
-DgroupId=org.apache.spark \
-DartifactId=spark-catalyst_$SCALA_VERSION \
-Dversion=$SPARK_VERSION \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$SQLJAR \
-DgroupId=org.apache.spark \
-DartifactId=spark-sql_$SCALA_VERSION \
-Dversion=$SPARK_VERSION \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$ANNOTJAR \
-DgroupId=org.apache.spark \
-DartifactId=spark-annotation_$SCALA_VERSION \
-Dversion=$SPARK_VERSION \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B '-Pdatabricks,!snapshot-shims' clean package -DskipTests


# Copy so we pick up the newly built jar and the latest CuDF jar. Note that the jar names have to be
# exactly what is in the statically set up Databricks cluster we use.
echo "Copying rapids jars: dist/target/$RAPIDS_BUILT_JAR $DB_RAPIDS_JAR_LOC"
sudo cp dist/target/$RAPIDS_BUILT_JAR $DB_RAPIDS_JAR_LOC
echo "Copying cudf jars: $CUDF_JAR $DB_CUDF_JAR_LOC"
sudo cp $CUDF_JAR $DB_CUDF_JAR_LOC
echo "Copying rapids jars: dist/target/$RAPIDS_BUILT_JAR $DB_JAR_LOC"
sudo cp dist/target/$RAPIDS_BUILT_JAR $DB_JAR_LOC
echo "Copying cudf jars: $CUDF_JAR $DB_JAR_LOC"
sudo cp $CUDF_JAR $DB_JAR_LOC

# tests
export PATH=/databricks/conda/envs/databricks-ml-gpu/bin:/databricks/conda/condabin:$PATH
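The script above now takes nine positional arguments; the renamed $2 and $5 and the trailing $9 make it explicit which version string goes into the built jar name, which one is used when installing the Databricks jars into the local .m2, and which Apache Spark base version the build targets. As a reminder of that contract, here is a hedged sketch of how a caller such as run-tests.py might assemble the remote invocation; run-tests.py itself is not shown in this diff, so the variable names, the ubuntu login, and the ssh plumbing are assumptions, while the argument order matches $1..$9 above.

# Hypothetical sketch of how a driver such as run-tests.py might launch build.sh on the
# cluster master over ssh -- not the actual script. Only the positional order is taken
# from build.sh above; everything else is assumed.
import subprocess


def run_remote_build(master_addr, private_key_path, params):
    build_cmd = ' '.join([
        'bash', 'build.sh',
        params['spark_src_tgz'],              # $1 SPARKSRCTGZ
        params['plugin_jar_version'],         # $2 SPARK_PLUGIN_JAR_VERSION
        params['scala_version'],              # $3 SCALA_VERSION
        params['ci_rapids_jar'],              # $4 CI_RAPIDS_JAR
        params['spark_version_to_install'],   # $5 SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS
        params['cudf_version'],               # $6 CUDF_VERSION
        params['cuda_version'],               # $7 CUDA_VERSION
        params['ci_cudf_jar'],                # $8 CI_CUDF_JAR
        params['base_spark_pom_version'],     # $9 BASE_SPARK_POM_VERSION
    ])
    # Databricks exposes ssh to the driver node on port 2200.
    subprocess.run(['ssh', '-i', private_key_path, '-p', '2200',
                    'ubuntu@%s' % master_addr, build_cmd], check=True)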
151 changes: 151 additions & 0 deletions jenkins/databricks/clusterutils.py
@@ -0,0 +1,151 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import sys
import time

import requests

class ClusterUtils(object):

    @staticmethod
    def generate_create_templ(sshKey, cluster_name, runtime, idle_timeout,
            num_workers, driver_node_type, worker_node_type,
            printLoc=sys.stdout):
        timeStr = str(int(time.time()))
        uniq_name = cluster_name + "-" + timeStr
        templ = {}
        templ['cluster_name'] = uniq_name
        print("cluster name is going to be %s" % uniq_name, file=printLoc)
        templ['spark_version'] = runtime
        templ['aws_attributes'] = {
            "zone_id": "us-west-2a",
            "first_on_demand": 1,
            "availability": "SPOT_WITH_FALLBACK",
            "spot_bid_price_percent": 100,
            "ebs_volume_count": 0
        }
        templ['autotermination_minutes'] = idle_timeout
        templ['enable_elastic_disk'] = 'false'
        templ['enable_local_disk_encryption'] = 'false'
        templ['node_type_id'] = worker_node_type
        templ['driver_node_type_id'] = driver_node_type
        templ['ssh_public_keys'] = [ sshKey ]
        templ['num_workers'] = num_workers
        return templ


    @staticmethod
    def create_cluster(workspace, jsonCreateTempl, token, printLoc=sys.stdout):
        resp = requests.post(workspace + "/api/2.0/clusters/create", headers={'Authorization': 'Bearer %s' % token}, json=jsonCreateTempl)
        print("create response is %s" % resp.text, file=printLoc)
        clusterid = resp.json()['cluster_id']
        print("cluster id is %s" % clusterid, file=printLoc)
        return clusterid


    @staticmethod
    def wait_for_cluster_start(workspace, clusterid, token, retries=20, printLoc=sys.stdout):
        p = 0
        waiting = True
        jsonout = None
        while waiting:
            time.sleep(30)
            jsonout = ClusterUtils.cluster_state(workspace, clusterid, token, printLoc=printLoc)
            current_state = jsonout['state']
            print(clusterid + " state:" + current_state, file=printLoc)
            if current_state in ['RUNNING']:
                break
            if current_state in ['INTERNAL_ERROR', 'SKIPPED', 'TERMINATED'] or p >= retries:
                if p >= retries:
                    print("Waited %d times already, stopping" % p, file=printLoc)
                    sys.exit(4)
            p = p + 1
        print("Done starting cluster", file=printLoc)
        return jsonout


    @staticmethod
    def is_cluster_running(jsonout):
        current_state = jsonout['state']
        if current_state in ['RUNNING', 'RESIZING']:
            return True
        else:
            return False


    @staticmethod
    def terminate_cluster(workspace, clusterid, token, printLoc=sys.stdout):
        jsonout = ClusterUtils.cluster_state(workspace, clusterid, token, printLoc=printLoc)
        if not ClusterUtils.is_cluster_running(jsonout):
            print("Cluster is not running", file=printLoc)
            sys.exit(1)

        print("Stopping cluster: " + clusterid, file=printLoc)
        resp = requests.post(workspace + "/api/2.0/clusters/delete", headers={'Authorization': 'Bearer %s' % token}, json={'cluster_id': clusterid})
        print("stop response is %s" % resp.text, file=printLoc)
        print("Done stopping cluster", file=printLoc)


    @staticmethod
    def delete_cluster(workspace, clusterid, token, printLoc=sys.stdout):
        print("Deleting cluster: " + clusterid, file=printLoc)
        resp = requests.post(workspace + "/api/2.0/clusters/permanent-delete", headers={'Authorization': 'Bearer %s' % token}, json={'cluster_id': clusterid})
        print("delete response is %s" % resp.text, file=printLoc)
        print("Done deleting cluster", file=printLoc)


    @staticmethod
    def start_existing_cluster(workspace, clusterid, token, printLoc=sys.stdout):
        print("Starting cluster: " + clusterid, file=printLoc)
        resp = requests.post(workspace + "/api/2.0/clusters/start", headers={'Authorization': 'Bearer %s' % token}, json={'cluster_id': clusterid})
        print("start response is %s" % resp.text, file=printLoc)


    @staticmethod
    def cluster_state(workspace, clusterid, token, printLoc=sys.stdout):
        clusterresp = requests.get(workspace + "/api/2.0/clusters/get?cluster_id=%s" % clusterid, headers={'Authorization': 'Bearer %s' % token})
        clusterjson = clusterresp.text
        print("cluster response is %s" % clusterjson, file=printLoc)
        jsonout = json.loads(clusterjson)
        return jsonout


    @staticmethod
    def get_master_addr_from_json(jsonout):
        master_addr = None
        if ClusterUtils.is_cluster_running(jsonout):
            driver = jsonout['driver']
            master_addr = driver["public_dns"]
        return master_addr


    @staticmethod
    def cluster_list(workspace, token, printLoc=sys.stdout):
        clusterresp = requests.get(workspace + "/api/2.0/clusters/list", headers={'Authorization': 'Bearer %s' % token})
        clusterjson = clusterresp.text
        print("cluster list is %s" % clusterjson, file=printLoc)
        jsonout = json.loads(clusterjson)
        return jsonout


    @staticmethod
    def cluster_get_master_addr(workspace, clusterid, token, printLoc=sys.stdout):
        jsonout = ClusterUtils.cluster_state(workspace, clusterid, token, printLoc=printLoc)
        addr = ClusterUtils.get_master_addr_from_json(jsonout)
        print("master addr is %s" % addr, file=printLoc)
        return addr

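For completeness, here is a hedged sketch of how the post-build cleanup might use this class. The Jenkinsfile's post step now calls jenkins/databricks/shutdown.py with the captured cluster id and a new -d flag; shutdown.py is part of this commit but not in the loaded portion of the diff, so the option handling and the meaning of -d (permanently delete the dynamically created cluster rather than just terminating it) are assumptions based on the terminate_cluster and delete_cluster helpers above.

# Hypothetical sketch of jenkins/databricks/shutdown.py after this change -- not the
# actual file. Flags -c/-t/-d mirror the Jenkinsfile post step; -w and the placeholder
# workspace URL are assumptions.
import getopt
import sys

from clusterutils import ClusterUtils


def main():
    workspace = 'https://dbc-00000000-0000.cloud.databricks.com'  # assumed placeholder
    token = ''
    clusterid = ''
    delete = False

    opts, _ = getopt.getopt(sys.argv[1:], 'w:t:c:d')
    for opt, arg in opts:
        if opt == '-w':
            workspace = arg
        elif opt == '-t':
            token = arg
        elif opt == '-c':
            clusterid = arg
        elif opt == '-d':
            delete = True

    # Terminate first (clusters/delete only stops the cluster); with -d also remove it
    # permanently so dynamically created CI clusters do not accumulate in the workspace.
    ClusterUtils.terminate_cluster(workspace, clusterid, token, printLoc=sys.stderr)
    if delete:
        ClusterUtils.delete_cluster(workspace, clusterid, token, printLoc=sys.stderr)


if __name__ == '__main__':
    main()

The trailing || true on the shutdown call in the Jenkinsfile keeps a cleanup failure, such as a cluster that has already terminated, from failing the build.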
