Commit

Split DB scripts to make them common for the build and IT pipeline (NVIDIA#1933)

* Split DB scripts to make them common for the build and IT pipeline

Signed-off-by: Tim Liu <timl@nvidia.com>

* Update getopt for jar_path

* Add 'cluster_type' parameter to make create.py common for aws and azure Databricks

Signed-off-by: Tim Liu <timl@nvidia.com>

* Change to use a more readable var name

Change the var name from cluster_type to cloud_provider.

Signed-off-by: Tim Liu <timl@nvidia.com>
NvTimLiu authored Mar 16, 2021
1 parent d54b86f commit 442b518
Showing 7 changed files with 188 additions and 109 deletions.
31 changes: 0 additions & 31 deletions jenkins/databricks/build.sh
@@ -103,36 +103,5 @@ mvn -B install:install-file \

mvn -B -P${BUILD_PROFILES} clean package -DskipTests

# Copy so we pick up new built jar and latest CuDF jar. Note that the jar names have to be
# exactly what is in the statically setup Databricks cluster we use.
echo "Copying rapids jars: dist/target/$RAPIDS_BUILT_JAR udf-examples/target/$RAPIDS_UDF_JAR $DB_JAR_LOC"
sudo cp dist/target/$RAPIDS_BUILT_JAR udf-examples/target/$RAPIDS_UDF_JAR $DB_JAR_LOC
echo "Copying cudf jars: $CUDF_JAR $DB_JAR_LOC"
sudo cp $CUDF_JAR $DB_JAR_LOC

# tests
export PATH=/databricks/conda/envs/databricks-ml-gpu/bin:/databricks/conda/condabin:$PATH
sudo /databricks/conda/envs/databricks-ml-gpu/bin/pip install pytest sre_yield requests pandas \
pyarrow findspark pytest-xdist
cd /home/ubuntu/spark-rapids/integration_tests
export SPARK_HOME=/databricks/spark
# change to not point at databricks confs so we don't conflict with their settings
export SPARK_CONF_DIR=$PWD
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip
sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
sudo chmod 777 /databricks/data/logs/
sudo chmod 777 /databricks/data/logs/*
echo { \"port\":\"15002\" } > ~/.databricks-connect
if [ `ls $DB_JAR_LOC/rapids* | wc -l` -gt 2 ]; then
echo "ERROR: Too many rapids jars in $DB_JAR_LOC"
ls $DB_JAR_LOC/rapids*
exit 1
fi
if [ `ls $DB_JAR_LOC/cudf* | wc -l` -gt 1 ]; then
echo "ERROR: Too many cudf jars in $DB_JAR_LOC"
ls $DB_JAR_LOC/cudf*
exit 1
fi
bash run_pyspark_from_build.sh --runtime_env="databricks"
cd /home/ubuntu
tar -zcvf spark-rapids-built.tgz spark-rapids
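
Note: with the test steps factored out into the new test.sh (added below), build.sh now stops after staging the jars. A minimal sketch of how the script is driven on the cluster master, mirroring the ssh command assembled in run-build.py below; the tarball path, Spark POM version, and profile list are the params.py defaults and are placeholders here:

# argument order matches the ssh command in run-build.py
./build.sh /home/ubuntu/spark-rapids-ci.tgz 3.0.0 'databricks,!snapshot-shims'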
17 changes: 9 additions & 8 deletions jenkins/databricks/clusterutils.py
@@ -23,21 +23,22 @@ class ClusterUtils(object):

@staticmethod
def generate_create_templ(sshKey, cluster_name, runtime, idle_timeout,
num_workers, driver_node_type, worker_node_type,
num_workers, driver_node_type, worker_node_type, cloud_provider,
printLoc=sys.stdout):
timeStr = str(int(time.time()))
uniq_name = cluster_name + "-" + timeStr
templ = {}
templ['cluster_name'] = uniq_name
print("cluster name is going to be %s" % uniq_name, file=printLoc)
templ['spark_version'] = runtime
templ['aws_attributes'] = {
"zone_id": "us-west-2a",
"first_on_demand": 1,
"availability": "SPOT_WITH_FALLBACK",
"spot_bid_price_percent": 100,
"ebs_volume_count": 0
}
if cloud_provider == 'aws':
templ['aws_attributes'] = {
"zone_id": "us-west-2a",
"first_on_demand": 1,
"availability": "SPOT_WITH_FALLBACK",
"spot_bid_price_percent": 100,
"ebs_volume_count": 0
}
templ['autotermination_minutes'] = idle_timeout
templ['enable_elastic_disk'] = 'false'
templ['enable_local_disk_encryption'] = 'false'
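
Note: the create template now includes aws_attributes only when the provider is aws. A quick check of that branch, run from jenkins/databricks with the module's Python dependencies available; the ssh key and runtime string are illustrative placeholders:

python - <<'EOF'
from clusterutils import ClusterUtils
for provider in ('aws', 'azure'):
    templ = ClusterUtils.generate_create_templ(
        'ssh-rsa AAAA...placeholder', 'ci-test', '7.3.x-gpu-ml-scala2.12',
        120, 1, 'g4dn.xlarge', 'g4dn.xlarge', provider)
    print(provider, '->', 'aws_attributes' in templ)  # True only for aws
EOF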
14 changes: 9 additions & 5 deletions jenkins/databricks/create.py
@@ -33,20 +33,21 @@ def main():
num_workers = 1
worker_type = 'g4dn.xlarge'
driver_type = 'g4dn.xlarge'
cloud_provider = 'aws'

try:
opts, args = getopt.getopt(sys.argv[1:], 'hw:t:k:n:i:r:o:d:e:',
opts, args = getopt.getopt(sys.argv[1:], 'hw:t:k:n:i:r:o:d:e:s:',
['workspace=', 'token=', 'sshkey=', 'clustername=', 'idletime=',
'runtime=', 'workertype=', 'drivertype=', 'numworkers='])
'runtime=', 'workertype=', 'drivertype=', 'numworkers=', 'cloudprovider='])
except getopt.GetoptError:
print(
'create.py -w <workspace> -t <token> -k <sshkey> -n <clustername> -i <idletime> -r <runtime> -o <workernodetype> -d <drivernodetype> -e <numworkers>')
'create.py -w <workspace> -t <token> -k <sshkey> -n <clustername> -i <idletime> -r <runtime> -o <workernodetype> -d <drivernodetype> -e <numworkers> -s <cloudprovider>')
sys.exit(2)

for opt, arg in opts:
if opt == '-h':
print(
'create.py -w <workspace> -t <token> -k <sshkey> -n <clustername> -i <idletime> -r <runtime> -o <workernodetype> -d <drivernodetype> -e <numworkers>')
'create.py -w <workspace> -t <token> -k <sshkey> -n <clustername> -i <idletime> -r <runtime> -o <workernodetype> -d <drivernodetype> -e <numworkers> -s <cloudprovider>')
sys.exit()
elif opt in ('-w', '--workspace'):
workspace = arg
@@ -66,6 +67,8 @@ def main():
driver_type = arg
elif opt in ('-e', '--numworkers'):
num_workers = arg
elif opt in ('-s', '--cloudprovider'):
cloud_provider = arg

print('-w is ' + workspace, file=sys.stderr)
print('-k is ' + sshkey, file=sys.stderr)
@@ -75,6 +78,7 @@ def main():
print('-o is ' + worker_type, file=sys.stderr)
print('-d is ' + driver_type, file=sys.stderr)
print('-e is ' + str(num_workers), file=sys.stderr)
print('-s is ' + cloud_provider, file=sys.stderr)

if not sshkey:
print("You must specify an sshkey!", file=sys.stderr)
@@ -85,7 +89,7 @@ def main():
sys.exit(2)

templ = ClusterUtils.generate_create_templ(sshkey, cluster_name, runtime, idletime,
num_workers, driver_type, worker_type, printLoc=sys.stderr)
num_workers, driver_type, worker_type, cloud_provider, printLoc=sys.stderr)
clusterid = ClusterUtils.create_cluster(workspace, templ, token, printLoc=sys.stderr)
ClusterUtils.wait_for_cluster_start(workspace, clusterid, token, printLoc=sys.stderr)

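
Note: -s/--cloudprovider defaults to aws, so existing AWS jobs keep working unchanged; an Azure pipeline passes the flag explicitly. A hedged invocation sketch where the workspace URL, token, and ssh key values are placeholders:

# AWS (default): the create template carries the aws_attributes block
python create.py -w "$WORKSPACE" -t "$TOKEN" -k "$SSHKEY" -n ci-cluster -s aws
# Azure: the AWS-only attributes are omitted from the template
python create.py -w "$WORKSPACE" -t "$TOKEN" -k "$SSHKEY" -n ci-cluster -s azure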
71 changes: 71 additions & 0 deletions jenkins/databricks/params.py
@@ -0,0 +1,71 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import getopt

workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com'
token = ''
private_key_file = "~/.ssh/id_rsa"
local_script = 'build.sh'
script_dest = '/home/ubuntu/build.sh'
source_tgz = 'spark-rapids-ci.tgz'
tgz_dest = '/home/ubuntu/spark-rapids-ci.tgz'
base_spark_pom_version = '3.0.0'
clusterid = ''
build_profiles = 'databricks,!snapshot-shims'
jar_path = ''

try:
opts, args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:',
['workspace=', 'token=', 'clusterid=', 'private=', 'localscript=', 'dest=', 'sparktgz=', 'basesparkpomversion=', 'buildprofiles=', 'jarpath='])
except getopt.GetoptError:
print(
'run-tests.py -w <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -l <localscript> -d <scriptdestination> -z <sparktgz> -v <basesparkpomversion> -b <buildprofiles> -j <jarpath>')
sys.exit(2)

for opt, arg in opts:
if opt == '-h':
print(
'run-tests.py -w <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -l <localscript> -d <scriptdestination> -z <sparktgz> -v <basesparkpomversion> -b <buildprofiles> -j <jarpath>')
sys.exit()
elif opt in ('-w', '--workspace'):
workspace = arg
elif opt in ('-t', '--token'):
token = arg
elif opt in ('-c', '--clusterid'):
clusterid = arg
elif opt in ('-p', '--private'):
private_key_file = arg
elif opt in ('-l', '--localscript'):
local_script = arg
elif opt in ('-d', '--dest'):
script_dest = arg
elif opt in ('-z', '--sparktgz'):
source_tgz = arg
elif opt in ('-v', '--basesparkpomversion'):
base_spark_pom_version = arg
elif opt in ('-b', '--buildprofiles'):
build_profiles = arg
elif opt in ('-j', '--jarpath'):
jar_path = arg

print('-w is ' + workspace)
print('-c is ' + clusterid)
print('-p is ' + private_key_file)
print('-l is ' + local_script)
print('-d is ' + script_dest)
print('-z is ' + source_tgz)
print('-v is ' + base_spark_pom_version)
print('-j is ' + jar_path)
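
Note: both pipeline drivers below import this module, so they share one flag vocabulary and Jenkins can reuse a common argument list, varying only the per-stage pieces. A hedged sketch with placeholder values:

COMMON_ARGS=(-w "$WORKSPACE" -t "$TOKEN" -c "$CLUSTER_ID" -p ~/.ssh/id_rsa -v 3.0.0)
python run-build.py "${COMMON_ARGS[@]}" -l build.sh -z spark-rapids-ci.tgz
python run-tests.py "${COMMON_ARGS[@]}" -l test.sh -d /home/ubuntu/test.sh -j "$JAR_PATH"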
51 changes: 51 additions & 0 deletions jenkins/databricks/run-build.py
@@ -0,0 +1,51 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import requests
import sys
import getopt
import time
import os
import subprocess
from clusterutils import ClusterUtils
import params

def main():
master_addr = ClusterUtils.cluster_get_master_addr(params.workspace, params.clusterid, params.token)
if master_addr is None:
print("Error, didn't get master address")
sys.exit(1)
print("Master node address is: %s" % master_addr)

print("Copying script")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

print("Copying source")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.source_tgz, master_addr, params.tgz_dest)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, params.private_key_file, params.script_dest, params.tgz_dest, params.base_spark_pom_version, params.build_profiles)
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

print("Copying built tarball back")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (params.private_key_file, master_addr)
print("rsync command to get built tarball: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

if __name__ == '__main__':
main()
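
Note: the ssh command pipes remote output through tee buildout, and tee's exit status would otherwise mask a failed build; reading PIPESTATUS[0] restores the remote script's status. The bash idiom in isolation:

false 2>&1 | tee buildout      # tee exits 0, so $? alone would report success
test "${PIPESTATUS[0]}" -eq 0  # fails, preserving the real exit status of 'false'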
71 changes: 6 additions & 65 deletions jenkins/databricks/run-tests.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -19,84 +19,25 @@
import os
import subprocess
from clusterutils import ClusterUtils
import params


def main():
workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com'
token = ''
private_key_file = "~/.ssh/id_rsa"
local_script = 'build.sh'
script_dest = '/home/ubuntu/build.sh'
source_tgz = 'spark-rapids-ci.tgz'
tgz_dest = '/home/ubuntu/spark-rapids-ci.tgz'
base_spark_pom_version = '3.0.0'
clusterid = ''
build_profiles = 'databricks,!snapshot-shims'

try:
opts, args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:',
['workspace=', 'token=', 'clusterid=', 'private=', 'localscript=', 'dest=', 'sparktgz=', 'basesparkpomversion=', 'buildprofiles='])
except getopt.GetoptError:
print(
'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -l <localscript> -d <scriptdestinatino> -z <sparktgz> -v <basesparkpomversion> -b <buildprofiles>')
sys.exit(2)

for opt, arg in opts:
if opt == '-h':
print(
'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -n <skipstartingcluster> -l <localscript> -d <scriptdestinatino>, -z <sparktgz> -v <basesparkpomversion> -b <buildprofiles>')
sys.exit()
elif opt in ('-w', '--workspace'):
workspace = arg
elif opt in ('-t', '--token'):
token = arg
elif opt in ('-c', '--clusterid'):
clusterid = arg
elif opt in ('-p', '--private'):
private_key_file = arg
elif opt in ('-l', '--localscript'):
local_script = arg
elif opt in ('-d', '--dest'):
script_dest = arg
elif opt in ('-z', '--sparktgz'):
source_tgz = arg
elif opt in ('-v', '--basesparkpomversion'):
base_spark_pom_version = arg
elif opt in ('-b', '--bulidprofiles'):
build_profiles = arg

print('-w is ' + workspace)
print('-c is ' + clusterid)
print('-p is ' + private_key_file)
print('-l is ' + local_script)
print('-d is ' + script_dest)
print('-z is ' + source_tgz)
print('-v is ' + base_spark_pom_version)
print('-b is ' + build_profiles)

master_addr = ClusterUtils.cluster_get_master_addr(workspace, clusterid, token)
master_addr = ClusterUtils.cluster_get_master_addr(params.workspace, params.clusterid, params.token)
if master_addr is None:
print("Error, didn't get master address")
sys.exit(1)
print("Master node address is: %s" % master_addr)
print("Copying script")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (private_key_file, local_script, master_addr, script_dest)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

print("Copying source")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (private_key_file, source_tgz, master_addr, tgz_dest)
print("Copying script")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, private_key_file, script_dest, tgz_dest, base_spark_pom_version, build_profiles)
ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s 2>&1 | tee testout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, params.private_key_file, params.script_dest, params.jar_path)
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

print("Copying built tarball back")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (private_key_file, master_addr)
print("rsync command to get built tarball: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

if __name__ == '__main__':
main()
42 changes: 42 additions & 0 deletions jenkins/databricks/test.sh
@@ -0,0 +1,42 @@
#!/bin/bash
#
# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -e

LOCAL_JAR_PATH=$1

# tests
export PATH=/databricks/conda/envs/databricks-ml-gpu/bin:/databricks/conda/condabin:$PATH
sudo /databricks/conda/envs/databricks-ml-gpu/bin/pip install pytest sre_yield requests pandas \
pyarrow findspark pytest-xdist

export SPARK_HOME=/databricks/spark
# change to not point at databricks confs so we don't conflict with their settings
export SPARK_CONF_DIR=$PWD
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip
sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
sudo chmod 777 /databricks/data/logs/
sudo chmod 777 /databricks/data/logs/*
echo { \"port\":\"15002\" } > ~/.databricks-connect

if [ -d "$LOCAL_JAR_PATH" ]; then
## Run tests with jars in the LOCAL_JAR_PATH dir, downloaded from the dependency repo
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks"
else
## Run tests with jars built from the spark-rapids source code
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks"
fi
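
Note: on the cluster, test.sh either runs the integration tests from a directory of pre-staged artifacts (when its first argument is an existing directory, which must contain integration_tests/run_pyspark_from_build.sh) or falls back to the source tree that build.sh left under /home/ubuntu/spark-rapids. A hedged sketch of both paths on a Databricks driver node; the jar directory is a placeholder:

./test.sh /home/ubuntu/spark-rapids-jars   # pre-staged jars + test sources
./test.sh                                  # jars built from source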
