Split DB scripts to make them common for the build and IT pipeline #1933
Changes from commits 8d829de, 19f46fe, e1b88d4, e86643b
clusterutils.py

@@ -23,21 +23,22 @@ class ClusterUtils(object):
     @staticmethod
     def generate_create_templ(sshKey, cluster_name, runtime, idle_timeout,
-                              num_workers, driver_node_type, worker_node_type,
+                              num_workers, driver_node_type, worker_node_type, cluster_type,
                               printLoc=sys.stdout):
         timeStr = str(int(time.time()))
         uniq_name = cluster_name + "-" + timeStr
         templ = {}
         templ['cluster_name'] = uniq_name
         print("cluster name is going to be %s" % uniq_name, file=printLoc)
         templ['spark_version'] = runtime
-        templ['aws_attributes'] = {
-            "zone_id": "us-west-2a",
-            "first_on_demand": 1,
-            "availability": "SPOT_WITH_FALLBACK",
-            "spot_bid_price_percent": 100,
-            "ebs_volume_count": 0
-        }
+        if (cluster_type == 'aws'):
+            templ['aws_attributes'] = {
+                "zone_id": "us-west-2a",
+                "first_on_demand": 1,
+                "availability": "SPOT_WITH_FALLBACK",
+                "spot_bid_price_percent": 100,
+                "ebs_volume_count": 0
+            }
         templ['autotermination_minutes'] = idle_timeout
         templ['enable_elastic_disk'] = 'false'
         templ['enable_local_disk_encryption'] = 'false'

Review thread on the new cluster_type parameter:
- Add the 'cluster_type' parameter to make create.py common for AWS and Azure Databricks.
- Don't we need whatever the equivalent is for Azure, or are those just not specified?
- Also, perhaps a better name for this would be something like cloud_provider.
- I think we need not specify those for Azure.
- Sounds good, let me change it.
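To make the new gating concrete, here is a self-contained sketch (not the project's code; the runtime string and values are placeholders) of the behavior generate_create_templ now has: the AWS-only attributes are emitted only when cluster_type is 'aws', and the Azure template simply omits them.

```python
# Standalone sketch mirroring the gating added above; placeholder values only.
def make_templ(cluster_type):
    templ = {'spark_version': '7.3.x-gpu-ml-scala2.12'}  # placeholder runtime
    if cluster_type == 'aws':
        # only AWS clusters carry the aws_attributes block
        templ['aws_attributes'] = {'zone_id': 'us-west-2a'}
    return templ

assert 'aws_attributes' in make_templ('aws')
assert 'aws_attributes' not in make_templ('azure')  # Azure template just omits it
```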
create.py

@@ -33,20 +33,21 @@ def main():
     num_workers = 1
     worker_type = 'g4dn.xlarge'
     driver_type = 'g4dn.xlarge'
+    cluster_type = 'aws'

     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'hw:t:k:n:i:r:o:d:e:',
+        opts, args = getopt.getopt(sys.argv[1:], 'hw:t:k:n:i:r:o:d:e:s:',
                 ['workspace=', 'token=', 'sshkey=', 'clustername=', 'idletime=',
-                    'runtime=', 'workertype=', 'drivertype=', 'numworkers='])
+                    'runtime=', 'workertype=', 'drivertype=', 'numworkers=', 'clustertype='])
     except getopt.GetoptError:
         print(
-            'create.py -w <workspace> -t <token> -k <sshkey> -n <clustername> -i <idletime> -r <runtime> -o <workernodetype> -d <drivernodetype> -e <numworkers>')
+            'create.py -w <workspace> -t <token> -k <sshkey> -n <clustername> -i <idletime> -r <runtime> -o <workernodetype> -d <drivernodetype> -e <numworkers> -s <clustertype>')
         sys.exit(2)

     for opt, arg in opts:
         if opt == '-h':
             print(
-                'create.py -w <workspace> -t <token> -k <sshkey> -n <clustername> -i <idletime> -r <runtime> -o <workernodetype> -d <drivernodetype> -e <numworkers>')
+                'create.py -w <workspace> -t <token> -k <sshkey> -n <clustername> -i <idletime> -r <runtime> -o <workernodetype> -d <drivernodetype> -e <numworkers> -s <clustertype>')
             sys.exit()
         elif opt in ('-w', '--workspace'):
             workspace = arg

(Review comment on the cluster_type default: Add the 'cluster_type' parameter to make create.py common for AWS and Azure Databricks.)

@@ -66,6 +67,8 @@ def main():
             driver_type = arg
         elif opt in ('-e', '--numworkers'):
             num_workers = arg
+        elif opt in ('-s', '--clustertype'):
+            cluster_type = arg

     print('-w is ' + workspace, file=sys.stderr)
     print('-k is ' + sshkey, file=sys.stderr)

@@ -75,6 +78,7 @@ def main():
     print('-o is ' + worker_type, file=sys.stderr)
     print('-d is ' + driver_type, file=sys.stderr)
     print('-e is ' + str(num_workers), file=sys.stderr)
+    print('-s is ' + cluster_type, file=sys.stderr)

     if not sshkey:
         print("You must specify an sshkey!", file=sys.stderr)

@@ -85,7 +89,7 @@ def main():
         sys.exit(2)

     templ = ClusterUtils.generate_create_templ(sshkey, cluster_name, runtime, idletime,
-            num_workers, driver_type, worker_type, printLoc=sys.stderr)
+            num_workers, driver_type, worker_type, cluster_type, printLoc=sys.stderr)
     clusterid = ClusterUtils.create_cluster(workspace, templ, token, printLoc=sys.stderr)
     ClusterUtils.wait_for_cluster_start(workspace, clusterid, token, printLoc=sys.stderr)
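A quick, self-contained sketch of how the new -s/--clustertype flag is consumed, mirroring the getopt loop above; the argv values here are placeholders, not values the PR prescribes:

```python
import getopt

# placeholder arguments; only the flags from the diff above are assumed
argv = ['-w', 'https://example.cloud.databricks.com', '-s', 'azure']
opts, _ = getopt.getopt(argv, 'hw:t:k:n:i:r:o:d:e:s:',
                        ['workspace=', 'clustertype='])
cluster_type = 'aws'  # default, as in create.py
for opt, arg in opts:
    if opt in ('-s', '--clustertype'):
        cluster_type = arg
print(cluster_type)  # prints: azure
```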
params.py (new file)

(Review comment: Make the parameter parsing common for both 'run-build.py' and 'run-tests.py'.)

@@ -0,0 +1,71 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import getopt

workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com'
token = ''
private_key_file = "~/.ssh/id_rsa"
local_script = 'build.sh'
script_dest = '/home/ubuntu/build.sh'
source_tgz = 'spark-rapids-ci.tgz'
tgz_dest = '/home/ubuntu/spark-rapids-ci.tgz'
base_spark_pom_version = '3.0.0'
clusterid = ''
build_profiles = 'databricks,!snapshot-shims'
jar_path = ''

try:
    opts, args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:',
            ['workspace=', 'token=', 'clusterid=', 'private=', 'localscript=',
             'dest=', 'sparktgz=', 'basesparkpomversion=', 'buildprofiles=', 'jarpath='])
except getopt.GetoptError:
    print(
        'run-tests.py -w <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -l <localscript> -d <scriptdestination> -z <sparktgz> -v <basesparkpomversion> -b <buildprofiles> -j <jarpath>')
    sys.exit(2)

for opt, arg in opts:
    if opt == '-h':
        print(
            'run-tests.py -w <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -l <localscript> -d <scriptdestination> -z <sparktgz> -v <basesparkpomversion> -b <buildprofiles> -j <jarpath>')
        sys.exit()
    elif opt in ('-w', '--workspace'):
        workspace = arg
    elif opt in ('-t', '--token'):
        token = arg
    elif opt in ('-c', '--clusterid'):
        clusterid = arg
    elif opt in ('-p', '--private'):
        private_key_file = arg
    elif opt in ('-l', '--localscript'):
        local_script = arg
    elif opt in ('-d', '--dest'):
        script_dest = arg
    elif opt in ('-z', '--sparktgz'):
        source_tgz = arg
    elif opt in ('-v', '--basesparkpomversion'):
        base_spark_pom_version = arg
    elif opt in ('-b', '--buildprofiles'):
        build_profiles = arg
    elif opt in ('-j', '--jarpath'):
        jar_path = arg

print('-w is ' + workspace)
print('-c is ' + clusterid)
print('-p is ' + private_key_file)
print('-l is ' + local_script)
print('-d is ' + script_dest)
print('-z is ' + source_tgz)
print('-v is ' + base_spark_pom_version)
print('-j is ' + jar_path)
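Note the design choice here: params.py parses sys.argv at import time, so any script that does `import params` gets the same defaults and the same flags with no duplicated parsing loop. A stripped-down, self-contained analogue of the pattern (the module name settings.py is illustrative, not part of the PR):

```python
# settings.py -- parse once at import time; importers read module attributes
import getopt
import sys

workspace = 'https://example.cloud.databricks.com'  # placeholder default
clusterid = ''

opts, _ = getopt.getopt(sys.argv[1:], 'w:c:', ['workspace=', 'clusterid='])
for opt, arg in opts:
    if opt in ('-w', '--workspace'):
        workspace = arg
    elif opt in ('-c', '--clusterid'):
        clusterid = arg
```

run-build.py and run-tests.py then simply read params.workspace, params.clusterid, and so on, so the two pipelines cannot drift apart in how they handle options.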
run-build.py (new file)

@@ -0,0 +1,51 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import requests
import sys
import getopt
import time
import os
import subprocess
from clusterutils import ClusterUtils
import params

def main():
    master_addr = ClusterUtils.cluster_get_master_addr(params.workspace, params.clusterid, params.token)
    if master_addr is None:
        print("Error, didn't get master address")
        sys.exit(1)
    print("Master node address is: %s" % master_addr)

    print("Copying script")
    rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest)
    print("rsync command: %s" % rsync_command)
    subprocess.check_call(rsync_command, shell = True)

    print("Copying source")
    rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.source_tgz, master_addr, params.tgz_dest)
    print("rsync command: %s" % rsync_command)
    subprocess.check_call(rsync_command, shell = True)

    ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, params.private_key_file, params.script_dest, params.tgz_dest, params.base_spark_pom_version, params.build_profiles)
    print("ssh command: %s" % ssh_command)
    subprocess.check_call(ssh_command, shell = True)

    print("Copying built tarball back")
    rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (params.private_key_file, master_addr)
    print("rsync command to get built tarball: %s" % rsync_command)
    subprocess.check_call(rsync_command, shell = True)

if __name__ == '__main__':
    main()
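The `| tee buildout; if [ ${PIPESTATUS[0]} -ne 0 ] ...` tail on the ssh command exists because a plain pipe would report tee's exit status (almost always 0) and mask a remote build failure. A minimal sketch of that behavior, with `false` standing in for the ssh step; `executable='/bin/bash'` is added here because PIPESTATUS is a bash feature:

```python
import subprocess

# 'false' stands in for a failing remote build; tee alone would exit 0.
cmd = ("false 2>&1 | tee buildout; "
       "if [ ${PIPESTATUS[0]} -ne 0 ]; then false; else true; fi")
try:
    subprocess.check_call(cmd, shell=True, executable='/bin/bash')
except subprocess.CalledProcessError:
    print("failure propagated despite the tee pipe")
```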
run-tests.py

(Review comment: Make the parameter parsing common for both 'run-build.py' and 'run-tests.py'; the parsing code moves into params.py.)

@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -19,84 +19,25 @@
 import os
 import subprocess
 from clusterutils import ClusterUtils
+import params


 def main():
-    workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com'
-    token = ''
-    private_key_file = "~/.ssh/id_rsa"
-    local_script = 'build.sh'
-    script_dest = '/home/ubuntu/build.sh'
-    source_tgz = 'spark-rapids-ci.tgz'
-    tgz_dest = '/home/ubuntu/spark-rapids-ci.tgz'
-    base_spark_pom_version = '3.0.0'
-    clusterid = ''
-    build_profiles = 'databricks,!snapshot-shims'
-
-    try:
-        opts, args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:',
-                ['workspace=', 'token=', 'clusterid=', 'private=', 'localscript=', 'dest=', 'sparktgz=', 'basesparkpomversion=', 'buildprofiles='])
-    except getopt.GetoptError:
-        print(
-            'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -l <localscript> -d <scriptdestinatino> -z <sparktgz> -v <basesparkpomversion> -b <buildprofiles>')
-        sys.exit(2)
-
-    for opt, arg in opts:
-        if opt == '-h':
-            print(
-                'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -n <skipstartingcluster> -l <localscript> -d <scriptdestinatino>, -z <sparktgz> -v <basesparkpomversion> -b <buildprofiles>')
-            sys.exit()
-        elif opt in ('-w', '--workspace'):
-            workspace = arg
-        elif opt in ('-t', '--token'):
-            token = arg
-        elif opt in ('-c', '--clusterid'):
-            clusterid = arg
-        elif opt in ('-p', '--private'):
-            private_key_file = arg
-        elif opt in ('-l', '--localscript'):
-            local_script = arg
-        elif opt in ('-d', '--dest'):
-            script_dest = arg
-        elif opt in ('-z', '--sparktgz'):
-            source_tgz = arg
-        elif opt in ('-v', '--basesparkpomversion'):
-            base_spark_pom_version = arg
-        elif opt in ('-b', '--bulidprofiles'):
-            build_profiles = arg
-
-    print('-w is ' + workspace)
-    print('-c is ' + clusterid)
-    print('-p is ' + private_key_file)
-    print('-l is ' + local_script)
-    print('-d is ' + script_dest)
-    print('-z is ' + source_tgz)
-    print('-v is ' + base_spark_pom_version)
-    print('-b is ' + build_profiles)
-
-    master_addr = ClusterUtils.cluster_get_master_addr(workspace, clusterid, token)
+    master_addr = ClusterUtils.cluster_get_master_addr(params.workspace, params.clusterid, params.token)
     if master_addr is None:
         print("Error, didn't get master address")
         sys.exit(1)
     print("Master node address is: %s" % master_addr)
-    print("Copying script")
-    rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (private_key_file, local_script, master_addr, script_dest)
-    print("rsync command: %s" % rsync_command)
-    subprocess.check_call(rsync_command, shell = True)
-
-    print("Copying source")
-    rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (private_key_file, source_tgz, master_addr, tgz_dest)
+    print("Copying script")
+    rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest)
     print("rsync command: %s" % rsync_command)
     subprocess.check_call(rsync_command, shell = True)

-    ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, private_key_file, script_dest, tgz_dest, base_spark_pom_version, build_profiles)
+    ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s 2>&1 | tee testout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, params.private_key_file, params.script_dest, params.jar_path)
     print("ssh command: %s" % ssh_command)
     subprocess.check_call(ssh_command, shell = True)

-    print("Copying built tarball back")
-    rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (private_key_file, master_addr)
-    print("rsync command to get built tarball: %s" % rsync_command)
-    subprocess.check_call(rsync_command, shell = True)
-
 if __name__ == '__main__':
     main()
test.sh (new file)

(Review comment: Move the integration test steps into 'test.sh' to make them common for the Databricks build and IT pipelines.)

@@ -0,0 +1,42 @@
#!/bin/bash
#
# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -e

LOCAL_JAR_PATH=$1

(Review comment: The build pipeline builds the jars from source, so LOCAL_JAR_PATH is empty. The IT pipeline sets LOCAL_JAR_PATH to the local directory the jars were downloaded into.)

# tests
export PATH=/databricks/conda/envs/databricks-ml-gpu/bin:/databricks/conda/condabin:$PATH

sudo /databricks/conda/envs/databricks-ml-gpu/bin/pip install pytest sre_yield requests pandas \
    pyarrow findspark pytest-xdist

export SPARK_HOME=/databricks/spark
# change to not point at databricks confs so we don't conflict with their settings
export SPARK_CONF_DIR=$PWD
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip
sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
sudo chmod 777 /databricks/data/logs/
sudo chmod 777 /databricks/data/logs/*
echo { \"port\":\"15002\" } > ~/.databricks-connect

if [ -d "$LOCAL_JAR_PATH" ]; then
    ## Run tests with the jars in LOCAL_JAR_PATH, downloaded from the dependency repo
    LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks"
else
    ## Run tests with the jars built from the spark-rapids source code
    bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks"
fi
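Putting the pieces together: run-tests.py appends params.jar_path to the remote command line, and test.sh receives it as $1 (LOCAL_JAR_PATH), which selects the branch above. A small illustrative sketch of that flow; the paths are placeholders, not values the PR prescribes:

```python
# Sketch of how each pipeline selects a branch in test.sh; paths are placeholders.
def remote_test_command(script_dest, jar_path):
    # build pipeline: jar_path is '' -> [ -d "$LOCAL_JAR_PATH" ] fails -> source-built jars
    # IT pipeline: jar_path names the dir the jars were downloaded into
    return "%s %s" % (script_dest, jar_path)

print(remote_test_command('/home/ubuntu/test.sh', ''))                   # build pipeline
print(remote_test_command('/home/ubuntu/test.sh', '/home/ubuntu/jars'))  # IT pipeline
```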