From 6aeb15ae58fa030fc064373043f74e5692e5eaaf Mon Sep 17 00:00:00 2001
From: Thomas Graves
Date: Mon, 29 Jun 2020 10:07:36 -0400
Subject: [PATCH] Databricks CI improvements and support runtime env parameter
 to xfail certain tests (#297)

* Add a profile to build for Databricks

* Change xfail to be annotation

Co-authored-by: Thomas Graves
---
 integration_tests/conftest.py                 |  3 +
 integration_tests/src/main/python/conftest.py | 14 ++++
 .../src/main/python/parquet_test.py           |  3 +
 .../src/main/python/qa_nightly_sql.py         | 19 +++--
 jenkins/Jenkinsfile.databricksnightly         | 62 ++++++--------
 jenkins/Jenkinsfile.databricksrelease         | 16 +++-
 jenkins/databricks/build.sh                   | 83 +++++++++++--------
 jenkins/databricks/dbimports.patch            | 14 ++--
 jenkins/databricks/deploy.sh                  |  2 +-
 jenkins/databricks/run-tests.py               | 57 ++++++++++---
 jenkins/databricks/shutdown.py                |  2 +-
 pom.xml                                       |  8 ++
 12 files changed, 178 insertions(+), 105 deletions(-)

diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py
index 3c30b368d99..d8882de2b72 100644
--- a/integration_tests/conftest.py
+++ b/integration_tests/conftest.py
@@ -41,3 +41,6 @@ def pytest_addoption(parser):
     parser.addoption(
         "--debug_tmp_path", action='store_true', default=False, help="if true don't delete tmp_path contents for debugging"
     )
+    parser.addoption(
+        "--runtime_env", action='store', default="Apache", help="the runtime environment for the tests - apache or databricks"
+    )
diff --git a/integration_tests/src/main/python/conftest.py b/integration_tests/src/main/python/conftest.py
index 0f60e283004..506e8be8be5 100644
--- a/integration_tests/src/main/python/conftest.py
+++ b/integration_tests/src/main/python/conftest.py
@@ -48,6 +48,17 @@ def is_allowing_any_non_gpu():
 def get_non_gpu_allowed():
     return _non_gpu_allowed
 
+_runtime_env = "apache"
+
+def runtime_env():
+    return _runtime_env.lower()
+
+def is_apache_runtime():
+    return runtime_env() == "apache"
+
+def is_databricks_runtime():
+    return runtime_env() == "databricks"
+
 _limit = -1
 
 def get_limit():
@@ -112,6 +123,9 @@ def pytest_runtest_setup(item):
     else:
         _limit = -1
 
+def pytest_configure(config):
+    global _runtime_env
+    _runtime_env = config.getoption('runtime_env')
 
 def pytest_collection_modifyitems(config, items):
     for item in items:
diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py
index ab14204e0b2..7e6509ffd09 100644
--- a/integration_tests/src/main/python/parquet_test.py
+++ b/integration_tests/src/main/python/parquet_test.py
@@ -15,6 +15,7 @@
 import pytest
 
 from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_fallback_collect
+from conftest import is_databricks_runtime
 from datetime import date, datetime, timezone
 from data_gen import *
 from marks import *
@@ -145,6 +146,8 @@ def test_simple_partitioned_read(spark_tmp_path):
     assert_gpu_and_cpu_are_equal_collect(
             lambda spark : spark.read.parquet(data_path))
 
+@pytest.mark.xfail(condition=is_databricks_runtime(),
+    reason='https://github.com/NVIDIA/spark-rapids/issues/192')
 def test_read_merge_schema(spark_tmp_path):
     # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed
     # we should go with a more standard set of generators
diff --git a/integration_tests/src/main/python/qa_nightly_sql.py b/integration_tests/src/main/python/qa_nightly_sql.py
index f4ca0e7cdc9..f2e143e7551 100644
--- a/integration_tests/src/main/python/qa_nightly_sql.py
+++ b/integration_tests/src/main/python/qa_nightly_sql.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from conftest import is_databricks_runtime
 import pytest
 
 SELECT_SQL = [
@@ -665,15 +666,15 @@
 ("SELECT COUNT(byteF) as count, (AVG(intF) * 5.0) as avg, (SUM(intF) + MAX(shortF * 3)) as summax FROM test_table GROUP BY intF*3", "COUNT(byteF), AVG(intF) * 5.0, SUM(intF) + MAX(shortF * 3) GROUP BY intF*3"),
 ("SELECT COUNT(*) as count, (AVG(intF) * 5.0) as avg, (SUM(intF) + MAX(shortF * 3)) as summax FROM test_table GROUP BY intF*3", "COUNT(*), AVG(intF) * 5.0, SUM(intF) + MAX(shortF * 3) GROUP BY intF*3"),
 # ("SELECT SUM(intF) OVER (PARTITION BY byteF ORDER BY shortF) as sum_total FROM test_table", "SUM(intF) OVER (PARTITION BY byteF ORDER BY shortF) as sum_total"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY byteF ORDER BY byteF) row_num, byteF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY byteF ORDER BY byteF) row_num, byteF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY shortF ORDER BY shortF) row_num, shortF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY shortF ORDER BY shortF) row_num, shortF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY intF ORDER BY intF) row_num, intF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY intF ORDER BY intF) row_num, intF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY longF ORDER BY longF) row_num, longF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY longF ORDER BY longF) row_num, longF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY floatF ORDER BY floatF) row_num, floatF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY floatF ORDER BY floatF) row_num, floatF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY booleanF ORDER BY booleanF) row_num, booleanF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY booleanF ORDER BY booleanF) row_num, booleanF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY strF ORDER BY strF) row_num, strF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY strF ORDER BY strF) row_num, strF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY dateF ORDER BY dateF) row_num, dateF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY dateF ORDER BY dateF) row_num, dateF"),
-("SELECT ROW_NUMBER() OVER (PARTITION BY timestampF ORDER BY timestampF) row_num, timestampF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY timestampF ORDER BY timestampF) row_num, timestampF"),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY byteF ORDER BY byteF) row_num, byteF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY byteF ORDER BY byteF) row_num, byteF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY shortF ORDER BY shortF) row_num, shortF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY shortF ORDER BY shortF) row_num, shortF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY intF ORDER BY intF) row_num, intF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY intF ORDER BY intF) row_num, intF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY longF ORDER BY longF) row_num, longF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY longF ORDER BY longF) row_num, longF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY floatF ORDER BY floatF) row_num, floatF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY floatF ORDER BY floatF) row_num, floatF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY booleanF ORDER BY booleanF) row_num, booleanF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY booleanF ORDER BY booleanF) row_num, booleanF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY strF ORDER BY strF) row_num, strF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY strF ORDER BY strF) row_num, strF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY dateF ORDER BY dateF) row_num, dateF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY dateF ORDER BY dateF) row_num, dateF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
+pytest.param(("SELECT ROW_NUMBER() OVER (PARTITION BY timestampF ORDER BY timestampF) row_num, timestampF FROM test_table", "ROW_NUMBER() OVER (PARTITION BY timestampF ORDER BY timestampF) row_num, timestampF"), marks=pytest.mark.xfail(is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/203')),
 # ("window/row/range (need change)", "window/row/range (need change)"),
 #("SELECT byteF, SUM(byteF) OVER (PARTITION BY byteF ORDER BY byteF RANGE BETWEEN 20 PRECEDING AND 10 FOLLOWING ) as sum_total FROM test_table", "byteF, SUM(byteF) OVER (PARTITION BY byteF ORDER BY byteF RANGE BETWEEN 20 PRECEDING AND 10 FOLLOWING ) as sum_total"),
 #("SELECT SUM(intF) OVER (PARTITION BY byteF ORDER BY byteF RANGE BETWEEN 20 PRECEDING AND 10 FOLLOWING ) as sum_total FROM test_table", "SUM(intF) OVER (PARTITION BY byteF ORDER BY byteF RANGE BETWEEN 20 PRECEDING AND 10 FOLLOWING ) as sum_total"),
diff --git a/jenkins/Jenkinsfile.databricksnightly b/jenkins/Jenkinsfile.databricksnightly
index d3855535a6d..d45098d6af7 100644
--- a/jenkins/Jenkinsfile.databricksnightly
+++ b/jenkins/Jenkinsfile.databricksnightly
@@ -22,7 +22,15 @@
  */
 
 pipeline {
-    agent { label 'vanilla' }
+    agent {
+        docker {
+            label 'docker-gpu'
+            image 'urm.nvidia.com/sw-spark-docker/plugin:dev-ubuntu16-cuda10.1'
+            args '--runtime=nvidia -v ${HOME}/.m2:${HOME}/.m2:rw \
+                -v ${HOME}/.zinc:${HOME}/.zinc:rw \
+                -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group'
+        }
+    }
 
     options {
         ansiColor('xterm')
@@ -33,6 +41,14 @@ pipeline {
     parameters {
         choice(name: 'DEPLOY_TO', choices: ['Urm', 'Local'],
             description: 'Where to deploy artifacts to')
+        string(name: 'DATABRICKS_VERSION',
+            defaultValue: '0.2-databricks-SNAPSHOT', description: 'Version to set')
+        string(name: 'CUDF_VERSION',
+            defaultValue: '0.15-SNAPSHOT', description: 'Cudf version to use')
+        string(name: 'CUDA_VERSION',
+            defaultValue: 'cuda10-1', description: 'cuda version to use')
+        string(name: 'CLUSTER_ID',
+            defaultValue: '0617-140138-umiak14', description: 'databricks cluster id')
         string(name: 'REF', defaultValue: 'branch-0.2',
             description: 'Commit to build')
     }
@@ -42,6 +58,10 @@
         LIBCUDF_KERNEL_CACHE_PATH='/tmp'
         URM_CREDS = credentials("svcngcc_artifactory")
         DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
+        SCALA_VERSION = '2.12'
+        SPARK_VERSION = '3.0.0'
+        CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
+        CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
     }
 
     triggers {
@@ -50,54 +70,24 @@
 
     stages {
         stage('Ubuntu16 CUDA10.1') {
-            agent {
-                dockerfile {
-                    label 'docker-gpu'
-                    filename 'Dockerfile.ubuntu16'
-                    dir "jenkins"
-                    args '--runtime=nvidia -v ${HOME}/.m2:${HOME}/.m2:rw \
-                        -v ${HOME}/.zinc:${HOME}/.zinc:rw \
-                        -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group'
-                }
-            }
             steps {
                 script {
                     sshagent(credentials : ['svcngcc_pubpriv']) {
-                        sh "mvn versions:set -DnewVersion=0.2.0-databricks-SNAPSHOT && git clean -d -f"
+                        sh "mvn versions:set -DnewVersion=$DATABRICKS_VERSION && git clean -d -f"
+                        sh "mvn dependency:get -Dartifact=ai.rapids:cudf:$CUDF_VERSION -Ddest=./"
                         sh "patch -p1 < ./jenkins/databricks/dbimports.patch"
-                        sh "tar -zcvf spark-rapids-ci.tgz * || true"
-                        sh "python3.6 ./jenkins/databricks/run-tests.py -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh"
+                        sh "tar -zcvf spark-rapids-ci.tgz *"
+                        sh "python3.6 ./jenkins/databricks/run-tests.py -c $CLUSTER_ID -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
                         sh "./jenkins/databricks/deploy.sh"
                     }
                 }
            }
        }
-        stage('cleanup') {
-            agent {
-                dockerfile {
-                    label 'docker-gpu'
-                    filename 'Dockerfile.ubuntu16'
-                    dir "jenkins"
-                    args '--runtime=nvidia -v ${HOME}/.m2:${HOME}/.m2:rw \
-                        -v ${HOME}/.zinc:${HOME}/.zinc:rw \
-                        -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group'
-                }
-            }
-            steps {
-                script {
-                    sh "python3.6 ./jenkins/databricks/shutdown.py -t $DATABRICKS_TOKEN"
-                }
-            }
-        }
     } // end of stages
     post {
         always {
             script {
-                if (currentBuild.currentResult == "SUCCESS") {
-                    slack("#rapidsai-spark-cicd", "Success", color: "#33CC33")
-                } else {
-                    slack("#rapidsai-spark-cicd", "Failed", color: "#FF0000")
-                }
+                sh "python3.6 ./jenkins/databricks/shutdown.py -c $CLUSTER_ID -t $DATABRICKS_TOKEN || true"
             }
         }
     }
diff --git a/jenkins/Jenkinsfile.databricksrelease b/jenkins/Jenkinsfile.databricksrelease
index 74d1ddeb840..c021f2e0c87 100644
--- a/jenkins/Jenkinsfile.databricksrelease
+++ b/jenkins/Jenkinsfile.databricksrelease
@@ -28,7 +28,7 @@ def SERVERS_MAP = [
 
 def SEC_IDS = [
     Local: ['local-gpg-passphrase', 'local-gpg-private-key', 'local-username-password'],
-    Sonatype: ['rapids-gpg-passphrase', 'rapids-gpg-private-key', 'sonatype-username-password']
+    Sonatype: ['SPARK_RAPIDS_GPG_PASSPHRASE', 'SPARK_RAPIDS_GPG_PRIVATE_KEY', 'SPARK_SONATYPE_USERPASS']
 ]
 
 pipeline {
@@ -52,6 +52,12 @@
     parameters {
         choice(name: 'DEPLOY_TO', choices: ['Sonatype'],
            description: 'Where to deploy artifacts to')
+        string(name: 'DATABRICKS_VERSION',
+            defaultValue: '0.2-databricks-SNAPSHOT', description: 'Version to set')
+        string(name: 'CUDF_VERSION',
+            defaultValue: '0.15-SNAPSHOT', description: 'Cudf version to use')
+        string(name: 'CUDA_VERSION',
+            defaultValue: 'cuda10-1', description: 'cuda version to use')
         string(name: 'REF', defaultValue: 'branch-0.2',
             description: 'Commit to build')
     }
@@ -64,6 +70,10 @@
         DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
         DIST_PL='dist'
         SQL_PL='sql-plugin'
+        SCALA_VERSION = '2.12'
+        SPARK_VERSION = '3.0.0'
+        CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
+        CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
     }
 
     stages {
@@ -71,10 +81,10 @@ pipeline {
             steps {
                 script {
                     sshagent(credentials : ['svcngcc_pubpriv']) {
-                        sh "mvn versions:set -DnewVersion=0.1.0-databricks && git clean -d -f"
+                        sh "mvn versions:set -DnewVersion=0.2.0-databricks && git clean -d -f"
                         sh "patch -p1 < ./jenkins/databricks/dbimports.patch"
                         sh "tar -zcvf spark-rapids-ci.tgz * || true"
-                        sh "python3.6 ./jenkins/databricks/run-tests.py -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh"
+                        sh "python3.6 ./jenkins/databricks/run-tests.py -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
                     }
                 }
             }
diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh
index 5f89c25f411..0038557e83d 100755
--- a/jenkins/databricks/build.sh
+++ b/jenkins/databricks/build.sh
@@ -17,85 +17,98 @@
 
 set -e
 
-SPARKTGZ=/home/ubuntu/spark-rapids-ci.tgz
-if [ "$1" != "" ]; then
-    SPARKTGZ=$1
-fi
+SPARKTGZ=$1
+DATABRICKS_VERSION=$2
+SCALA_VERSION=$3
+CI_RAPIDS_JAR=$4
+SPARK_VERSION=$5
+CUDF_VERSION=$6
+CUDA_VERSION=$7
+CI_CUDF_JAR=$8
+
+echo "Spark version is $SPARK_VERSION"
+echo "scala version is: $SCALA_VERSION"
+
+# this has to match the Databricks init script
+DB_JAR_LOC=/databricks/jars
+DB_RAPIDS_JAR_LOC=$DB_JAR_LOC/$CI_RAPIDS_JAR
+DB_CUDF_JAR_LOC=$DB_JAR_LOC/$CI_CUDF_JAR
+RAPIDS_BUILT_JAR=rapids-4-spark_$SCALA_VERSION-$DATABRICKS_VERSION.jar
 
 sudo apt install -y maven
 rm -rf spark-rapids
 mkdir spark-rapids
 tar -zxvf $SPARKTGZ -C spark-rapids
 cd spark-rapids
-# pull 3.0.0 artifacts and ignore errors then install databricks jars, then build again
 mvn clean package || true
 M2DIR=/home/ubuntu/.m2/repository
+CUDF_JAR=./cudf-${CUDF_VERSION}.jar
+mvn install:install-file \
+    -Dmaven.repo.local=$M2DIR \
+    -Dfile=./$CUDF_JAR \
+    -DgroupId=ai.rapids \
+    -DartifactId=cudf \
+    -Dversion=$CUDF_VERSION \
+    -Dclassifier=$CUDA_VERSION \
+    -Dpackaging=jar
+
+# pull normal Spark artifacts and ignore errors then install databricks jars, then build again
 JARDIR=/databricks/jars
-SQLJAR=----workspace_spark_3_0--sql--core--core-hive-2.3__hadoop-2.7_2.12_deploy.jar
-CATALYSTJAR=----workspace_spark_3_0--sql--catalyst--catalyst-hive-2.3__hadoop-2.7_2.12_deploy.jar
-ANNOTJAR=----workspace_spark_3_0--common--tags--tags-hive-2.3__hadoop-2.7_2.12_deploy.jar
-COREJAR=----workspace_spark_3_0--core--core-hive-2.3__hadoop-2.7_2.12_deploy.jar
-VERSIONJAR=----workspace_spark_3_0--core--libcore_generated_resources.jar
-VERSION=3.0.0
+SQLJAR=----workspace_spark_3_0--sql--core--core-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
+CATALYSTJAR=----workspace_spark_3_0--sql--catalyst--catalyst-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
+ANNOTJAR=----workspace_spark_3_0--common--tags--tags-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
+COREJAR=----workspace_spark_3_0--core--core-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
 mvn install:install-file \
     -Dmaven.repo.local=$M2DIR \
     -Dfile=$JARDIR/$COREJAR \
     -DgroupId=org.apache.spark \
-    -DartifactId=spark-core_2.12 \
-    -Dversion=$VERSION \
+    -DartifactId=spark-core_$SCALA_VERSION \
+    -Dversion=$SPARK_VERSION \
     -Dpackaging=jar
 
 mvn install:install-file \
     -Dmaven.repo.local=$M2DIR \
     -Dfile=$JARDIR/$CATALYSTJAR \
     -DgroupId=org.apache.spark \
-    -DartifactId=spark-catalyst_2.12 \
-    -Dversion=$VERSION \
+    -DartifactId=spark-catalyst_$SCALA_VERSION \
+    -Dversion=$SPARK_VERSION \
     -Dpackaging=jar
 
 mvn install:install-file \
     -Dmaven.repo.local=$M2DIR \
     -Dfile=$JARDIR/$SQLJAR \
     -DgroupId=org.apache.spark \
-    -DartifactId=spark-sql_2.12 \
-    -Dversion=$VERSION \
+    -DartifactId=spark-sql_$SCALA_VERSION \
+    -Dversion=$SPARK_VERSION \
     -Dpackaging=jar
 
 mvn install:install-file \
     -Dmaven.repo.local=$M2DIR \
     -Dfile=$JARDIR/$ANNOTJAR \
     -DgroupId=org.apache.spark \
-    -DartifactId=spark-annotation_2.12 \
-    -Dversion=$VERSION \
+    -DartifactId=spark-annotation_$SCALA_VERSION \
+    -Dversion=$SPARK_VERSION \
     -Dpackaging=jar
 
-mvn install:install-file \
-    -Dmaven.repo.local=$M2DIR \
-    -Dfile=$JARDIR/$VERSIONJAR \
-    -DgroupId=org.apache.spark \
-    -DartifactId=spark-version_2.12 \
-    -Dversion=$VERSION \
-    -Dpackaging=jar
-
-mvn -Pdatabricks clean verify -DskipTests
+mvn -Pdatabricks clean package -DskipTests
 
-# Copy so we pick up new built jar. Note that the jar name rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar has to be
-# exactly that because its based on the staticly setup Databricks cluster we use. That cluster specifically
-# installs the jar with the name rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar. Do not change that name
-# without changing the Databricks cluster setup.
-sudo cp dist/target/rapids-4-spark_2.12-*-SNAPSHOT.jar /databricks/jars/rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar
+# Copy so we pick up the newly built jar and the latest CuDF jar. Note that the jar names have to be
+# exactly what is in the statically setup Databricks cluster we use.
+sudo cp dist/target/$RAPIDS_BUILT_JAR $DB_RAPIDS_JAR_LOC
+sudo cp ./$CUDF_JAR $DB_CUDF_JAR_LOC
 
 # tests
 export PATH=/databricks/conda/envs/databricks-ml-gpu/bin:/databricks/conda/condabin:$PATH
 sudo /databricks/conda/envs/databricks-ml-gpu/bin/pip install pytest sre_yield
 cd /home/ubuntu/spark-rapids/integration_tests
 export SPARK_HOME=/databricks/spark
+# change to not point at databricks confs so we don't conflict with their settings
+export SPARK_CONF_DIR=$PWD
 export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip
 sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
 sudo chmod 777 /databricks/data/logs/
 sudo chmod 777 /databricks/data/logs/*
 echo { \"port\":\"15002\" } > ~/.databricks-connect
-$SPARK_HOME/bin/spark-submit ./runtests.py 2>&1 | tee out
-
+$SPARK_HOME/bin/spark-submit ./runtests.py --runtime_env="databricks"
 cd /home/ubuntu
 tar -zcvf spark-rapids-built.tgz spark-rapids
diff --git a/jenkins/databricks/dbimports.patch b/jenkins/databricks/dbimports.patch
index e1b179f49bb..d112e85ec7b 100644
--- a/jenkins/databricks/dbimports.patch
+++ b/jenkins/databricks/dbimports.patch
@@ -1,20 +1,20 @@
 diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
-index e6290d1..6ceb47d 100644
+index e6c3e37..ddd8ca4 100644
 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
 +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
-@@ -18,8 +18,9 @@ package com.nvidia.spark.rapids
- import ai.rapids.cudf.{NvtxColor, Table}
+@@ -19,8 +19,9 @@ import ai.rapids.cudf.{NvtxColor, Table}
+ import org.apache.spark.TaskContext
  import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
 +import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight}
- import org.apache.spark.sql.catalyst.plans.{Inner, JoinType, LeftAnti, LeftOuter, LeftSemi}
+ import org.apache.spark.sql.catalyst.plans.{ExistenceJoin, FullOuter, Inner, InnerLike, JoinType, LeftAnti, LeftExistence, LeftOuter, LeftSemi, RightOuter}
 -import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, HashJoin}
 +import org.apache.spark.sql.execution.joins.HashJoin
  import org.apache.spark.sql.execution.metric.SQLMetric
  import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
 diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
-index 5be8d1a..6b4a58e 100644
+index 7ae310b..3ebde77 100644
 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
 +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
 @@ -22,10 +22,11 @@ import org.apache.spark.TaskContext
@@ -31,7 +31,7 @@ index 5be8d1a..6b4a58e 100644
  import org.apache.spark.sql.vectorized.ColumnarBatch
 diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
-index 3b2b447..e9e4051 100644
+index 29ba63d..78febd4 100644
 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
 +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
 @@ -17,8 +17,9 @@
@@ -65,7 +65,7 @@ index b02182a..1ed13d2 100644
  trait ConfKeysAndIncompat {
 diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
-index 7be40e8..13c8500 100644
+index ac444d1..14a8c6e 100644
 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
 +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
 @@ -22,6 +22,7 @@ import com.nvidia.spark.rapids.GpuMetricNames._
diff --git a/jenkins/databricks/deploy.sh b/jenkins/databricks/deploy.sh
index c783565ffab..04fec64456a 100755
--- a/jenkins/databricks/deploy.sh
+++ b/jenkins/databricks/deploy.sh
@@ -24,6 +24,6 @@ cd spark-rapids
 echo "Maven mirror is $MVN_URM_MIRROR"
 SERVER_ID='snapshots'
 SERVER_URL='https://urm.nvidia.com:443/artifactory/sw-spark-maven-local'
-FPATH=./dist/target/rapids-4-spark_2.12-0.2.0-databricks-SNAPSHOT.jar
+FPATH=./dist/target/rapids-4-spark_$SCALA_VERSION-$DATABRICKS_VERSION.jar
 mvn -B deploy:deploy-file $MVN_URM_MIRROR -Durl=$SERVER_URL -DrepositoryId=$SERVER_ID \
     -Dfile=$FPATH -DpomFile=dist/pom.xml
diff --git a/jenkins/databricks/run-tests.py b/jenkins/databricks/run-tests.py
index 14ab038c420..975cbe8601d 100644
--- a/jenkins/databricks/run-tests.py
+++ b/jenkins/databricks/run-tests.py
@@ -17,6 +17,7 @@
 import getopt
 import time
 import os
+import subprocess
 
 def cluster_state(workspace, clusterid, token):
     clusterresp = requests.get(workspace + "/api/2.0/clusters/get?cluster_id=%s" % clusterid, headers={'Authorization': 'Bearer %s' % token})
@@ -40,23 +41,30 @@ def main():
     clusterid = '0617-140138-umiak14'
     private_key_file = "~/.ssh/id_rsa"
     skip_start = None
-    local_script = "build.sh"
-    script_dest = "/home/ubuntu/build.sh"
-    source_tgz = "spark-rapids-ci.tgz"
-    tgz_dest = "/home/ubuntu/spark-rapids-ci.tgz"
+    local_script = 'build.sh'
+    script_dest = '/home/ubuntu/build.sh'
+    source_tgz = 'spark-rapids-ci.tgz'
+    tgz_dest = '/home/ubuntu/spark-rapids-ci.tgz'
+    ci_rapids_jar = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
+    db_version = '0.1-databricks-SNAPSHOT'
+    scala_version = '2.12'
+    spark_version = '3.0.0'
+    cudf_version = '0.15-SNAPSHOT'
+    cuda_version = 'cuda10-1'
+    ci_cudf_jar = 'cudf-0.14-cuda10-1.jar'
 
     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:p:l:nd:z:',
-                ['workspace=', 'token=', 'clusterid=', 'private=', 'nostart=', 'localscript=', 'dest=', 'sparktgz='])
+        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:p:l:nd:z:j:b:k:a:f:u:m:',
+                ['workspace=', 'token=', 'clusterid=', 'private=', 'nostart=', 'localscript=', 'dest=', 'sparktgz=', 'cirapidsjar=', 'databricksversion=', 'sparkversion=', 'scalaversion=', 'cudfversion=', 'cudaversion=', 'cicudfjar='])
     except getopt.GetoptError:
         print(
-            'run-tests.py -s -t -c -p -n -l -z -z ')
+            'run-tests.py -s -t -c -p -n -l -d -z -j -b -k -a -f -u -m ')
         sys.exit(2)
 
     for opt, arg in opts:
         if opt == '-h':
             print(
-                'run-tests.py -s -t -c -p -n -l -d , -z ')
+                'run-tests.py -s -t -c -p -n -l -d , -z -j -b -k -a -f -u -m ')
             sys.exit()
         elif opt in ('-s', '--workspace'):
             workspace = arg
@@ -74,6 +82,20 @@ def main():
             script_dest = arg
         elif opt in ('-z', '--sparktgz'):
             source_tgz = arg
+        elif opt in ('-j', '--cirapidsjar'):
+            ci_rapids_jar = arg
+        elif opt in ('-b', '--databricksversion'):
+            db_version = arg
+        elif opt in ('-k', '--sparkversion'):
+            spark_version = arg
+        elif opt in ('-a', '--scalaversion'):
+            scala_version = arg
+        elif opt in ('-f', '--cudfversion'):
+            cudf_version = arg
+        elif opt in ('-u', '--cudaversion'):
+            cuda_version = arg
+        elif opt in ('-m', '--cicudfjar'):
+            ci_cudf_jar = arg
 
     print('-s is ' + workspace)
     print('-c is ' + clusterid)
@@ -84,6 +106,14 @@ def main():
     print("-n: don't skip start")
     print('-l is ' + local_script)
     print('-d is ' + script_dest)
+    print('-z is ' + source_tgz)
+    print('-j is ' + ci_rapids_jar)
+    print('-b is ' + db_version)
+    print('-k is ' + spark_version)
+    print('-a is ' + scala_version)
+    print('-f is ' + cudf_version)
+    print('-u is ' + cuda_version)
+    print('-m is ' + ci_cudf_jar)
 
     if skip_start is None:
         jsonout = cluster_state(workspace, clusterid, token)
@@ -124,20 +154,21 @@ def main():
         print("Copying script")
         rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (private_key_file, local_script, master_addr, script_dest)
         print("rsync command: %s" % rsync_command)
-        os.system(rsync_command)
+        subprocess.check_call(rsync_command, shell = True)
 
         print("Copying source")
         rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (private_key_file, source_tgz, master_addr, tgz_dest)
         print("rsync command: %s" % rsync_command)
-        os.system(rsync_command)
-        ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s 2>&1 | tee buildout" % (master_addr, private_key_file, script_dest, tgz_dest)
+        subprocess.check_call(rsync_command, shell = True)
+
+        ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, private_key_file, script_dest, tgz_dest, db_version, scala_version, ci_rapids_jar, spark_version, cudf_version, cuda_version, ci_cudf_jar)
         print("ssh command: %s" % ssh_command)
-        os.system(ssh_command)
+        subprocess.check_call(ssh_command, shell = True)
 
         print("Copying built tarball back")
         rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (private_key_file, master_addr)
         print("rsync command to get built tarball: %s" % rsync_command)
-        os.system(rsync_command)
+        subprocess.check_call(rsync_command, shell = True)
 
 if __name__ == '__main__':
     main()
diff --git a/jenkins/databricks/shutdown.py b/jenkins/databricks/shutdown.py
index 330ecbe9107..1ee91af16ed 100644
--- a/jenkins/databricks/shutdown.py
+++ b/jenkins/databricks/shutdown.py
@@ -31,7 +31,7 @@ def main():
     clusterid = '0617-140138-umiak14'
 
     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c',
+        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:',
                ['workspace=', 'token=', 'clusterid='])
     except getopt.GetoptError:
         print(
diff --git a/pom.xml b/pom.xml
index 5949a616f94..1cdb786df50 100644
--- a/pom.xml
+++ b/pom.xml
@@ -124,6 +124,9 @@
             databricks
+
+                true
+
@@ -148,6 +151,7 @@
     UTF-8
     UTF-8
     not qarun
+    false
@@ -420,6 +424,9 @@
         org.apache.rat
         apache-rat-plugin
         0.13
+
+          ${rat.consoleOutput}
+
           verify
@@ -473,6 +480,7 @@
         .gnupg/**
         pom.xml.asc
         jenkins/databricks/*.patch
+        *.jar