From 05cd5a7fb5bd121851bd18dab0a090ab7bcd0f1e Mon Sep 17 00:00:00 2001
From: Alex Zhang
Date: Tue, 6 Jul 2021 15:58:10 +0800
Subject: [PATCH] Test parallel pre-merge build

Signed-off-by: Alex Zhang
---
 jenkins/Jenkinsfile                  | 106 ++++++++++++++++++++++++++
 jenkins/Jenkinsfile-blossom.premerge | 108 ++++++++++++++++++++-------
 jenkins/spark-premerge-build-ut.sh   |  72 ++++++++++++++++++
 jenkins/spark-premerge-build.sh      |   2 +-
 4 files changed, 260 insertions(+), 28 deletions(-)
 create mode 100644 jenkins/Jenkinsfile
 create mode 100755 jenkins/spark-premerge-build-ut.sh

diff --git a/jenkins/Jenkinsfile b/jenkins/Jenkinsfile
new file mode 100644
index 000000000000..a0ff5a092b3f
--- /dev/null
+++ b/jenkins/Jenkinsfile
@@ -0,0 +1,106 @@
+#!/usr/bin/env groovy
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *
+ * Jenkinsfile to test running the pre-merge integration tests and unit tests as parallel stages on Blossom.
+ */
+@Library(['shared-libs', 'blossom-lib']) _
+
+def IMAGE = "${ArtifactoryConstants.ARTIFACTORY_NAME}/sw-spark-docker/plugin:dev-ubuntu18-cuda11.0-blossom-dev"
+
+pipeline {
+    agent {
+        kubernetes {
+            label "test-parallel-${BUILD_NUMBER}"
+            cloud 'sc-ipp-blossom-prod'
+        }
+    }
+
+    options {
+        ansiColor('xterm')
+        timeout(time: 2, unit: 'HOURS')
+        buildDiscarder(logRotator(numToKeepStr: '20'))
+    }
+
+    parameters {
+        string(name: 'GPU_POOL', defaultValue: 'RESERVED_POOL', description: 'GPU pool name')
+    }
+
+    environment {
+        MVN_URM_MIRROR = '-s jenkins/settings.xml -P mirror-apache-to-urm'
+        LIBCUDF_KERNEL_CACHE_PATH = '/tmp/.cudf'
+        URM_URL = "https://${ArtifactoryConstants.ARTIFACTORY_NAME}/artifactory/sw-spark-maven"
+        CUDA_CLASSIFIER = 'cuda11'
+    }
+
+    stages {
+        stage('Test Parallel for Pre-merge') {
+            parallel {
+                stage('Integration Test') {
+                    options {
+                        lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
+                    }
+                    agent {
+                        kubernetes {
+                            label "test-parallel-it-${BUILD_TAG}"
+                            cloud 'sc-ipp-blossom-prod'
+                            yaml pod.getGPUYAML("${IMAGE}", "${env.GPU_RESOURCE}", '8', '32Gi')
+                        }
+                    }
+
+                    steps {
+                        container('gpu') {
+                            script {
+                                sh "cat /proc/cpuinfo; cat /proc/meminfo"
+                                sh "jenkins/spark-premerge-build.sh"
+                                step([$class : 'JacocoPublisher',
+                                      execPattern : '**/target/jacoco.exec',
+                                      classPattern : 'target/jacoco_classes/',
+                                      sourcePattern : 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,sql-plugin/src/main/java/,sql-plugin/src/main/scala/,shims/spark311/src/main/scala/,shims/spark301db/src/main/scala/,shims/spark301/src/main/scala/,shims/spark302/src/main/scala/,shims/spark303/src/main/scala/,shims/spark304/src/main/scala/,shims/spark312/src/main/scala/,shims/spark313/src/main/scala/',
+                                      sourceInclusionPattern: '**/*.java,**/*.scala'
+                                ])
+                            }
+                        }
+                    }
+                }
+
+                stage('Unit Test') {
+                    options {
+                        lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
+                    }
+                    agent {
+                        kubernetes {
+                            label "test-parallel-ut-${BUILD_TAG}"
+                            cloud 'sc-ipp-blossom-prod'
+                            yaml pod.getGPUYAML("${IMAGE}", "${env.GPU_RESOURCE}", '8', '32Gi')
pod.getGPUYAML("${IMAGE}", "${env.GPU_RESOURCE}", '8', '32Gi') + } + } + + steps { + container('gpu') { + script { + sh "cat /proc/cpuinfo; cat /proc/meminfo" + sh "jenkins/spark-premerge-build-ut.sh" + } + } + } + } + } + } + } +} diff --git a/jenkins/Jenkinsfile-blossom.premerge b/jenkins/Jenkinsfile-blossom.premerge index 10ac62f39f08..9f9a912c93a9 100644 --- a/jenkins/Jenkinsfile-blossom.premerge +++ b/jenkins/Jenkinsfile-blossom.premerge @@ -22,7 +22,8 @@ */ @Library(['shared-libs', 'blossom-lib']) _ -@Library('blossom-github-lib@master') +// @Library('blossom-github-lib@master') +@Library('blossom-github-lib-alex@nvbug-3339178') import ipp.blossom.* def githubHelper // blossom github helper @@ -151,33 +152,81 @@ pipeline { !skipped } } - options { - // We have to use params to pass the resource label in options block, - // this is a limitation of declarative pipeline. And we need to lock resource before agent start - lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE') - } - agent { - kubernetes { - label "premerge-test-${BUILD_TAG}" - cloud 'sc-ipp-blossom-prod' - yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi - workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false) - customWorkspace "${CUSTOM_WORKSPACE}" + + failFast true + parallel { + stage('Integration Test') { + options { + // We have to use params to pass the resource label in options block, + // this is a limitation of declarative pipeline. And we need to lock resource before agent start + lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE') + } + agent { + kubernetes { + label "premerge-test-it-${BUILD_TAG}" + cloud 'sc-ipp-blossom-prod' + yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi + workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false) + customWorkspace "${CUSTOM_WORKSPACE}" + } + } + + steps { + script { + container('gpu') { + // TODO: improve resource management + timeout(time: 4, unit: 'HOURS') { // step only timeout for test run + sh "$PREMERGE_SCRIPT" + step([$class : 'JacocoPublisher', + execPattern : '**/target/jacoco.exec', + classPattern : 'target/jacoco_classes/', + sourcePattern : 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,sql-plugin/src/main/java/,sql-plugin/src/main/scala/,shims/spark311/src/main/scala/,shims/spark301db/src/main/scala/,shims/spark301/src/main/scala/,shims/spark302/src/main/scala/,shims/spark303/src/main/scala/,shims/spark304/src/main/scala/,shims/spark312/src/main/scala/,shims/spark313/src/main/scala/', + sourceInclusionPattern: '**/*.java,**/*.scala' + ]) + } + } + } + } } - } - steps { - script { - container('gpu') { - // TODO: improve resource management - timeout(time: 4, unit: 'HOURS') { // step only timeout for test run - sh "$PREMERGE_SCRIPT" - step([$class : 'JacocoPublisher', - execPattern : '**/target/jacoco.exec', - classPattern : 'target/jacoco_classes/', - sourcePattern : 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,sql-plugin/src/main/java/,sql-plugin/src/main/scala/,shims/spark311/src/main/scala/,shims/spark301db/src/main/scala/,shims/spark301/src/main/scala/,shims/spark302/src/main/scala/,shims/spark303/src/main/scala/,shims/spark304/src/main/scala/,shims/spark312/src/main/scala/,shims/spark313/src/main/scala/', - sourceInclusionPattern: '**/*.java,**/*.scala' - ]) + stage('Unit Test') { + options { + // We have to use 
+                        // this is a limitation of declarative pipeline. And we need to lock resource before agent start
+                        lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
+                    }
+                    agent {
+                        kubernetes {
+                            label "premerge-test-ut-${BUILD_TAG}"
+                            cloud 'sc-ipp-blossom-prod'
+                            yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi
+                            workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false)
+                            customWorkspace "${CUSTOM_WORKSPACE}-ut"
+                        }
+                    }
+
+                    steps {
+                        script {
+                            checkout(
+                                changelog: false,
+                                poll: true,
+                                scm: [
+                                    $class: 'GitSCM', branches: [[name: githubHelper.getMergedSHA()]],
+                                    doGenerateSubmoduleConfigurations: false,
+                                    submoduleCfg: [],
+                                    userRemoteConfigs: [[
+                                        credentialsId: 'github-token',
+                                        url: githubHelper.getCloneUrl(),
+                                        refspec: '+refs/pull/*/merge:refs/remotes/origin/pr/*']]
+                                ]
+                            )
+
+                            container('gpu') {
+                                // TODO: improve resource management
+                                timeout(time: 2, unit: 'HOURS') { // step only timeout for test run
+                                    sh "$JENKINS_ROOT/spark-premerge-build-ut.sh"
+                                }
+                            }
                         }
                     }
                 }
@@ -197,9 +246,14 @@
             } else {
                 // upload log only in case of build failure
                 def guardWords = ["gitlab.*?\\.com", "urm.*?\\.com"]
+
+                // hide GPU info
+                guardWords.add("nvidia-smi(?s)(.*?)(?=jenkins/version-def.sh)")
+
                 def logPattern = "### BEGIN OF TEST LOG ###.*### END OF TEST LOG ###"
 
-                githubHelper.uploadPartialLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords, logPattern)
+                // githubHelper.uploadPartialLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords, logPattern)
+                githubHelper.uploadParallelLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords)
 
                 githubHelper.updateCommitStatus("$BUILD_URL", "Fail", GitHubCommitState.FAILURE)
             }

diff --git a/jenkins/spark-premerge-build-ut.sh b/jenkins/spark-premerge-build-ut.sh
new file mode 100755
index 000000000000..aea5c7bc3aef
--- /dev/null
+++ b/jenkins/spark-premerge-build-ut.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -ex
+
+nvidia-smi
+
+function on_exit {
+    echo '### END OF TEST LOG ###'
+}
+trap on_exit EXIT
+
+echo '### BEGIN OF TEST LOG ###'
+
+. jenkins/version-def.sh
+
+# get merge BASE from merged pull request. Log message e.g. "Merge HEAD into BASE"
+# BASE_REF=$(git --no-pager log --oneline -1 | awk '{ print $NF }')
+# file size check for pull request. The size of a committed file should be less than 1.5MiB
+# pre-commit run check-added-large-files --from-ref $BASE_REF --to-ref HEAD
+
+ARTF_ROOT="$WORKSPACE/.download"
+MVN_GET_CMD="mvn org.apache.maven.plugins:maven-dependency-plugin:2.8:get -B \
+    $MVN_URM_MIRROR -DremoteRepositories=$URM_URL \
+    -Ddest=$ARTF_ROOT"
+
+rm -rf $ARTF_ROOT && mkdir -p $ARTF_ROOT
+
+# Download a full version of spark
+$MVN_GET_CMD \
+    -DgroupId=org.apache -DartifactId=spark -Dversion=$SPARK_VER -Dclassifier=bin-hadoop3.2 -Dpackaging=tgz
+
+# export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-bin-hadoop3.2"
+# export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"
+# tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \
+#     rm -f $SPARK_HOME.tgz
+
+# mvn -U -B $MVN_URM_MIRROR '-P!snapshot-shims,pre-merge' clean verify -Dpytest.TEST_TAGS='' \
+#     -Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=4 -Dcuda.version=$CUDA_CLASSIFIER
+
+# Run the unit tests for other Spark versions but don't run full Python integration tests
+# NOT ALL TESTS NEEDED FOR PREMERGE
+# Test the latest stable and snapshot shims for each Spark minor version. All other shims tests should be covered in nightly pipelines
+env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark303tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark304tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark312tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+# Disabled until Spark 3.2 source incompatibility fixed, see https://github.com/NVIDIA/spark-rapids/issues/2052
+#env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark320tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+
+# The jacoco coverage should have been collected, but because of how the shade plugin
+# works and jacoco we need to clean some things up so jacoco will only report for the
+# things we care about
+# mkdir -p target/jacoco_classes/
+# FILE=$(ls dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)
+# pushd target/jacoco_classes/
+# jar xf $FILE
+# rm -rf com/nvidia/shaded/ org/openucx/
+# popd

diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh
index 29e219a86968..fa6adbdd6309 100755
--- a/jenkins/spark-premerge-build.sh
+++ b/jenkins/spark-premerge-build.sh
@@ -55,7 +55,7 @@ mvn -U -B $MVN_URM_MIRROR '-P!snapshot-shims,pre-merge' clean verify -Dpytest.TE
 # NOT ALL TESTS NEEDED FOR PREMERGE
 # Just test one 3.0.X version (base version covers this) and one 3.1.X version.
 # All others shims test should be covered in nightly pipelines
-env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+# env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
 # Disabled until Spark 3.2 source incompatibility fixed, see https://github.com/NVIDIA/spark-rapids/issues/2052
 #env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark320tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER