From 05cd5a7fb5bd121851bd18dab0a090ab7bcd0f1e Mon Sep 17 00:00:00 2001
From: Alex Zhang
Date: Tue, 6 Jul 2021 15:58:10 +0800
Subject: [PATCH] Test parallel pre-merge build

Signed-off-by: Alex Zhang
---
 jenkins/Jenkinsfile                  | 106 ++++++++++++++++++++++++++
 jenkins/Jenkinsfile-blossom.premerge | 108 ++++++++++++++++++++-------
 jenkins/spark-premerge-build-ut.sh   |  72 ++++++++++++++++++
 jenkins/spark-premerge-build.sh      |   2 +-
 4 files changed, 260 insertions(+), 28 deletions(-)
 create mode 100644 jenkins/Jenkinsfile
 create mode 100755 jenkins/spark-premerge-build-ut.sh

diff --git a/jenkins/Jenkinsfile b/jenkins/Jenkinsfile
new file mode 100644
index 000000000000..a0ff5a092b3f
--- /dev/null
+++ b/jenkins/Jenkinsfile
@@ -0,0 +1,106 @@
+#!/usr/bin/env groovy
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *
+ * Jenkinsfile to test running the pre-merge integration tests and unit tests as parallel stages on Blossom.
+ */
+@Library(['shared-libs', 'blossom-lib']) _
+
+def IMAGE = "${ArtifactoryConstants.ARTIFACTORY_NAME}/sw-spark-docker/plugin:dev-ubuntu18-cuda11.0-blossom-dev"
+
+pipeline {
+    agent {
+        kubernetes {
+            label "test-parallel-${BUILD_NUMBER}"
+            cloud 'sc-ipp-blossom-prod'
+        }
+    }
+
+    options {
+        ansiColor('xterm')
+        timeout(time: 2, unit: 'HOURS')
+        buildDiscarder(logRotator(numToKeepStr: '20'))
+    }
+
+    parameters {
+        string(name: 'GPU_POOL', defaultValue: 'RESERVED_POOL', description: 'GPU pool name')
+    }
+
+    environment {
+        MVN_URM_MIRROR = '-s jenkins/settings.xml -P mirror-apache-to-urm'
+        LIBCUDF_KERNEL_CACHE_PATH = '/tmp/.cudf'
+        URM_URL = "https://${ArtifactoryConstants.ARTIFACTORY_NAME}/artifactory/sw-spark-maven"
+        CUDA_CLASSIFIER = 'cuda11'
+    }
+
+    stages {
+        stage('Test Parallel for Pre-merge') {
+            parallel {
+                stage('Integration Test') {
+                    options {
+                        lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
+                    }
+                    agent {
+                        kubernetes {
+                            label "test-parallel-it-${BUILD_TAG}"
+                            cloud 'sc-ipp-blossom-prod'
+                            yaml pod.getGPUYAML("${IMAGE}", "${env.GPU_RESOURCE}", '8', '32Gi')
+                        }
+                    }
+
+                    steps {
+                        container('gpu') {
+                            script {
+                                sh "cat /proc/cpuinfo; cat /proc/meminfo"
+                                sh "jenkins/spark-premerge-build.sh"
+                                step([$class : 'JacocoPublisher',
+                                      execPattern : '**/target/jacoco.exec',
+                                      classPattern : 'target/jacoco_classes/',
+                                      sourcePattern : 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,sql-plugin/src/main/java/,sql-plugin/src/main/scala/,shims/spark311/src/main/scala/,shims/spark301db/src/main/scala/,shims/spark301/src/main/scala/,shims/spark302/src/main/scala/,shims/spark303/src/main/scala/,shims/spark304/src/main/scala/,shims/spark312/src/main/scala/,shims/spark313/src/main/scala/',
+                                      sourceInclusionPattern: '**/*.java,**/*.scala'
+                                ])
+                            }
+                        }
+                    }
+                }
+
+                stage('Unit Test') {
+                    options {
+                        lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
+                    }
+                    agent {
+                        kubernetes {
+                            label "test-parallel-ut-${BUILD_TAG}"
+                            cloud 'sc-ipp-blossom-prod'
+                            yaml pod.getGPUYAML("${IMAGE}", "${env.GPU_RESOURCE}", '8', '32Gi')
pod.getGPUYAML("${IMAGE}", "${env.GPU_RESOURCE}", '8', '32Gi') + } + } + + steps { + container('gpu') { + script { + sh "cat /proc/cpuinfo; cat /proc/meminfo" + sh "jenkins/spark-premerge-build-ut.sh" + } + } + } + } + } + } + } +} diff --git a/jenkins/Jenkinsfile-blossom.premerge b/jenkins/Jenkinsfile-blossom.premerge index 10ac62f39f08..9f9a912c93a9 100644 --- a/jenkins/Jenkinsfile-blossom.premerge +++ b/jenkins/Jenkinsfile-blossom.premerge @@ -22,7 +22,8 @@ */ @Library(['shared-libs', 'blossom-lib']) _ -@Library('blossom-github-lib@master') +// @Library('blossom-github-lib@master') +@Library('blossom-github-lib-alex@nvbug-3339178') import ipp.blossom.* def githubHelper // blossom github helper @@ -151,33 +152,81 @@ pipeline { !skipped } } - options { - // We have to use params to pass the resource label in options block, - // this is a limitation of declarative pipeline. And we need to lock resource before agent start - lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE') - } - agent { - kubernetes { - label "premerge-test-${BUILD_TAG}" - cloud 'sc-ipp-blossom-prod' - yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi - workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false) - customWorkspace "${CUSTOM_WORKSPACE}" + + failFast true + parallel { + stage('Integration Test') { + options { + // We have to use params to pass the resource label in options block, + // this is a limitation of declarative pipeline. And we need to lock resource before agent start + lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE') + } + agent { + kubernetes { + label "premerge-test-it-${BUILD_TAG}" + cloud 'sc-ipp-blossom-prod' + yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi + workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false) + customWorkspace "${CUSTOM_WORKSPACE}" + } + } + + steps { + script { + container('gpu') { + // TODO: improve resource management + timeout(time: 4, unit: 'HOURS') { // step only timeout for test run + sh "$PREMERGE_SCRIPT" + step([$class : 'JacocoPublisher', + execPattern : '**/target/jacoco.exec', + classPattern : 'target/jacoco_classes/', + sourcePattern : 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,sql-plugin/src/main/java/,sql-plugin/src/main/scala/,shims/spark311/src/main/scala/,shims/spark301db/src/main/scala/,shims/spark301/src/main/scala/,shims/spark302/src/main/scala/,shims/spark303/src/main/scala/,shims/spark304/src/main/scala/,shims/spark312/src/main/scala/,shims/spark313/src/main/scala/', + sourceInclusionPattern: '**/*.java,**/*.scala' + ]) + } + } + } + } } - } - steps { - script { - container('gpu') { - // TODO: improve resource management - timeout(time: 4, unit: 'HOURS') { // step only timeout for test run - sh "$PREMERGE_SCRIPT" - step([$class : 'JacocoPublisher', - execPattern : '**/target/jacoco.exec', - classPattern : 'target/jacoco_classes/', - sourcePattern : 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,sql-plugin/src/main/java/,sql-plugin/src/main/scala/,shims/spark311/src/main/scala/,shims/spark301db/src/main/scala/,shims/spark301/src/main/scala/,shims/spark302/src/main/scala/,shims/spark303/src/main/scala/,shims/spark304/src/main/scala/,shims/spark312/src/main/scala/,shims/spark313/src/main/scala/', - sourceInclusionPattern: '**/*.java,**/*.scala' - ]) + stage('Unit Test') { + options { + // We have to use 
+                        // this is a limitation of declarative pipeline. And we need to lock resource before agent start
+                        lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
+                    }
+                    agent {
+                        kubernetes {
+                            label "premerge-test-ut-${BUILD_TAG}"
+                            cloud 'sc-ipp-blossom-prod'
+                            yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi
+                            workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false)
+                            customWorkspace "${CUSTOM_WORKSPACE}-ut"
+                        }
+                    }
+
+                    steps {
+                        script {
+                            checkout(
+                                changelog: false,
+                                poll: true,
+                                scm: [
+                                    $class: 'GitSCM', branches: [[name: githubHelper.getMergedSHA()]],
+                                    doGenerateSubmoduleConfigurations: false,
+                                    submoduleCfg: [],
+                                    userRemoteConfigs: [[
+                                        credentialsId: 'github-token',
+                                        url: githubHelper.getCloneUrl(),
+                                        refspec: '+refs/pull/*/merge:refs/remotes/origin/pr/*']]
+                                ]
+                            )
+
+                            container('gpu') {
+                                // TODO: improve resource management
+                                timeout(time: 2, unit: 'HOURS') { // step only timeout for test run
+                                    sh "$JENKINS_ROOT/spark-premerge-build-ut.sh"
+                                }
+                            }
                         }
                     }
                 }
@@ -197,9 +246,14 @@
             } else {
                 // upload log only in case of build failure
                 def guardWords = ["gitlab.*?\\.com", "urm.*?\\.com"]
+
+                // hide GPU info
+                guardWords.add("nvidia-smi(?s)(.*?)(?=jenkins/version-def.sh)")
+
                 def logPattern = "### BEGIN OF TEST LOG ###.*### END OF TEST LOG ###"
 
-                githubHelper.uploadPartialLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords, logPattern)
+                // githubHelper.uploadPartialLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords, logPattern)
+                githubHelper.uploadParallelLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords)
 
                 githubHelper.updateCommitStatus("$BUILD_URL", "Fail", GitHubCommitState.FAILURE)
             }

diff --git a/jenkins/spark-premerge-build-ut.sh b/jenkins/spark-premerge-build-ut.sh
new file mode 100755
index 000000000000..aea5c7bc3aef
--- /dev/null
+++ b/jenkins/spark-premerge-build-ut.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -ex
+
+nvidia-smi
+
+function on_exit {
+    echo '### END OF TEST LOG ###'
+}
+trap on_exit EXIT
+
+echo '### BEGIN OF TEST LOG ###'
+
+. jenkins/version-def.sh
+
+# get merge BASE from merged pull request. Log message e.g. "Merge HEAD into BASE"
+# BASE_REF=$(git --no-pager log --oneline -1 | awk '{ print $NF }')
+# file size check for pull request. The size of a committed file should be less than 1.5MiB
+# pre-commit run check-added-large-files --from-ref $BASE_REF --to-ref HEAD
+
+ARTF_ROOT="$WORKSPACE/.download"
+MVN_GET_CMD="mvn org.apache.maven.plugins:maven-dependency-plugin:2.8:get -B \
+    $MVN_URM_MIRROR -DremoteRepositories=$URM_URL \
+    -Ddest=$ARTF_ROOT"
+
+rm -rf $ARTF_ROOT && mkdir -p $ARTF_ROOT
+
+# Download a full version of spark
+$MVN_GET_CMD \
+    -DgroupId=org.apache -DartifactId=spark -Dversion=$SPARK_VER -Dclassifier=bin-hadoop3.2 -Dpackaging=tgz
+
+# export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-bin-hadoop3.2"
+# export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"
+# tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \
+#     rm -f $SPARK_HOME.tgz
+
+# mvn -U -B $MVN_URM_MIRROR '-P!snapshot-shims,pre-merge' clean verify -Dpytest.TEST_TAGS='' \
+#     -Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=4 -Dcuda.version=$CUDA_CLASSIFIER
+
+# Run the unit tests for other Spark versions but don't run full Python integration tests
+# NOT ALL TESTS NEEDED FOR PREMERGE
+# Test the latest stable and snapshot shims for each Spark minor version. All other shims tests should be covered in nightly pipelines
+env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark303tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark304tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark312tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+# Disabled until Spark 3.2 source incompatibility fixed, see https://github.com/NVIDIA/spark-rapids/issues/2052
+#env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark320tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+
+# The jacoco coverage should have been collected, but because of how the shade plugin
+# works and jacoco we need to clean some things up so jacoco will only report for the
+# things we care about
+# mkdir -p target/jacoco_classes/
+# FILE=$(ls dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)
+# pushd target/jacoco_classes/
+# jar xf $FILE
+# rm -rf com/nvidia/shaded/ org/openucx/
+# popd

diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh
index 29e219a86968..fa6adbdd6309 100755
--- a/jenkins/spark-premerge-build.sh
+++ b/jenkins/spark-premerge-build.sh
@@ -55,7 +55,7 @@ mvn -U -B $MVN_URM_MIRROR '-P!snapshot-shims,pre-merge' clean verify -Dpytest.TE
 # NOT ALL TESTS NEEDED FOR PREMERGE
 # Just test one 3.0.X version (base version covers this) and one 3.1.X version.
 # All others shims test should be covered in nightly pipelines
-env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
+# env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
 # Disabled until Spark 3.2 source incompatibility fixed, see https://github.com/NVIDIA/spark-rapids/issues/2052
 #env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark320tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER