Merge remote-tracking branch 'origin/branch-22.10' into SP-5416
Signed-off-by: Raza Jafri <rjafri@nvidia.com>
razajafri committed Aug 4, 2022
2 parents 1d66c2b + 8b497c5 commit 19f9ebc
Showing 103 changed files with 16,375 additions and 5,528 deletions.
68 changes: 64 additions & 4 deletions .github/workflows/mvn-verify-check.yml
@@ -13,7 +13,7 @@
# limitations under the License.

# A workflow to run mvn verify check
name: Maven verify checks (Scala style, Compile and Doc-gen w/ base Spark version)
name: mvn[compile,RAT,scalastyle,docgen]

on:
pull_request:
@@ -24,7 +24,61 @@ concurrency:
cancel-in-progress: true

jobs:
mvn-verify-check:
get-noSnapshot-versions-from-dist:
runs-on: ubuntu-latest
outputs:
sparkHeadVersion: ${{ steps.noSnapshotVersionsStep.outputs.headVersion }}
sparkTailVersions: ${{ steps.noSnapshotVersionsStep.outputs.tailVersions }}
steps:
- uses: actions/checkout@v2 # refs/pull/:prNumber/merge

- name: Setup Java and Maven Env
uses: actions/setup-java@v3
with:
distribution: adopt
java-version: 8

- name: all noSnapshot versions
id: noSnapshotVersionsStep
run: |
set -x
noSnapshotVersionsStr=$(mvn -B help:evaluate -q -pl dist -PnoSnapshots -Dexpression=included_buildvers -DforceStdout)
noSnapshotVersionsStr=$(echo $noSnapshotVersionsStr)
noSnapshotVersionsArr=($(IFS=", "; echo $noSnapshotVersionsStr))
tailNoSnapshotVersionsArr=(${noSnapshotVersionsArr[@]:1})
svArrBody=$(printf ",{\"spark-version\":\"%s\"}" "${tailNoSnapshotVersionsArr[@]}")
svArrBody=${svArrBody:1}
svJsonStr=$(printf {\"include\":[%s]} $svArrBody)
echo ::set-output name=headVersion::${noSnapshotVersionsArr[0]}
echo ::set-output name=tailVersions::$svJsonStr
package-aggregator:
needs: get-noSnapshot-versions-from-dist
strategy:
matrix: ${{ fromJSON(needs.get-noSnapshot-versions-from-dist.outputs.sparkTailVersions) }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2 # refs/pull/:prNumber/merge

- name: Setup Java and Maven Env
uses: actions/setup-java@v3
with:
distribution: adopt
java-version: 8

- name: package aggregator check
run: >
mvn -B package -pl aggregator -am
-P 'individual,pre-merge'
-Dbuildver=${{ matrix.spark-version }}
-DskipTests
-Dskip
-Dmaven.javadoc.skip
-Dmaven.scalastyle.skip=true
-Drat.skip=true
verify-all-modules-with-headSparkVersion:
needs: get-noSnapshot-versions-from-dist
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2 # refs/pull/:prNumber/merge
@@ -36,5 +90,11 @@ jobs:
java-version: 8

# includes RAT, code style and doc-gen checks of default shim
- name: mvn verify check
run: mvn verify -P 'individual,pre-merge' -pl dist -am -DskipTests -Dskip -Dmaven.javadoc.skip
- name: verify all modules with lowest-supported Spark version
run: >
mvn -B verify
-P 'individual,pre-merge'
-Dbuildver=${{ needs.get-noSnapshot-versions-from-dist.outputs.sparkHeadVersion }}
-DskipTests
-Dskip
-Dmaven.javadoc.skip
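
The shell in the `all noSnapshot versions` step above is dense, so here is a minimal, locally runnable sketch of the same matrix-building logic. The hard-coded version list is a hypothetical stand-in for the live `mvn -B help:evaluate ... -Dexpression=included_buildvers -DforceStdout` call; the emitted JSON is what `fromJSON(...)` expands into the `package-aggregator` matrix, while the head version feeds the `verify-all-modules-with-headSparkVersion` job.

```bash
#!/usr/bin/env bash
# Hypothetical included_buildvers value; the workflow obtains this from Maven.
noSnapshotVersionsStr="311, 321cdh, 312, 313, 320, 321, 322, 330"

# Split on commas/spaces into an array, then peel off the head version.
noSnapshotVersionsArr=($(IFS=", "; echo $noSnapshotVersionsStr))
tailNoSnapshotVersionsArr=(${noSnapshotVersionsArr[@]:1})

# Wrap each remaining version in a {"spark-version": ...} object and join
# them into the include-list JSON consumed by the matrix strategy.
svArrBody=$(printf ",{\"spark-version\":\"%s\"}" "${tailNoSnapshotVersionsArr[@]}")
svArrBody=${svArrBody:1}  # drop the leading comma
svJsonStr=$(printf '{"include":[%s]}' "$svArrBody")

echo "headVersion:  ${noSnapshotVersionsArr[0]}"  # -> 311
echo "tailVersions: $svJsonStr"
# -> {"include":[{"spark-version":"321cdh"},{"spark-version":"312"},...]}
```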
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,7 +1,7 @@
# keep the lines below sorted

# standard .bloop and version-specific .bloop3xy generated by buildall -gb
.bloop*/
.bloop*
.cache
.classpath
.DS_Store
12 changes: 12 additions & 0 deletions CONTRIBUTING.md
@@ -218,9 +218,21 @@ not clobbered by repeated `bloopInstall` Maven plugin invocations, and it uses
[jq](https://stedolan.github.io/jq/) to post-process JSON-formatted project files such that they
compile project classes into a non-overlapping set of output directories.

To activate the Spark dependency version 3XY you are currently working with, update
the symlink `.bloop` to point to the corresponding directory `.bloop-spark3XY`.

Example usage:
```Bash
./build/buildall --generate-bloop --profile=311,330
rm -vf .bloop
ln -s .bloop-spark330 .bloop
```
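
To switch shims later, repoint the symlink; a sketch assuming `.bloop-spark311` was also generated by the `--profile=311,330` invocation above:

```Bash
rm -vf .bloop
ln -s .bloop-spark311 .bloop
```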

You can now open the spark-rapids project as a
[BSP project in IDEA](https://www.jetbrains.com/help/idea/bsp-support.html)

Read on for VS Code Scala Metals instructions.

# Bloop, Scala Metals, and Visual Studio Code

_Last tested with 1.63.0-insider (Universal) Commit: bedf867b5b02c1c800fbaf4d6ce09cefba_
2 changes: 1 addition & 1 deletion README.md
@@ -75,7 +75,7 @@ as a `provided` dependency.
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark_2.12</artifactId>
<version>22.08.0-SNAPSHOT</version>
<version>22.10.0-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
```
4 changes: 2 additions & 2 deletions aggregator/pom.xml
@@ -22,12 +22,12 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>22.08.0-SNAPSHOT</version>
<version>22.10.0-SNAPSHOT</version>
</parent>
<artifactId>rapids-4-spark-aggregator_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Aggregator</name>
<description>Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark</description>
<version>22.08.0-SNAPSHOT</version>
<version>22.10.0-SNAPSHOT</version>

<properties>
<!--
4 changes: 2 additions & 2 deletions api_validation/pom.xml
@@ -22,10 +22,10 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>22.08.0-SNAPSHOT</version>
<version>22.10.0-SNAPSHOT</version>
</parent>
<artifactId>rapids-4-spark-api-validation</artifactId>
<version>22.08.0-SNAPSHOT</version>
<version>22.10.0-SNAPSHOT</version>

<profiles>
<profile>
90 changes: 37 additions & 53 deletions build/buildall
@@ -22,6 +22,7 @@ shopt -s extglob
BLOOP_VERSION=${BLOOP_VERSION:-"1.4.13"}
BLOOP_SCALA_VERSION=${BLOOP_SCALA_VERSION:-"2.13"}
SKIP_CLEAN=1
BUILD_ALL_DEBUG=0

function print_usage() {
echo "Usage: buildall [OPTION]"
@@ -52,18 +53,20 @@ function print_usage() {
}

function bloopInstall() {
BLOOP_DIR="${BLOOP_DIR:-$PWD/.bloop}"
mkdir -p $BLOOP_DIR
rm -f $BLOOP_DIR/*

time (
for bv in $SPARK_SHIM_VERSIONS; do
bloop_config_dir="$PWD/.bloop$bv"
mkdir -p "$bloop_config_dir"
rm -f "$bloop_config_dir"/*
[[ "$BUILD_ALL_DEBUG" == "1" ]] && set -x

local bloopTmpDir=$(mktemp -d /tmp/tmp.bloop.XXXXXX)

$MVN install ch.epfl.scala:maven-bloop_${BLOOP_SCALA_VERSION}:${BLOOP_VERSION}:bloopInstall -pl dist -am \
-Dbloop.configDirectory="$bloop_config_dir" \
time (
bloopDirsGenerated=()
for bv in "${SPARK_SHIM_VERSIONS[@]}"; do
bloopTmpConfigDir="$bloopTmpDir/.bloop$bv"
mkdir -p $bloopTmpConfigDir
$MVN -B clean install \
ch.epfl.scala:maven-bloop_${BLOOP_SCALA_VERSION}:${BLOOP_VERSION}:bloopInstall \
-pl aggregator -am \
-Dbloop.configDirectory="$bloopTmpConfigDir" \
-DdownloadSources=true \
-Dbuildver="$bv" \
-DskipTests \
@@ -73,22 +76,26 @@ function bloopInstall() {
-Dmaven.updateconfig.skip=true

specifier="spark$bv"
for bloop_json in $(echo $bloop_config_dir/*.json); do
IFS="/" <<< "$bloop_json" read -ra bloop_json_parts
last_idx=$((${#bloop_json_parts[@]} - 1))
file="${bloop_json_parts[$last_idx]}"
project="${file%.json}-$specifier"
< $bloop_json jq \
--arg specifier "$specifier" \
'.project.out=.project.out + "/" + $specifier | .project.name=.project.name + "-" + $specifier' \
> "$BLOOP_DIR/$project.json"
done
bloopDir=$PWD/.bloop-$specifier
rm -rf $bloopDir
mv $bloopTmpConfigDir $bloopDir
echo "generated bloop files under $bloopDir"
bloopDirsGenerated+=($bloopDir)
done

echo "Generated Bloop files under $BLOOP_DIR"
echo "#### Created bloop projects ${bloopDirsGenerated[@]}"
echo "Execute"
echo " ln -s .bloop-spark3XY .bloop"
echo "to make it an active Bloop project in VS Code Scala Metals"
)
}

function versionsFromDistProfile() {
[[ "$BUILD_ALL_DEBUG" == "1" ]] && set -x
versionRawStr=$(mvn -B help:evaluate -q -pl dist -P"$1" -Dexpression=included_buildvers -DforceStdout)
versionStr=${versionRawStr//[$'\n',]/}
echo -n $versionStr
}

FINAL_OP="package"

while [[ "$1" != "" ]] ; do
@@ -117,6 +124,7 @@ case "$1" in
;;

--debug)
BUILD_ALL_DEBUG=1
set -x
;;

@@ -150,40 +158,15 @@ DIST_PROFILE=${DIST_PROFILE:-"noSnapshots"}
case $DIST_PROFILE in

snapshots?(WithDatabricks))
SPARK_SHIM_VERSIONS=(
311
321cdh
312
313
314
320
321
322
330
331
)
SPARK_SHIM_VERSIONS=($(versionsFromDistProfile "snapshots"))
;;

noSnapshots?(WithDatabricks))
SPARK_SHIM_VERSIONS=(
311
321cdh
312
313
320
321
322
330
)
SPARK_SHIM_VERSIONS=($(versionsFromDistProfile "noSnapshots"))
;;

minimumFeatureVersionMix)
SPARK_SHIM_VERSIONS=(
321cdh
312
320
330
)
SPARK_SHIM_VERSIONS=($(versionsFromDistProfile "minimumFeatureVersionMix"))
;;

3*)
@@ -198,6 +181,9 @@ case $DIST_PROFILE in

esac

echo "Spark versions involved: ${SPARK_SHIM_VERSIONS[@]} ..."
export MVN_BASE_DIR=$($MVN help:evaluate -Dexpression=project.basedir -q -DforceStdout)

if [[ "$GEN_BLOOP" == "true" ]]; then
bloopInstall
exit 0
@@ -217,10 +203,8 @@ fi

echo "Building a combined dist jar with Shims for ${SPARK_SHIM_VERSIONS[@]} ..."

export MVN_BASE_DIR=$($MVN help:evaluate -Dexpression=project.basedir -q -DforceStdout)

function build_single_shim() {
set -x
[[ "$BUILD_ALL_DEBUG" == "1" ]] && set -x
BUILD_VER=$1
mkdir -p "$MVN_BASE_DIR/target"
(( BUILD_PARALLEL == 1 || NUM_SHIMS == 1 )) && LOG_FILE="/dev/tty" || \
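
For context on the `versionsFromDistProfile` helper introduced above: it replaces the hard-coded shim lists by reading the dist profile's `included_buildvers` property from Maven. A minimal sketch of the parsing it performs, with a hypothetical multi-line property value standing in for the live `mvn help:evaluate` output:

```bash
#!/usr/bin/env bash
# Hypothetical raw value of included_buildvers; pom.xml properties often
# wrap across indented lines like this.
versionRawStr=$'311,\n    321cdh,\n    312,\n    330'

# Delete every newline and comma; the indentation that remains is plain
# whitespace, so the unquoted expansion word-splits into one element per version.
versionStr=${versionRawStr//[$'\n',]/}
SPARK_SHIM_VERSIONS=($versionStr)

echo "Spark versions involved: ${SPARK_SHIM_VERSIONS[@]}"
# -> Spark versions involved: 311 321cdh 312 330
```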
4 changes: 2 additions & 2 deletions common/pom.xml
@@ -24,13 +24,13 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>22.08.0-SNAPSHOT</version>
<version>22.10.0-SNAPSHOT</version>
</parent>

<artifactId>rapids-4-spark-common_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Common</name>
<description>Utility code that is common across the RAPIDS Accelerator projects</description>
<version>22.08.0-SNAPSHOT</version>
<version>22.10.0-SNAPSHOT</version>

<dependencies>
<dependency>
4 changes: 2 additions & 2 deletions dist/pom.xml
@@ -22,12 +22,12 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>22.08.0-SNAPSHOT</version>
<version>22.10.0-SNAPSHOT</version>
</parent>
<artifactId>rapids-4-spark_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Distribution</name>
<description>Creates the distribution package of the RAPIDS plugin for Apache Spark</description>
<version>22.08.0-SNAPSHOT</version>
<version>22.10.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
2 changes: 1 addition & 1 deletion dist/unshimmed-from-each-spark3xx.txt
@@ -1,6 +1,6 @@
com/nvidia/spark/rapids/*/RapidsShuffleManager*
com/nvidia/spark/rapids/AvroProvider.class
com/nvidia/spark/rapids/HiveProvider.class
com/nvidia/spark/rapids/IcebergProvider.class
com/nvidia/spark/rapids/iceberg/IcebergProvider.class
org/apache/spark/sql/rapids/shims/*/ProxyRapidsShuffleInternalManager*
spark-*-info.properties
2 changes: 1 addition & 1 deletion docs/FAQ.md
@@ -418,7 +418,7 @@ There are multiple reasons why this is a problematic configuration:

Yes, but it requires support from the underlying cluster manager to isolate the MIG GPU instance
for each executor (e.g.: by setting `CUDA_VISIBLE_DEVICES`,
[YARN with docker isolation](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.08/examples/MIG-Support)
[YARN with docker isolation](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.10/examples/MIG-Support)
or other means).

Note that MIG is not recommended for use with the RAPIDS Accelerator since it significantly
2 changes: 1 addition & 1 deletion docs/additional-functionality/ml-integration.md
@@ -40,7 +40,7 @@ access to any of the memory that RMM is holding.
## Spark ML Algorithms Supported by RAPIDS Accelerator

The [spark-rapids-examples repository](https://github.com/NVIDIA/spark-rapids-examples) provides a
[working example](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.08/examples/ML+DL-Examples/Spark-cuML/pca)
[working example](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.10/examples/ML+DL-Examples/Spark-cuML/pca)
of accelerating the `transform` API for
[Principal Component Analysis (PCA)](https://spark.apache.org/docs/latest/mllib-dimensionality-reduction#principal-component-analysis-pca).
The example leverages the [RAPIDS accelerated UDF interface](rapids-udfs.md) to provide a native
2 changes: 1 addition & 1 deletion docs/additional-functionality/rapids-udfs.md
@@ -135,7 +135,7 @@ type `DECIMAL64(scale=-2)`.
## RAPIDS Accelerated UDF Examples

<!-- Note: should update the branch name to tag when releasing-->
Source code for examples of RAPIDS accelerated UDFs is provided in the [udf-examples](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.08/examples/UDF-Examples/RAPIDS-accelerated-UDFs) project.
Source code for examples of RAPIDS accelerated UDFs is provided in the [udf-examples](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.10/examples/UDF-Examples/RAPIDS-accelerated-UDFs) project.

## GPU Support for Pandas UDF
