
Commit

Use a bundled spark-rapids-jni dependency instead of external cudf dependency (#5249)

Signed-off-by: Jason Lowe <jlowe@nvidia.com>
jlowe authored Apr 18, 2022
1 parent 8e353ff commit ddb83b2
Showing 28 changed files with 116 additions and 207 deletions.
2 changes: 1 addition & 1 deletion api_validation/README.md
@@ -11,7 +11,7 @@ Validation fails when:

# Dependencies

It requires cudf, rapids-4-spark and spark jars.
It requires spark-rapids-jni, rapids-4-spark and Spark jars.

# Running the script

5 changes: 3 additions & 2 deletions api_validation/pom.xml
@@ -109,8 +109,8 @@
<scope>test</scope>
</dependency>
<dependency>
<groupId>ai.rapids</groupId>
<artifactId>cudf</artifactId>
<groupId>com.nvidia</groupId>
<artifactId>spark-rapids-jni</artifactId>
<classifier>${cuda.version}</classifier>
<scope>provided</scope>
</dependency>
@@ -126,6 +126,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${cuda.version}</classifier>
<scope>provided</scope>
</dependency>
</dependencies>
41 changes: 0 additions & 41 deletions build/dependency-info.sh

This file was deleted.

6 changes: 6 additions & 0 deletions dist/maven-antrun/build-parallel-worlds.xml
@@ -150,6 +150,12 @@
</condition>
</fail>

<!-- Remove the explicitly unshimmed files from the common directory -->
<delete>
<fileset dir="${project.build.directory}/parallel-world/spark3xx-common"
includesfile="${project.basedir}/unshimmed-common-from-spark311.txt"/>
</delete>

<echo level="info">Generating dependency-reduced-pom.xml</echo>
<resources id="aggregatorDependencyRegexWithoutWhitespace">
<string>&lt;dependency&gt;</string>
15 changes: 4 additions & 11 deletions dist/pom.xml
@@ -36,17 +36,6 @@
<classifier>${spark.version.classifier}</classifier>
<scope>compile</scope>
</dependency>

<!--
manually promoting provided cudf as a direct dependency
-->
<dependency>
<groupId>ai.rapids</groupId>
<artifactId>cudf</artifactId>
<version>${cudf.version}</version>
<classifier>${cuda.version}</classifier>
<scope>compile</scope>
</dependency>
</dependencies>

<properties>
@@ -228,6 +217,9 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<classifier>${cuda.version}</classifier>
</configuration>
<executions>
<execution>
<id>default-jar</id>
@@ -241,6 +233,7 @@
</goals>
<configuration>
<classesDirectory>${project.build.directory}/parallel-world</classesDirectory>
<classifier>${cuda.version}</classifier>
<excludes>
<!-- get rid of all maven poms from shim builds -->
<exclude>META-INF/maven/**</exclude>
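With the `${cuda.version}` classifier now set on `maven-jar-plugin`, the dist build produces a classified artifact instead of an unclassified jar plus a separate cudf jar. A minimal sketch of the expected output, assuming the `22.06.0-SNAPSHOT` version and `cuda11` classifier that appear elsewhere in this commit:

```shell
# Sketch only: the jar name assumes version 22.06.0-SNAPSHOT and classifier cuda11,
# matching the example shown in docs/configs.md below.
ls dist/target/
# rapids-4-spark_2.12-22.06.0-SNAPSHOT-cuda11.jar
```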
24 changes: 12 additions & 12 deletions dist/scripts/binary-dedupe.sh
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -55,21 +55,21 @@ export SPARK3XX_COMMON_DIR="$PWD/spark3xx-common"
echo "Retrieving class files hashing to a single value ..."


echo "$((++STEP))/ SHA1 of all classes > tmp-sha1-class.txt"
find ./parallel-world/spark3* -type f -name '*.class' | \
xargs $SHASUM > tmp-sha1-class.txt
echo "$((++STEP))/ SHA1 of all non-META files > tmp-sha1-files.txt"
find ./parallel-world/spark3* -name META-INF -prune -o \( -type f -print \) | \
xargs $SHASUM > tmp-sha1-files.txt

echo "$((++STEP))/ make shim column 1 > tmp-shim-sha-package-class.txt"
< tmp-sha1-class.txt awk -F/ '$1=$1' | \
echo "$((++STEP))/ make shim column 1 > tmp-shim-sha-package-files.txt"
< tmp-sha1-files.txt awk -F/ '$1=$1' | \
awk '{checksum=$1; shim=$4; $1=shim; $2=$3=""; $4=checksum; print $0}' | \
tr -s ' ' > tmp-shim-sha-package-class.txt
tr -s ' ' > tmp-shim-sha-package-files.txt

echo "$((++STEP))/ sort by path, sha1; output first from each group > tmp-count-shim-sha-package-class.txt"
sort -k3 -k2,2 -u tmp-shim-sha-package-class.txt | \
uniq -f 2 -c > tmp-count-shim-sha-package-class.txt
echo "$((++STEP))/ sort by path, sha1; output first from each group > tmp-count-shim-sha-package-files.txt"
sort -k3 -k2,2 -u tmp-shim-sha-package-files.txt | \
uniq -f 2 -c > tmp-count-shim-sha-package-files.txt

echo "$((++STEP))/ class files with unique sha1 > $SPARK3XX_COMMON_TXT"
grep '^\s\+1 .*' tmp-count-shim-sha-package-class.txt | \
echo "$((++STEP))/ files with unique sha1 > $SPARK3XX_COMMON_TXT"
grep '^\s\+1 .*' tmp-count-shim-sha-package-files.txt | \
awk '{$1=""; $3=""; print $0 }' | \
tr -s ' ' | sed 's/\ /\//g' > "$SPARK3XX_COMMON_TXT"

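Since the old and new lines are interleaved above, here is a consolidated sketch of the updated pipeline: it now hashes every non-META-INF file rather than only `.class` files. `$SHASUM` and `$SPARK3XX_COMMON_TXT` are assumed to be defined in the unchanged portions of binary-dedupe.sh.

```shell
# Consolidated sketch of the new steps; $SHASUM and $SPARK3XX_COMMON_TXT come from
# earlier, unchanged parts of binary-dedupe.sh.
find ./parallel-world/spark3* -name META-INF -prune -o \( -type f -print \) | \
    xargs $SHASUM > tmp-sha1-files.txt                        # SHA1 of all non-META files

< tmp-sha1-files.txt awk -F/ '$1=$1' | \
    awk '{checksum=$1; shim=$4; $1=shim; $2=$3=""; $4=checksum; print $0}' | \
    tr -s ' ' > tmp-shim-sha-package-files.txt                # make shim column 1

sort -k3 -k2,2 -u tmp-shim-sha-package-files.txt | \
    uniq -f 2 -c > tmp-count-shim-sha-package-files.txt       # sort by path, sha1; first from each group

grep '^\s\+1 .*' tmp-count-shim-sha-package-files.txt | \
    awk '{$1=""; $3=""; print $0 }' | \
    tr -s ' ' | sed 's/\ /\//g' > "$SPARK3XX_COMMON_TXT"      # files with a unique sha1
```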
4 changes: 4 additions & 0 deletions dist/unshimmed-common-from-spark311.txt
@@ -2,6 +2,8 @@ META-INF/DEPENDENCIES
META-INF/LICENSE
META-INF/NOTICE
META-INF/maven/**
ai/rapids/**
amd64/Linux/**
com/nvidia/spark/ExclusiveModeGpuDiscoveryPlugin*
com/nvidia/spark/GpuCachedBatchSerializer*
com/nvidia/spark/ParquetCachedBatchSerializer*
@@ -25,9 +27,11 @@ com/nvidia/spark/rapids/SparkShimServiceProvider*
com/nvidia/spark/rapids/SparkShimVersion*
com/nvidia/spark/rapids/SparkShims*
com/nvidia/spark/udf/Plugin*
cudf-java-version-info.properties
libjucx.so
org/apache/spark/sql/rapids/ProxyRapidsShuffleInternalManagerBase*
org/apache/spark/sql/rapids/VisibleShuffleManager*
org/openucx/**
rapids/*.py
rapids4spark-version-info.properties
spark-rapids-jni-version-info.properties
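These additions list the bundled cudf/JNI classes (`ai/rapids/**`), the `amd64/Linux/**` native libraries, and the cudf and spark-rapids-jni version-info files among the entries copied unshimmed. A quick sanity check on a built dist jar, sketched with an illustrative jar name:

```shell
# Sketch: the jar name is illustrative; the entry names come from the list above.
unzip -l rapids-4-spark_2.12-22.06.0-SNAPSHOT-cuda11.jar | \
    grep -E '(spark-rapids-jni|cudf-java)-version-info\.properties|ai/rapids/'
```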
2 changes: 1 addition & 1 deletion docs/configs.md
@@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports.
On startup use: `--conf [conf key]=[conf value]`. For example:

```
${SPARK_HOME}/bin/spark --jars 'rapids-4-spark_2.12-22.06.0-SNAPSHOT.jar,cudf-22.06.0-SNAPSHOT-cuda11.jar' \
${SPARK_HOME}/bin/spark --jars rapids-4-spark_2.12-22.06.0-SNAPSHOT-cuda11.jar \
--conf spark.plugins=com.nvidia.spark.SQLPlugin \
--conf spark.rapids.sql.incompatibleOps.enabled=true
```
2 changes: 1 addition & 1 deletion docs/demo/Databricks/generate-init-script.ipynb
@@ -3,7 +3,7 @@
{
"cell_type":"code",
"source":[
"dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")\n \ndbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n#!/bin/bash\nsudo wget -O /databricks/jars/rapids-4-spark_2.12-22.04.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.04.0/rapids-4-spark_2.12-22.04.0.jar\nsudo wget -O /databricks/jars/cudf-22.04.0-cuda11.jar https://repo1.maven.org/maven2/ai/rapids/cudf/22.04.0/cudf-22.04.0-cuda11.jar\"\"\", True)"
"dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")\n \ndbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n#!/bin/bash\nsudo wget -O /databricks/jars/rapids-4-spark_2.12-22.06.0-cuda11.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0-cuda11.jar\n\"\"\", True)"
],
"metadata":{

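For readability, the init script embedded in the notebook JSON above unescapes to roughly the following shell script (same jar name and URL as in the cell):

```shell
#!/bin/bash
# Contents written to /databricks/init_scripts/init.sh by the notebook cell above
sudo wget -O /databricks/jars/rapids-4-spark_2.12-22.06.0-cuda11.jar \
    https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0-cuda11.jar
```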
3 changes: 1 addition & 2 deletions docs/get-started/Dockerfile.cuda
@@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -51,7 +51,6 @@ COPY spark/examples /opt/spark/examples
COPY spark/kubernetes/tests /opt/spark/tests
COPY spark/data /opt/spark/data

COPY cudf-*-cuda11.jar /opt/sparkRapidsPlugin
COPY rapids-4-spark_2.12-*.jar /opt/sparkRapidsPlugin
COPY getGpusResources.sh /opt/sparkRapidsPlugin

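With the cudf COPY removed, only the plugin jar has to sit next to the Dockerfile when the image is built. A minimal sketch, with the image name and tag as placeholders rather than values from this commit:

```shell
# Sketch: image name and tag are placeholders; only one plugin jar is copied into the image now.
docker build -t <repo>/spark-rapids:22.06.0-cuda11 -f Dockerfile.cuda .
```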
22 changes: 11 additions & 11 deletions docs/get-started/getting-started-kubernetes.md
@@ -41,9 +41,9 @@ On a client machine which has access to the Kubernetes cluster:
2. Download the [RAPIDS Accelerator for Spark jars](getting-started-on-prem.md#download-the-rapids-jars) and the
[GPU discovery script](getting-started-on-prem.md#install-the-gpu-discovery-script).

Put the 2 jars -- `rapids-4-spark_<version>.jar`, `cudf-<version>.jar` and `getGpusResources.sh` in the same directory as `spark`.
Put `rapids-4-spark_<version>.jar` and `getGpusResources.sh` in the same directory as `spark`.

Note: If here you decide to put above 2 jars in the `spark/jars` directory which will be copied into
Note: If you decide here to put the above jar in the `spark/jars` directory, which will be copied into
`/opt/spark/jars` directory in Docker image, then in the future you do not need to
specify `spark.driver.extraClassPath` or `spark.executor.extraClassPath` using `cluster` mode.
This example just shows you a way to put customized jars or 3rd party jars.
@@ -61,7 +61,7 @@ On a client machine which has access to the Kubernetes cluster:
Currently the directory in the local machine should look as below:
```shell
$ ls
Dockerfile.cuda cudf-<version>.jar getGpusResources.sh rapids-4-spark_<version>.jar spark
Dockerfile.cuda getGpusResources.sh rapids-4-spark_<version>.jar spark
```

4. Build the Docker image with a proper repository name and tag and push it to the repository
@@ -113,8 +113,8 @@ $SPARK_HOME/bin/spark-submit \
--conf spark.executor.resource.gpu.discoveryScript=/opt/sparkRapidsPlugin/getGpusResources.sh \
--conf spark.executor.resource.gpu.vendor=nvidia.com \
--conf spark.kubernetes.container.image=$IMAGE_NAME \
--conf spark.executor.extraClassPath=/opt/sparkRapidsPlugin/rapids-4-spark_<version>.jar:/opt/sparkRapidsPlugin/cudf-<version>.jar \
--conf spark.driver.extraClassPath=/opt/sparkRapidsPlugin/rapids-4-spark_<version>.jar:/opt/sparkRapidsPlugin/cudf-<version>.jar \
--conf spark.executor.extraClassPath=/opt/sparkRapidsPlugin/rapids-4-spark_<version>.jar \
--conf spark.driver.extraClassPath=/opt/sparkRapidsPlugin/rapids-4-spark_<version>.jar \
--driver-memory 2G \
local:///opt/spark/examples/jars/spark-examples_2.12-3.0.2.jar
```
@@ -177,8 +177,8 @@ $SPARK_HOME/bin/spark-shell \
--conf spark.executor.resource.gpu.discoveryScript=/opt/sparkRapidsPlugin/getGpusResources.sh \
--conf spark.executor.resource.gpu.vendor=nvidia.com \
--conf spark.kubernetes.container.image=$IMAGE_NAME \
--conf spark.executor.extraClassPath=/opt/sparkRapidsPlugin/rapids-4-spark_<version>.jar:/opt/sparkRapidsPlugin/cudf-<version>.jar \
--driver-class-path=./cudf-<version>.jar:./rapids-4-spark_<version>.jar \
--conf spark.executor.extraClassPath=/opt/sparkRapidsPlugin/rapids-4-spark_<version>.jar \
--driver-class-path=./rapids-4-spark_<version>.jar \
--driver-memory 2G
```

@@ -244,9 +244,9 @@ $SPARK_HOME/bin/spark-submit \
--conf spark.executor.resource.gpu.discoveryScript=/opt/sparkRapidsPlugin/getGpusResources.sh \
--conf spark.executor.resource.gpu.vendor=nvidia.com \
--conf spark.kubernetes.container.image=$IMAGE_NAME \
--conf spark.executor.extraClassPath=/opt/sparkRapidsPlugin/rapids-4-spark_<version>.jar:/opt/sparkRapidsPlugin/cudf-<version>.jar \
--conf spark.executor.extraClassPath=/opt/sparkRapidsPlugin/rapids-4-spark_<version>.jar \
--driver-memory 2G \
--driver-class-path=./cudf-<version>.jar:./rapids-4-spark_<version>.jar \
--driver-class-path=./rapids-4-spark_<version>.jar \
test.py
```

@@ -304,8 +304,8 @@ Using Spark Operator is another way to submit Spark Applications into a Kubernetes
"spark.plugins": "com.nvidia.spark.SQLPlugin"
"spark.executor.resource.gpu.discoveryScript": "/opt/sparkRapidsPlugin/getGpusResources.sh"
"spark.executor.resource.gpu.vendor": "nvidia.com"
"spark.executor.extraClassPath": "/opt/sparkRapidsPlugin/rapids-4-spark.jar:/opt/sparkRapidsPlugin/cudf.jar"
"spark.driver.extraClassPath": "/opt/sparkRapidsPlugin/rapids-4-spark.jar:/opt/sparkRapidsPlugin/cudf.jar"
"spark.executor.extraClassPath": "/opt/sparkRapidsPlugin/rapids-4-spark.jar"
"spark.driver.extraClassPath": "/opt/sparkRapidsPlugin/rapids-4-spark.jar"
type: Python
pythonVersion: 3
mode: cluster