Replaced spark3xx-common references to spark-shared [databricks] #11066

Merged: 9 commits, Jun 28, 2024

Changes from 8 commits

6 changes: 3 additions & 3 deletions build/coverage-report
@@ -1,7 +1,7 @@
#!/bin/bash

#
-# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -33,11 +33,11 @@ SOURCE_WITH_ARGS="--sourcefiles "$(echo $SOURCE_DIRS | sed -e 's/:/ --sourcefile
rm -rf "$TMP_CLASS"
mkdir -p "$TMP_CLASS"
pushd "$TMP_CLASS"
-jar xf "$DIST_JAR" com org rapids spark3xx-common "spark${SPK_VER}/"
+jar xf "$DIST_JAR" com org rapids spark-shared "spark${SPK_VER}/"
# extract the .class files in udf jar and replace the existing ones in spark3xx-ommon and spark$SPK_VER
# because the class files in udf jar will be modified in aggregator's shade phase
jar xf "$UDF_JAR" com/nvidia/spark/udf
-rm -rf com/nvidia/shaded/ org/openucx/ spark3xx-common/com/nvidia/spark/udf/ spark${SPK_VER}/com/nvidia/spark/udf/
+rm -rf com/nvidia/shaded/ org/openucx/ spark-shared/com/nvidia/spark/udf/ spark${SPK_VER}/com/nvidia/spark/udf/
popd

if [ ! -f "$JDEST" ]; then
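
For context, the extraction above now pulls a single `spark-shared` tree instead of `spark3xx-common`. A minimal sanity check of the extracted layout, run from inside `$TMP_CLASS` right after the `jar xf` step (a sketch; the exact shim list depends on the build):

```bash
# Both the shared tree and the version-specific tree should be present.
ls -d spark-shared "spark${SPK_VER}"

# The old directory name should no longer appear.
if [ -d spark3xx-common ]; then
  echo "stale spark3xx-common directory still present" >&2
fi
```
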
4 changes: 4 additions & 0 deletions delta-lake/delta-20x/pom.xml
@@ -39,6 +39,10 @@
</properties>

<dependencies>
+<dependency>
+<groupId>org.roaringbitmap</groupId>
+<artifactId>RoaringBitmap</artifactId>
+</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
4 changes: 4 additions & 0 deletions delta-lake/delta-21x/pom.xml
@@ -39,6 +39,10 @@
</properties>

<dependencies>
+<dependency>
+<groupId>org.roaringbitmap</groupId>
+<artifactId>RoaringBitmap</artifactId>
+</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
4 changes: 4 additions & 0 deletions delta-lake/delta-22x/pom.xml
@@ -39,6 +39,10 @@
</properties>

<dependencies>
+<dependency>
+<groupId>org.roaringbitmap</groupId>
+<artifactId>RoaringBitmap</artifactId>
+</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
4 changes: 4 additions & 0 deletions delta-lake/delta-23x/pom.xml
@@ -39,6 +39,10 @@
</properties>

<dependencies>
+<dependency>
+<groupId>org.roaringbitmap</groupId>
+<artifactId>RoaringBitmap</artifactId>
+</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
4 changes: 4 additions & 0 deletions delta-lake/delta-24x/pom.xml
@@ -39,6 +39,10 @@
</properties>

<dependencies>
+<dependency>
+<groupId>org.roaringbitmap</groupId>
+<artifactId>RoaringBitmap</artifactId>
+</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
4 changes: 4 additions & 0 deletions delta-lake/delta-spark330db/pom.xml
@@ -39,6 +39,10 @@
</properties>

<dependencies>
+<dependency>
+<groupId>org.roaringbitmap</groupId>
+<artifactId>RoaringBitmap</artifactId>
+</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
4 changes: 4 additions & 0 deletions delta-lake/delta-spark332db/pom.xml
@@ -39,6 +39,10 @@
</properties>

<dependencies>
+<dependency>
+<groupId>org.roaringbitmap</groupId>
+<artifactId>RoaringBitmap</artifactId>
+</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
4 changes: 4 additions & 0 deletions delta-lake/delta-spark341db/pom.xml
@@ -38,6 +38,10 @@
</properties>

<dependencies>
+<dependency>
+<groupId>org.roaringbitmap</groupId>
+<artifactId>RoaringBitmap</artifactId>
+</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
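
All eight delta-lake modules above gain the same four-line `RoaringBitmap` dependency (the matching direct dependency is removed from `sql-plugin/pom.xml` further down). A hedged way to confirm it resolves for one of them, using `delta-lake/delta-20x` as an example and assuming the version is supplied by `dependencyManagement` in a parent pom since none is declared here:

```bash
# Print only the RoaringBitmap node of the module's resolved dependency tree.
mvn -q -pl delta-lake/delta-20x -am dependency:tree \
    -Dincludes=org.roaringbitmap:RoaringBitmap
```
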
5 changes: 0 additions & 5 deletions dist/scripts/binary-dedupe.sh
@@ -127,11 +127,6 @@ done

mv "$SPARK_SHARED_DIR" parallel-world/

-# TODO further dedupe by FEATURE version lines:
-# spark30x-common
-# spark31x-common
-# spark32x-common

# Verify that all class files in the conventional jar location are bitwise
# identical regardless of the Spark-version-specific jar.
#
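
The removed TODO concerned further dedupe by feature-version lines; the mechanism it refers to only promotes a class into `spark-shared/` when its bytecode is bitwise-identical across all shims. A rough spot-check of that criterion for a single class, with illustrative paths and class name (the real directory layout is whatever the dist build produced before this script ran):

```bash
# Identical checksums across the per-shim copies mean the class is a dedupe
# candidate; any difference keeps it in the version-specific worlds.
sha1sum parallel-world/spark330/com/nvidia/spark/rapids/GpuExec.class \
        parallel-world/spark331/com/nvidia/spark/rapids/GpuExec.class
```
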
30 changes: 15 additions & 15 deletions docs/dev/shims.md
@@ -69,15 +69,15 @@ Spark 3.0.2's URLs:

```text
jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/
-jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark3xx-common/
+jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark-shared/
jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark302/
```

Spark 3.2.0's URLs :

```text
jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/
-jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark3xx-common/
+jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark-shared/
jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark320/
```

Expand Down Expand Up @@ -143,7 +143,7 @@ This has two pre-requisites:

1. The .class file with the bytecode is bitwise-identical among the currently
supported Spark versions. To verify this you can inspect the dist jar and check
-if the class file is under `spark3xx-common` jar entry. If this is not the case then
+if the class file is under `spark-shared` jar entry. If this is not the case then
code should be refactored until all discrepancies are shimmed away.
1. The transitive closure of the classes compile-time-referenced by `A` should
have the property above.
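
To check the first pre-requisite for a concrete class, it is usually enough to see where its `.class` file landed in the dist jar; the jar and class names below are examples reused from elsewhere in this document:

```bash
# A shared class appears once under spark-shared/; a shimmed class appears
# once per sparkXYZ/ parallel world instead.
jar tf rapids-4-spark_2.12-24.08.0.jar | grep 'com/nvidia/spark/rapids/GpuFilterExec'
```
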
@@ -181,28 +181,28 @@ mv org com ai public/
and you will see the dependencies of `public` classes. By design `public` classes
should have only edges only to other `public` classes in the dist jar.

-Execute `jdeps` against `public`, `spark3xx-common` and an *exactly one* parallel
+Execute `jdeps` against `public`, `spark-shared` and an *exactly one* parallel
world such as `spark330`

```bash
${JAVA_HOME}/bin/jdeps -v \
-dotoutput /tmp/jdeps330 \
-regex '(com|org)\..*\.rapids\..*' \
-public spark3xx-common spark330
+public spark-shared spark330
```

This will produce three DOT files for each "archive" with directed edges for
a class in the archive to a class either in this or another archive.

-Looking at an output file, e.g. `/tmp/jdeps330/spark3xx-common.dot`,
+Looking at an output file, e.g. `/tmp/jdeps330/spark-shared.dot`,
unfortunately you see that jdeps does not label the source class node but labels
the target class node of an edge. Thus the graph is incorrect as it breaks paths
if a node has both incoming and outgoing edges.

```bash
-$ grep 'com.nvidia.spark.rapids.GpuFilterExec\$' spark3xx-common.dot
+$ grep 'com.nvidia.spark.rapids.GpuFilterExec\$' spark-shared.dot
"com.nvidia.spark.rapids.GpuFilterExec$" -> "com.nvidia.spark.rapids.GpuFilterExec (spark330)";
-"com.nvidia.spark.rapids.GpuOverrides$$anon$204" -> "com.nvidia.spark.rapids.GpuFilterExec$ (spark3xx-common)";
+"com.nvidia.spark.rapids.GpuOverrides$$anon$204" -> "com.nvidia.spark.rapids.GpuFilterExec$ (spark-shared)";
```

So first create and `cd` to some other directory `/tmp/jdep330.processed` to massage
@@ -214,8 +214,8 @@ that the source nodes are guaranteed to be from the `<archive>`.
```bash
sed 's/"\([^(]*\)"\(\s*->.*;\)/"\1 (public)"\2/' \
/tmp/jdeps330/public.dot > public.dot
-sed 's/"\([^(]*\)"\(\s*->.*;\)/"\1 (spark3xx-common)"\2/' \
-/tmp/jdeps330/spark3xx-common.dot > spark3xx-common.dot
+sed 's/"\([^(]*\)"\(\s*->.*;\)/"\1 (spark-shared)"\2/' \
+/tmp/jdeps330/spark-shared.dot > spark-shared.dot
sed 's/"\([^(]*\)"\(\s*->.*;\)/"\1 (spark330)"\2/' \
/tmp/jdeps330/spark330.dot > spark330.dot
```
@@ -224,7 +224,7 @@ Next you need to union edges of all three graphs into a single graph to be able
to analyze cross-archive paths.

```bash
-cat public.dot spark3xx-common.dot spark330.dot | \
+cat public.dot spark-shared.dot spark330.dot | \
tr '\n' '\r' | \
sed 's/}\rdigraph "[^"]*" {\r//g' | \
tr '\r' '\n' > merged.dot
@@ -245,7 +245,7 @@ GpuTypeColumnVector needs refactoring prior externalization as of the time
of this writing:

```bash
-$ dijkstra -d -p "com.nvidia.spark.rapids.GpuColumnVector (spark3xx-common)" merged.dot | \
+$ dijkstra -d -p "com.nvidia.spark.rapids.GpuColumnVector (spark-shared)" merged.dot | \
grep '\[dist=' | grep '(spark330)'
"org.apache.spark.sql.rapids.GpuFileSourceScanExec (spark330)" [dist=5.000,
"com.nvidia.spark.rapids.GpuExec (spark330)" [dist=3.000,
@@ -255,9 +255,9 @@ $ dijkstra -d -p "com.nvidia.spark.rapids.GpuColumnVector (spark3xx-common)" mer
RegexReplace could be externalized safely:

```bash
-$ dijkstra -d -p "org.apache.spark.sql.rapids.RegexReplace (spark3xx-common)" merged.dot | grep '\[dist='
-"org.apache.spark.sql.rapids.RegexReplace (spark3xx-common)" [dist=0.000];
-"org.apache.spark.sql.rapids.RegexReplace$ (spark3xx-common)" [dist=1.000,
+$ dijkstra -d -p "org.apache.spark.sql.rapids.RegexReplace (spark-shared)" merged.dot | grep '\[dist='
+"org.apache.spark.sql.rapids.RegexReplace (spark-shared)" [dist=0.000];
+"org.apache.spark.sql.rapids.RegexReplace$ (spark-shared)" [dist=1.000,
```

because it is self-contained.
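
Beyond tracing one class at a time with `dijkstra`, a rough bulk scan of the same merged graph can list every direct edge that still leads from a shared class into the version-specific world (the regex is a sketch against the node labels produced by the `sed` massaging above):

```bash
# Any match is a spark-shared class that depends directly on a spark330 class
# and therefore cannot be externalized yet.
grep -E '"[^"]+ \(spark-shared\)" -> "[^"]+ \(spark330\)"' merged.dot
```
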
6 changes: 3 additions & 3 deletions jenkins/spark-premerge-build.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -88,11 +88,11 @@ mvn_verify() {
FILE=$(ls dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)
UDF_JAR=$(ls ./udf-compiler/target/spark${SPK_VER}/rapids-4-spark-udf_2.12-*-spark${SPK_VER}.jar | grep -v test | xargs readlink -f)
pushd target/jacoco_classes/
-jar xf $FILE com org rapids spark3xx-common "spark${JACOCO_SPARK_VER:-311}/"
+jar xf $FILE com org rapids spark-shared "spark${JACOCO_SPARK_VER:-311}/"
# extract the .class files in udf jar and replace the existing ones in spark3xx-ommon and spark$SPK_VER
# because the class files in udf jar will be modified in aggregator's shade phase
jar xf "$UDF_JAR" com/nvidia/spark/udf
-rm -rf com/nvidia/shaded/ org/openucx/ spark3xx-common/com/nvidia/spark/udf/ spark${SPK_VER}/com/nvidia/spark/udf/
+rm -rf com/nvidia/shaded/ org/openucx/ spark-shared/com/nvidia/spark/udf/ spark${SPK_VER}/com/nvidia/spark/udf/
popd

# Triggering here until we change the jenkins file
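
Because the rename touches several scripts and docs, a quick repo-wide sweep for stragglers is also worth running (a sketch; the pathspec is just an example):

```bash
# List any remaining references to the old directory name in the working tree.
git grep -n 'spark3xx-common' -- build dist docs jenkins scala2.13 sql-plugin \
  || echo "no remaining spark3xx-common references"
```
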
4 changes: 4 additions & 0 deletions scala2.13/delta-lake/delta-spark341db/pom.xml
@@ -38,6 +38,10 @@
</properties>

<dependencies>
+<dependency>
+<groupId>org.roaringbitmap</groupId>
+<artifactId>RoaringBitmap</artifactId>
+</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
4 changes: 0 additions & 4 deletions scala2.13/sql-plugin/pom.xml
@@ -97,10 +97,6 @@
<groupId>org.alluxio</groupId>
<artifactId>alluxio-shaded-client</artifactId>
</dependency>
-<dependency>
-<groupId>org.roaringbitmap</groupId>
-<artifactId>RoaringBitmap</artifactId>
-</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
4 changes: 0 additions & 4 deletions sql-plugin/pom.xml
@@ -97,10 +97,6 @@
<groupId>org.alluxio</groupId>
<artifactId>alluxio-shaded-client</artifactId>
</dependency>
-<dependency>
-<groupId>org.roaringbitmap</groupId>
-<artifactId>RoaringBitmap</artifactId>
-</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
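
And the flip side of the delta-lake additions earlier in this diff: `sql-plugin` no longer declares `RoaringBitmap` directly. Whether it still shows up transitively (for example via Spark itself) is an assumption worth verifying:

```bash
# Print any RoaringBitmap nodes left in sql-plugin's resolved dependency tree.
mvn -q -pl sql-plugin dependency:tree -Dincludes=org.roaringbitmap:RoaringBitmap
```
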