Merge remote-tracking branch 'refs/remotes/origin-github/branch-22.02' into parq_dec128
kuhushukla committed Dec 15, 2021
2 parents 721820e + 3c8e5df commit 1c010cf
Showing 110 changed files with 5,559 additions and 2,793 deletions.
280 changes: 278 additions & 2 deletions CHANGELOG.md

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions aggregator/pom.xml
@@ -100,9 +100,25 @@
<pattern>org.apache.hadoop.hive.</pattern>
<shadedPattern>${rapids.shade.package}.hadoop.hive.</shadedPattern>
<excludes>
<!--
Class exclusions for Hive UDFs, to avoid the ClassNotFoundException,
For example:
E Caused by: java.lang.ClassNotFoundException: com.nvidia.shaded.spark.hadoop.hive.serde2.objectinspector.ObjectInspector
E at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
-->
<exclude>org.apache.hadoop.hive.conf.HiveConf</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.FunctionRegistry</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.UDF</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.UDFMethodResolver</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.UDFType</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF$DeferredObject</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils$ConversionHelper</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory$ObjectInspectorOptions</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.StructField</exclude>
<exclude>org.apache.hadoop.hive.serde2.typeinfo.TypeInfo</exclude>
</excludes>
</relocation>
<relocation>
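
For context on the exclusions above: user Hive UDF jars are compiled against the unshaded `org.apache.hadoop.hive.*` interfaces, so relocating these classes would break class resolution when a UDF is loaded, producing the `ClassNotFoundException` shown in the comment. A minimal, hypothetical UDF illustrating the dependency (not part of this change):

```scala
import org.apache.hadoop.hive.ql.exec.UDF

// Hypothetical Hive UDF, for illustration only. It links against the real
// org.apache.hadoop.hive.ql.exec.UDF class, which is why that class (and the other
// interfaces excluded above) must not be relocated into the shaded plugin package.
class ToUpper extends UDF {
  def evaluate(s: String): String = if (s == null) null else s.toUpperCase
}
```
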
11 changes: 8 additions & 3 deletions build/coverage-report
@@ -23,16 +23,21 @@ TMP_CLASS=${TEMP_CLASS_LOC:-"./target/jacoco_classes/"}
HTML_LOC=${HTML_LOCATION:="./target/jacoco-report/"}
XML_LOC=${XML_LOCATION:="${HTML_LOC}"}
DIST_JAR=${RAPIDS_DIST_JAR:-$(ls ./dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)}
SOURCE_DIRS=${SOURCE_DIRS:-"./sql-plugin/src/main/scala/:./sql-plugin/src/main/java/:./shuffle-plugin/src/main/scala/"}
SPK_VER=${JACOCO_SPARK_VER:-"301"}
UDF_JAR=${RAPIDS_UDF_JAR:-$(ls ./udf-compiler/target/spark${SPK_VER}/rapids-4-spark-udf_2.12-*-SNAPSHOT-spark${SPK_VER}.jar | grep -v test | xargs readlink -f)}
SOURCE_DIRS=${SOURCE_DIRS:-"./sql-plugin/src/main/scala/:./sql-plugin/src/main/java/:./shuffle-plugin/src/main/scala/:./udf-compiler/src/main/scala/"}

SOURCE_WITH_ARGS="--sourcefiles "$(echo $SOURCE_DIRS | sed -e 's/:/ --sourcefiles /g')

# Clean up the classes so we can build the report cleanly
rm -rf "$TMP_CLASS"
mkdir -p "$TMP_CLASS"
pushd "$TMP_CLASS"
jar xf "$DIST_JAR"
rm -rf com/nvidia/shaded/ org/openucx/
jar xf "$DIST_JAR" com org rapids spark3xx-common "spark${SPK_VER}/"
# Extract the .class files from the UDF jar and replace the existing ones in spark3xx-common and spark${SPK_VER},
# because the class files in the UDF jar are modified during the aggregator's shade phase.
jar xf "$UDF_JAR" com/nvidia/spark/udf
rm -rf com/nvidia/shaded/ org/openucx/ spark3xx-common/com/nvidia/spark/udf/ spark${SPK_VER}/com/nvidia/spark/udf/
popd

if [ ! -f "$JDEST" ]; then
34 changes: 33 additions & 1 deletion dist/maven-antrun/build-parallel-worlds.xml
@@ -121,7 +121,39 @@
<concat destfile="${shimServiceFile}">
<fileset dir="${project.build.directory}/parallel-world" includes="spark*/${shimServiceRsrc}"/>
</concat>

<!-- check shim revisions -->
<exec executable="${project.basedir}/scripts/check-shims-revisions.sh" failonerror="true"
dir="${project.build.directory}">
<arg value="${included_buildvers}"/>
</exec>

<exec executable="${project.basedir}/scripts/binary-dedupe.sh" failonerror="true"
dir="${project.build.directory}"/>


<echo level="info">Generating dependency-reduced-pom.xml</echo>
<resources id="aggregatorDependencyRegexWithoutWhitespace">
<string>&lt;dependency&gt;</string>
<string>&lt;groupId&gt;com.nvidia&lt;/groupId&gt;</string>
<string>&lt;artifactId&gt;rapids-4-spark-aggregator_\S+?&lt;/artifactId&gt;</string>
<string>&lt;version&gt;\S+?&lt;/version&gt;</string>
<string>&lt;classifier&gt;\S+?&lt;/classifier&gt;</string>
<string>&lt;scope&gt;\S+?&lt;/scope&gt;</string>
<string>&lt;/dependency&gt;</string>
</resources>
<pathconvert property="aggregatorDependencyRegex" refid="aggregatorDependencyRegexWithoutWhitespace" pathsep="\s+?"/>

<echo level="info">Generated regex to remove aggregator dependencies:
${aggregatorDependencyRegex}
</echo>
<copy file="${project.basedir}/pom.xml"
tofile="${project.build.directory}/extra-resources/META-INF/maven/${project.groupId}/${project.artifactId}/pom.xml"
overwrite="true">
<filterchain>
<replaceregex flags="gs" byline="false" replace=""
pattern="${aggregatorDependencyRegex}"/>
</filterchain>
</copy>
</target>
</project>
</project>
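
For reference, a rough Scala sketch of what the generated regex does; the actual build uses the Ant `replaceregex` filter above, and `pomText` is only a hypothetical stand-in for the copied `dist/pom.xml` contents:

```scala
// Hypothetical stand-in for the pom that the Ant target copies into extra-resources.
val pomText: String = scala.io.Source.fromFile("dist/pom.xml").mkString

// Join the XML fragments with a lazy whitespace matcher, mirroring pathconvert's pathsep="\s+?".
val parts = Seq(
  "<dependency>",
  "<groupId>com.nvidia</groupId>",
  "<artifactId>rapids-4-spark-aggregator_\\S+?</artifactId>",
  "<version>\\S+?</version>",
  "<classifier>\\S+?</classifier>",
  "<scope>\\S+?</scope>",
  "</dependency>")
val aggregatorDependencyRegex = parts.mkString("\\s+?")

// Strip every aggregator <dependency> block, as the replaceregex task does with flags="gs".
val reducedPom = pomText.replaceAll(aggregatorDependencyRegex, "")
```
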
23 changes: 18 additions & 5 deletions dist/pom.xml
@@ -34,11 +34,7 @@
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
<!--
provided such that the 3rd party project depending on this will drop it
https://maven.apache.org/guides/introduction/introduction-to-dependency-mechanism.html#Dependency_Scope
-->
<scope>provided</scope>
<scope>compile</scope>
</dependency>

<!--
@@ -248,6 +244,10 @@
</goals>
<configuration>
<classesDirectory>${project.build.directory}/parallel-world</classesDirectory>
<excludes>
<!-- get rid of all maven poms from shim builds -->
<exclude>META-INF/maven/**</exclude>
</excludes>
</configuration>
</execution>
<execution>
@@ -299,6 +299,19 @@
</target>
</configuration>
</execution>
<execution>
<phase>verify</phase>
<goals>
<goal>run</goal>
</goals>
<id>reduce-pom-deps-in-the-jar</id>
<configuration>
<target>
<zip update="true" basedir="${project.build.directory}/extra-resources"
destfile="${project.build.directory}/${project.artifactId}-${project.version}.jar"/>
</target>
</configuration>
</execution>
<execution>
<id>update_config_docs</id>
<phase>verify</phase>
52 changes: 52 additions & 0 deletions dist/scripts/check-shims-revisions.sh
@@ -0,0 +1,52 @@
#!/bin/bash

#
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -e
#
# check the revisions of each shim; they should all be equal
# $1 is included_buildvers
function check-shims-revisions() {
  included_buildvers="$1"
  # PWD should be spark-rapids root path
  parallel_dir=${PWD}/parallel-world
  pre_revision=""
  pre_shim_version_path=""

  IFS=","
  for shim in ${included_buildvers}; do
    # trim
    shim=$(echo "${shim}" | xargs)
    shim_version_path="${parallel_dir}/spark${shim}/rapids4spark-version-info.properties"
    if [[ -f "$shim_version_path" ]] ; then
      curr_revision=$(grep "revision=" "${shim_version_path}" | cut -d'=' -f2)
      if [ -n "$pre_revision" ] && [[ "$curr_revision" != "$pre_revision" ]] ; then
        echo
        echo "Check Failed: git revisions between shims are not equal"
        echo "Please check the revisions of each shim to see which one is inconsistent. Note, if building with Databricks those jars are built separately."
        exit 1
      fi
      pre_revision="${curr_revision}"
      pre_shim_version_path="${shim_version_path}"
    else
      echo "Error: version file missing: ${shim_version_path}"
      exit 1
    fi
  done
}

check-shims-revisions "$1"
96 changes: 33 additions & 63 deletions docs/compatibility.md
@@ -109,63 +109,31 @@ a few operations that we cannot support to the same degree as Spark can on the C

### Decimal Sum Aggregation

When Apache Spark does a sum aggregation on decimal values it will store the result in a value
with a precision that is the input precision + 10, but with a maximum precision of 38. The table
below shows the number of rows/values in an aggregation before an overflow is possible,
and the number of rows/values in the aggregation before an overflow might not be detected.
The numbers are for Spark 3.1.0 and above after a number of fixes were put in place, please see
A number of fixes for overflow detection went into Spark 3.1.0. Please see
[SPARK-28067](https://issues.apache.org/jira/browse/SPARK-28067) and
[SPARK-32018](https://issues.apache.org/jira/browse/SPARK-32018) for more information.
Please also note that these are for the worst case situations, meaning all the values in the sum
were either the largest or smallest values possible to be stored in the input type. In the common
case, where the numbers are smaller, or vary between positive and negative values, many more
rows/values can be processed without any issues.

|Input Precision|Number of values before overflow is possible|Maximum number of values for guaranteed overflow detection (Spark CPU)|Maximum number of values for guaranteed overflow detection (RAPIDS GPU)|
|---------------|------------------------------|------------|-------------|
|1 |11,111,111,111 |2,049,638,219,301,061,290 |Same as CPU |
|2 |10,101,010,101 |186,330,738,118,278,299 |Same as CPU |
|3 |10,010,010,010 |18,465,199,272,982,534 |Same as CPU |
|4 |10,001,000,100 |1,844,848,892,260,181 |Same as CPU |
|5 |10,000,100,001 |184,459,285,329,948 |Same as CPU |
|6 |10,000,010,000 |18,436,762,510,472 |Same as CPU |
|7 |10,000,001,000 |1,834,674,590,838 |Same as CPU |
|8 |10,000,000,100 |174,467,442,481 |Same as CPU |
|9 |10,000,000,010 |Unlimited |Unlimited |
|10 - 19 |10,000,000,000 |Unlimited |Unlimited |
|20 |10,000,000,000 |Unlimited |3,402,823,659,209,384,634 |
|21 |10,000,000,000 |Unlimited |340,282,356,920,938,463 |
|22 |10,000,000,000 |Unlimited |34,028,226,692,093,846 |
|23 |10,000,000,000 |Unlimited |3,402,813,669,209,384 |
|24 |10,000,000,000 |Unlimited |340,272,366,920,938 |
|25 |10,000,000,000 |Unlimited |34,018,236,692,093 |
|26 |10,000,000,000 |Unlimited |3,392,823,669,209 |
|27 |10,000,000,000 |Unlimited |330,282,366,920 |
|28 |10,000,000,000 |Unlimited |24,028,236,692 |
|29 |1,000,000,000 |Unlimited |Falls back to CPU |
|30 |100,000,000 |Unlimited |Falls back to CPU |
|31 |10,000,000 |Unlimited |Falls back to CPU |
|32 |1,000,000 |Unlimited |Falls back to CPU |
|33 |100,000 |Unlimited |Falls back to CPU |
|34 |10,000 |Unlimited |Falls back to CPU |
|35 |1,000 |Unlimited |Falls back to CPU |
|36 |100 |Unlimited |Falls back to CPU |
|37 |10 |Unlimited |Falls back to CPU |
|38 |1 |Unlimited |Falls back to CPU |

For an input precision of 9 and above, Spark will do the aggregations as a `BigDecimal`
value which is slow, but guarantees that any overflow can be detected. For inputs with a
precision of 8 or below Spark will internally do the calculations as a long value, 64-bits.
When the precision is 8, you would need at least 174-billion values/rows contributing to a
single aggregation result, and even then all the values would need to be either the largest
or the smallest value possible to be stored in the type before the overflow is no longer detected.

For the RAPIDS Accelerator we only have access to at most a 128-bit value to store the results
in and still detect overflow. Because of this we cannot guarantee overflow detection in all
cases. In some cases we can guarantee unlimited overflow detection because of the maximum number of
values that RAPIDS will aggregate in a single batch. But even in the worst case for a decimal value
with a precision of 28 the user would still have to aggregate so many values that it overflows 2.4
times over before we are no longer able to detect it.
[SPARK-32018](https://issues.apache.org/jira/browse/SPARK-32018) for more detailed information.
We were able to back-port some of these fixes, but others require Spark 3.1.0 or above to
detect overflow in all cases. As such, on versions of Spark older than 3.1.0 there is the
possibility of data corruption for large decimal values in some corner cases.
This is true for both the CPU and GPU implementations, but there are fewer of these cases for the
GPU. If this concerns you, you should upgrade to Spark 3.1.0 or above.

When Apache Spark does a sum aggregation on decimal values it will store the result in a value
with a precision that is the input precision + 10, but with a maximum precision of 38.
For an input precision of 9 and above, Spark will do the aggregations as a Java `BigDecimal`
value which is slow, but guarantees that any overflow can be detected because it can work with
effectively unlimited precision. For inputs with a precision of 8 or below Spark will internally do
the calculations as a long value, 64-bits. When the precision is 8, you would need at least
174,467,442,482 values/rows contributing to a single aggregation result before the overflow is no
longer detected. Even then all the values would need to be either the largest or the smallest value
possible to be stored in the type for the overflow to cause data corruption.
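
As a quick, hedged illustration of the result-type rule (a minimal local sketch; the session, app name, and column names are made up):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, sum}
import org.apache.spark.sql.types.DecimalType

val spark = SparkSession.builder().appName("decimal-sum-precision").master("local[*]").getOrCreate()

val df = spark.range(10).select(
  col("id").cast(DecimalType(8, 2)).as("d8"),    // precision 8: Spark sums this as a 64-bit long
  col("id").cast(DecimalType(32, 2)).as("d32"))  // precision 32: Spark sums this as a BigDecimal

// sum(decimal(8,2))  produces decimal(18,2): input precision + 10
// sum(decimal(32,2)) produces decimal(38,2): capped at the maximum precision of 38
df.agg(sum(col("d8")), sum(col("d32"))).printSchema()
```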

For the RAPIDS Accelerator we don't have direct access to unlimited precision for our calculations
like the CPU does. For input values with a precision of 8 and below we follow Spark and process the
data the same way, as a 64-bit value. For larger values we will do extra calculations looking at the
higher order digits to be able to detect overflow in all cases. But because of this you may see
some performance differences depending on the input precision used. The differences will show up
when going from an input precision of 8 to 9 and again when going from an input precision of 28 to 29.

### Decimal Average

@@ -175,8 +143,7 @@ have. It also inherits some issues from Spark itself. See
https://issues.apache.org/jira/browse/SPARK-37024 for a detailed description of some issues
with average in Spark.

In order to be able to guarantee overflow detection on the sum with at least 100-billion values
and to be able to guarantee doing the divide with half up rounding at the end we only support
In order to be able to guarantee doing the divide with half up rounding at the end we only support
average on input values with a precision of 23 or below. This is 38 - 10 for the sum guarantees
and then 5 less to be able to shift the left-hand side of the divide enough to get a correct
answer that can be rounded to the result that Spark would produce.
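
As a hedged sketch of that limit (reusing the `spark` session from the earlier sketch and assuming the RAPIDS Accelerator is installed and enabled; the data is made up), the difference shows up in the physical plans:

```scala
import org.apache.spark.sql.functions.{avg, col}
import org.apache.spark.sql.types.DecimalType

// 38 (max precision) - 10 (sum headroom) - 5 (shift for half-up rounding) = 23
val values = spark.range(100).selectExpr("cast(id as decimal(20, 2)) as v")

val onGpu    = values.select(col("v").cast(DecimalType(23, 2)).as("d")).agg(avg(col("d")))
val fallback = values.select(col("v").cast(DecimalType(24, 2)).as("d")).agg(avg(col("d")))

onGpu.explain()     // precision 23: the average can stay on the GPU
fallback.explain()  // precision 24: exceeds the supported precision, so it falls back to the CPU
```
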
@@ -479,6 +446,7 @@ The following Apache Spark regular expression functions and expressions are supp

- `RLIKE`
- `regexp`
- `regexp_extract`
- `regexp_like`
- `regexp_replace`

@@ -490,6 +458,7 @@ These operations can be enabled on the GPU with the following configuration setting

- `spark.rapids.sql.expression.RLike=true` (for `RLIKE`, `regexp`, and `regexp_like`)
- `spark.rapids.sql.expression.RegExpReplace=true` for `regexp_replace`
- `spark.rapids.sql.expression.RegExpExtract=true` for `regexp_extract`
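
As a hedged example of applying these settings (a sketch only; it assumes the RAPIDS Accelerator jar is already on the classpath with the plugin enabled, and the data is made up):

```scala
import org.apache.spark.sql.SparkSession

// Enable the regular-expression expressions listed above at session start.
val spark = SparkSession.builder()
  .config("spark.rapids.sql.expression.RLike", "true")
  .config("spark.rapids.sql.expression.RegExpReplace", "true")
  .config("spark.rapids.sql.expression.RegExpExtract", "true")
  .getOrCreate()

// With the settings on, simple patterns such as this character class can run on the GPU.
spark.range(5).selectExpr("regexp_replace(cast(id as string), '[0-4]', 'x')").show()
```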

Even when these expressions are enabled, there are instances where regular expression operations will fall back to
CPU when the RAPIDS Accelerator determines that a pattern is either unsupported or would produce incorrect results on the GPU.
@@ -504,12 +473,12 @@ Here are some examples of regular expression patterns that are not supported on
- Empty groups: `()`
- Regular expressions containing null characters (unless the pattern is a simple literal string)
- Beginning-of-line and end-of-line anchors (`^` and `$`) are not supported in some contexts, such as when combined
- with a choice (`^|a`) or when used anywhere in `regexp_replace` patterns.
with a choice (`^|a`) or when used anywhere in `regexp_replace` patterns.

In addition to these cases that can be detected, there is also one known issue that can cause incorrect results:
In addition to these cases that can be detected, there are also known issues that can cause incorrect results:

- `$` does not match the end of a string if the string ends with a line-terminator
([cuDF issue #9620](https://github.com/rapidsai/cudf/issues/9620))
- Character classes for negative matches have different behavior between CPU and GPU for multiline
strings. The pattern `[^a]` will match line-terminators on CPU but not on GPU.
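
A small, hedged illustration of the first issue using `java.util.regex`, which the Spark CPU implementation builds on:

```scala
import java.util.regex.Pattern

// On the CPU, `$` matches just before a trailing line terminator, so this finds a match.
// Per the cuDF issue referenced above, the GPU implementation does not match in this case.
val matched = Pattern.compile("foo$").matcher("foo\n").find()
println(matched)  // true
```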

Work is ongoing to increase the range of regular expressions that can run on the GPU.

@@ -725,7 +694,8 @@ This configuration setting is ignored when using Spark versions prior to 3.1.0.
### Float to String

The GPU will use different precision than Java's toString method when converting floating-point data
types to strings and this can produce results that differ from the default behavior in Spark.
types to strings. The GPU uses a lowercase `e` prefix for an exponent while Spark uses uppercase
`E`. As a result the computed string can differ from the default behavior in Spark.

To enable this operation on the GPU, set
[`spark.rapids.sql.castFloatToString.enabled`](configs.md#sql.castFloatToString.enabled) to `true`.
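
A hedged sketch of the setting in use (the value is illustrative, and the exact rendered strings depend on the Spark version and plugin build):

```scala
// Allow casting floating-point values to strings on the GPU.
spark.conf.set("spark.rapids.sql.castFloatToString.enabled", "true")

// Java's Float.toString renders 0.0001f as "1.0E-4" (uppercase E) on the CPU,
// while the GPU may render a lowercase exponent such as "1.0e-4".
spark.range(1).selectExpr("cast(cast(0.0001 as float) as string) as s").show(false)
```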