Merge remote-tracking branch 'refs/remotes/origin-github/branch-22.02' into parq_dec128
kuhushukla committed Dec 15, 2021
2 parents 721820e + 3c8e5df commit 1c010cf
Showing 110 changed files with 5,559 additions and 2,793 deletions.
280 changes: 278 additions & 2 deletions CHANGELOG.md

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions aggregator/pom.xml
@@ -100,9 +100,25 @@
<pattern>org.apache.hadoop.hive.</pattern>
<shadedPattern>${rapids.shade.package}.hadoop.hive.</shadedPattern>
<excludes>
<!--
Class exclusions for Hive UDFs, to avoid the ClassNotFoundException,
For example:
E Caused by: java.lang.ClassNotFoundException: com.nvidia.shaded.spark.hadoop.hive.serde2.objectinspector.ObjectInspector
E at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
-->
<exclude>org.apache.hadoop.hive.conf.HiveConf</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.FunctionRegistry</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.UDF</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.UDFMethodResolver</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.UDFType</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF$DeferredObject</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils$ConversionHelper</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory$ObjectInspectorOptions</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.StructField</exclude>
<exclude>org.apache.hadoop.hive.serde2.typeinfo.TypeInfo</exclude>
</excludes>
</relocation>
<relocation>
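
For context on the exclusions above: user Hive UDF jars are compiled against the unshaded `org.apache.hadoop.hive.*` interfaces, so relocating these classes would break class resolution when a UDF is loaded, producing the `ClassNotFoundException` shown in the comment. A minimal, hypothetical UDF illustrating the dependency (not part of this change):

```scala
import org.apache.hadoop.hive.ql.exec.UDF

// Hypothetical Hive UDF, for illustration only. It links against the real
// org.apache.hadoop.hive.ql.exec.UDF class, which is why that class (and the other
// interfaces excluded above) must not be relocated into the shaded plugin package.
class ToUpper extends UDF {
  def evaluate(s: String): String = if (s == null) null else s.toUpperCase
}
```
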
11 changes: 8 additions & 3 deletions build/coverage-report
@@ -23,16 +23,21 @@ TMP_CLASS=${TEMP_CLASS_LOC:-"./target/jacoco_classes/"}
HTML_LOC=${HTML_LOCATION:="./target/jacoco-report/"}
XML_LOC=${XML_LOCATION:="${HTML_LOC}"}
DIST_JAR=${RAPIDS_DIST_JAR:-$(ls ./dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)}
SOURCE_DIRS=${SOURCE_DIRS:-"./sql-plugin/src/main/scala/:./sql-plugin/src/main/java/:./shuffle-plugin/src/main/scala/"}
SPK_VER=${JACOCO_SPARK_VER:-"301"}
UDF_JAR=${RAPIDS_UDF_JAR:-$(ls ./udf-compiler/target/spark${SPK_VER}/rapids-4-spark-udf_2.12-*-SNAPSHOT-spark${SPK_VER}.jar | grep -v test | xargs readlink -f)}
SOURCE_DIRS=${SOURCE_DIRS:-"./sql-plugin/src/main/scala/:./sql-plugin/src/main/java/:./shuffle-plugin/src/main/scala/:./udf-compiler/src/main/scala/"}

SOURCE_WITH_ARGS="--sourcefiles "$(echo $SOURCE_DIRS | sed -e 's/:/ --sourcefiles /g')

# Clean up the classes so we can build the report cleanly
rm -rf "$TMP_CLASS"
mkdir -p "$TMP_CLASS"
pushd "$TMP_CLASS"
jar xf "$DIST_JAR"
rm -rf com/nvidia/shaded/ org/openucx/
jar xf "$DIST_JAR" com org rapids spark3xx-common "spark${SPK_VER}/"
# Extract the .class files from the UDF jar and replace the existing ones in spark3xx-common and spark${SPK_VER},
# because the class files in the UDF jar are modified during the aggregator's shade phase.
jar xf "$UDF_JAR" com/nvidia/spark/udf
rm -rf com/nvidia/shaded/ org/openucx/ spark3xx-common/com/nvidia/spark/udf/ spark${SPK_VER}/com/nvidia/spark/udf/
popd

if [ ! -f "$JDEST" ]; then
34 changes: 33 additions & 1 deletion dist/maven-antrun/build-parallel-worlds.xml
@@ -121,7 +121,39 @@
<concat destfile="${shimServiceFile}">
<fileset dir="${project.build.directory}/parallel-world" includes="spark*/${shimServiceRsrc}"/>
</concat>

<!-- check shim revisions -->
<exec executable="${project.basedir}/scripts/check-shims-revisions.sh" failonerror="true"
dir="${project.build.directory}">
<arg value="${included_buildvers}"/>
</exec>

<exec executable="${project.basedir}/scripts/binary-dedupe.sh" failonerror="true"
dir="${project.build.directory}"/>


<echo level="info">Generating dependency-reduced-pom.xml</echo>
<resources id="aggregatorDependencyRegexWithoutWhitespace">
<string>&lt;dependency&gt;</string>
<string>&lt;groupId&gt;com.nvidia&lt;/groupId&gt;</string>
<string>&lt;artifactId&gt;rapids-4-spark-aggregator_\S+?&lt;/artifactId&gt;</string>
<string>&lt;version&gt;\S+?&lt;/version&gt;</string>
<string>&lt;classifier&gt;\S+?&lt;/classifier&gt;</string>
<string>&lt;scope&gt;\S+?&lt;/scope&gt;</string>
<string>&lt;/dependency&gt;</string>
</resources>
<pathconvert property="aggregatorDependencyRegex" refid="aggregatorDependencyRegexWithoutWhitespace" pathsep="\s+?"/>

<echo level="info">Generated regex to remove aggregator dependencies:
${aggregatorDependencyRegex}
</echo>
<copy file="${project.basedir}/pom.xml"
tofile="${project.build.directory}/extra-resources/META-INF/maven/${project.groupId}/${project.artifactId}/pom.xml"
overwrite="true">
<filterchain>
<replaceregex flags="gs" byline="false" replace=""
pattern="${aggregatorDependencyRegex}"/>
</filterchain>
</copy>
</target>
</project>
</project>
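
For reference, a rough Scala sketch of what the generated regex does; the actual build uses the Ant `replaceregex` filter above, and `pomText` is only a hypothetical stand-in for the copied `dist/pom.xml` contents:

```scala
// Hypothetical stand-in for the pom that the Ant target copies into extra-resources.
val pomText: String = scala.io.Source.fromFile("dist/pom.xml").mkString

// Join the XML fragments with a lazy whitespace matcher, mirroring pathconvert's pathsep="\s+?".
val parts = Seq(
  "<dependency>",
  "<groupId>com.nvidia</groupId>",
  "<artifactId>rapids-4-spark-aggregator_\\S+?</artifactId>",
  "<version>\\S+?</version>",
  "<classifier>\\S+?</classifier>",
  "<scope>\\S+?</scope>",
  "</dependency>")
val aggregatorDependencyRegex = parts.mkString("\\s+?")

// Strip every aggregator <dependency> block, as the replaceregex task does with flags="gs".
val reducedPom = pomText.replaceAll(aggregatorDependencyRegex, "")
```
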
23 changes: 18 additions & 5 deletions dist/pom.xml
@@ -34,11 +34,7 @@
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
<!--
provided such that the 3rd party project depending on this will drop it
https://maven.apache.org/guides/introduction/introduction-to-dependency-mechanism.html#Dependency_Scope
-->
<scope>provided</scope>
<scope>compile</scope>
</dependency>

<!--
@@ -248,6 +244,10 @@
</goals>
<configuration>
<classesDirectory>${project.build.directory}/parallel-world</classesDirectory>
<excludes>
<!-- get rid of all maven poms from shim builds -->
<exclude>META-INF/maven/**</exclude>
</excludes>
</configuration>
</execution>
<execution>
@@ -299,6 +299,19 @@
</target>
</configuration>
</execution>
<execution>
<phase>verify</phase>
<goals>
<goal>run</goal>
</goals>
<id>reduce-pom-deps-in-the-jar</id>
<configuration>
<target>
<zip update="true" basedir="${project.build.directory}/extra-resources"
destfile="${project.build.directory}/${project.artifactId}-${project.version}.jar"/>
</target>
</configuration>
</execution>
<execution>
<id>update_config_docs</id>
<phase>verify</phase>
52 changes: 52 additions & 0 deletions dist/scripts/check-shims-revisions.sh
@@ -0,0 +1,52 @@
#!/bin/bash

#
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -e
#
# check the revisions of each shim; they should all be equal
# $1 is included_buildvers
function check-shims-revisions() {
  included_buildvers="$1"
  # PWD should be spark-rapids root path
  parallel_dir=${PWD}/parallel-world
  pre_revision=""
  pre_shim_version_path=""

  IFS=","
  for shim in ${included_buildvers}; do
    # trim
    shim=$(echo "${shim}" | xargs)
    shim_version_path="${parallel_dir}/spark${shim}/rapids4spark-version-info.properties"
    if [[ -f "$shim_version_path" ]] ; then
      curr_revision=$(grep "revision=" "${shim_version_path}" | cut -d'=' -f2)
      if [ -n "$pre_revision" ] && [[ "$curr_revision" != "$pre_revision" ]] ; then
        echo
        echo "Check Failed: git revisions between shims are not equal"
        echo "Please check the revisions of each shim to see which one is inconsistent. Note, if building with Databricks those jars are built separately."
        exit 1
      fi
      pre_revision="${curr_revision}"
      pre_shim_version_path="${shim_version_path}"
    else
      echo "Error: version file missing: ${shim_version_path}"
      exit 1
    fi
  done
}

check-shims-revisions "$1"
96 changes: 33 additions & 63 deletions docs/compatibility.md
@@ -109,63 +109,31 @@ a few operations that we cannot support to the same degree as Spark can on the C

### Decimal Sum Aggregation

When Apache Spark does a sum aggregation on decimal values it will store the result in a value
with a precision that is the input precision + 10, but with a maximum precision of 38. The table
below shows the number of rows/values in an aggregation before an overflow is possible,
and the number of rows/values in the aggregation before an overflow might not be detected.
The numbers are for Spark 3.1.0 and above after a number of fixes were put in place, please see
A number of fixes for overflow detection went into Spark 3.1.0. Please see
[SPARK-28067](https://issues.apache.org/jira/browse/SPARK-28067) and
[SPARK-32018](https://issues.apache.org/jira/browse/SPARK-32018) for more information.
Please also note that these are for the worst case situations, meaning all the values in the sum
were either the largest or smallest values possible to be stored in the input type. In the common
case, where the numbers are smaller, or vary between positive and negative values, many more
rows/values can be processed without any issues.

|Input Precision|Number of values before overflow is possible|Maximum number of values for guaranteed overflow detection (Spark CPU)|Maximum number of values for guaranteed overflow detection (RAPIDS GPU)|
|---------------|------------------------------|------------|-------------|
|1 |11,111,111,111 |2,049,638,219,301,061,290 |Same as CPU |
|2 |10,101,010,101 |186,330,738,118,278,299 |Same as CPU |
|3 |10,010,010,010 |18,465,199,272,982,534 |Same as CPU |
|4 |10,001,000,100 |1,844,848,892,260,181 |Same as CPU |
|5 |10,000,100,001 |184,459,285,329,948 |Same as CPU |
|6 |10,000,010,000 |18,436,762,510,472 |Same as CPU |
|7 |10,000,001,000 |1,834,674,590,838 |Same as CPU |
|8 |10,000,000,100 |174,467,442,481 |Same as CPU |
|9 |10,000,000,010 |Unlimited |Unlimited |
|10 - 19 |10,000,000,000 |Unlimited |Unlimited |
|20 |10,000,000,000 |Unlimited |3,402,823,659,209,384,634 |
|21 |10,000,000,000 |Unlimited |340,282,356,920,938,463 |
|22 |10,000,000,000 |Unlimited |34,028,226,692,093,846 |
|23 |10,000,000,000 |Unlimited |3,402,813,669,209,384 |
|24 |10,000,000,000 |Unlimited |340,272,366,920,938 |
|25 |10,000,000,000 |Unlimited |34,018,236,692,093 |
|26 |10,000,000,000 |Unlimited |3,392,823,669,209 |
|27 |10,000,000,000 |Unlimited |330,282,366,920 |
|28 |10,000,000,000 |Unlimited |24,028,236,692 |
|29 |1,000,000,000 |Unlimited |Falls back to CPU |
|30 |100,000,000 |Unlimited |Falls back to CPU |
|31 |10,000,000 |Unlimited |Falls back to CPU |
|32 |1,000,000 |Unlimited |Falls back to CPU |
|33 |100,000 |Unlimited |Falls back to CPU |
|34 |10,000 |Unlimited |Falls back to CPU |
|35 |1,000 |Unlimited |Falls back to CPU |
|36 |100 |Unlimited |Falls back to CPU |
|37 |10 |Unlimited |Falls back to CPU |
|38 |1 |Unlimited |Falls back to CPU |

For an input precision of 9 and above, Spark will do the aggregations as a `BigDecimal`
value which is slow, but guarantees that any overflow can be detected. For inputs with a
precision of 8 or below Spark will internally do the calculations as a long value, 64-bits.
When the precision is 8, you would need at least 174-billion values/rows contributing to a
single aggregation result, and even then all the values would need to be either the largest
or the smallest value possible to be stored in the type before the overflow is no longer detected.

For the RAPIDS Accelerator we only have access to at most a 128-bit value to store the results
in and still detect overflow. Because of this we cannot guarantee overflow detection in all
cases. In some cases we can guarantee unlimited overflow detection because of the maximum number of
values that RAPIDS will aggregate in a single batch. But even in the worst case for a decimal value
with a precision of 28 the user would still have to aggregate so many values that it overflows 2.4
times over before we are no longer able to detect it.
[SPARK-32018](https://issues.apache.org/jira/browse/SPARK-32018) for more detailed information.
We were able to back-port some of these fixes, but others require Spark 3.1.0 or above to
detect overflow in all cases. As such, on versions of Spark older than 3.1.0 there is the
possibility of data corruption for large decimal values in some corner cases.
This is true for both the CPU and GPU implementations, but there are fewer of these cases for the
GPU. If this concerns you, you should upgrade to Spark 3.1.0 or above.

When Apache Spark does a sum aggregation on decimal values it will store the result in a value
with a precision that is the input precision + 10, but with a maximum precision of 38.
For an input precision of 9 and above, Spark will do the aggregations as a Java `BigDecimal`
value which is slow, but guarantees that any overflow can be detected because it can work with
effectively unlimited precision. For inputs with a precision of 8 or below Spark will internally do
the calculations as a long value, 64-bits. When the precision is 8, you would need at least
174,467,442,482 values/rows contributing to a single aggregation result before the overflow is no
longer detected. Even then all the values would need to be either the largest or the smallest value
possible to be stored in the type for the overflow to cause data corruption.
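
As a quick, hedged illustration of the result-type rule (a minimal local sketch; the session, app name, and column names are made up):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, sum}
import org.apache.spark.sql.types.DecimalType

val spark = SparkSession.builder().appName("decimal-sum-precision").master("local[*]").getOrCreate()

val df = spark.range(10).select(
  col("id").cast(DecimalType(8, 2)).as("d8"),    // precision 8: Spark sums this as a 64-bit long
  col("id").cast(DecimalType(32, 2)).as("d32"))  // precision 32: Spark sums this as a BigDecimal

// sum(decimal(8,2))  produces decimal(18,2): input precision + 10
// sum(decimal(32,2)) produces decimal(38,2): capped at the maximum precision of 38
df.agg(sum(col("d8")), sum(col("d32"))).printSchema()
```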

For the RAPIDS Accelerator we don't have direct access to unlimited precision for our calculations
like the CPU does. For input values with a precision of 8 and below we follow Spark and process the
data the same way, as a 64-bit value. For larger values we will do extra calculations looking at the
higher order digits to be able to detect overflow in all cases. But because of this you may see
some performance differences depending on the input precision used. The differences will show up
when going from an input precision of 8 to 9 and again when going from an input precision of 28 to 29.

### Decimal Average

@@ -175,8 +143,7 @@ have. It also inherits some issues from Spark itself. See
https://issues.apache.org/jira/browse/SPARK-37024 for a detailed description of some issues
with average in Spark.

In order to be able to guarantee overflow detection on the sum with at least 100-billion values
and to be able to guarantee doing the divide with half up rounding at the end we only support
In order to be able to guarantee doing the divide with half up rounding at the end we only support
average on input values with a precision of 23 or below. This is 38 - 10 for the sum guarantees
and then 5 less to be able to shift the left-hand side of the divide enough to get a correct
answer that can be rounded to the result that Spark would produce.
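
As a hedged sketch of that limit (reusing the `spark` session from the earlier sketch and assuming the RAPIDS Accelerator is installed and enabled; the data is made up), the difference shows up in the physical plans:

```scala
import org.apache.spark.sql.functions.{avg, col}
import org.apache.spark.sql.types.DecimalType

// 38 (max precision) - 10 (sum headroom) - 5 (shift for half-up rounding) = 23
val values = spark.range(100).selectExpr("cast(id as decimal(20, 2)) as v")

val onGpu    = values.select(col("v").cast(DecimalType(23, 2)).as("d")).agg(avg(col("d")))
val fallback = values.select(col("v").cast(DecimalType(24, 2)).as("d")).agg(avg(col("d")))

onGpu.explain()     // precision 23: the average can stay on the GPU
fallback.explain()  // precision 24: exceeds the supported precision, so it falls back to the CPU
```
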
@@ -479,6 +446,7 @@ The following Apache Spark regular expression functions and expressions are supp

- `RLIKE`
- `regexp`
- `regexp_extract`
- `regexp_like`
- `regexp_replace`

@@ -490,6 +458,7 @@ These operations can be enabled on the GPU with the following configuration setting

- `spark.rapids.sql.expression.RLike=true` (for `RLIKE`, `regexp`, and `regexp_like`)
- `spark.rapids.sql.expression.RegExpReplace=true` for `regexp_replace`
- `spark.rapids.sql.expression.RegExpExtract=true` for `regexp_extract`
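
As a hedged example of applying these settings (a sketch only; it assumes the RAPIDS Accelerator jar is already on the classpath with the plugin enabled, and the data is made up):

```scala
import org.apache.spark.sql.SparkSession

// Enable the regular-expression expressions listed above at session start.
val spark = SparkSession.builder()
  .config("spark.rapids.sql.expression.RLike", "true")
  .config("spark.rapids.sql.expression.RegExpReplace", "true")
  .config("spark.rapids.sql.expression.RegExpExtract", "true")
  .getOrCreate()

// With the settings on, simple patterns such as this character class can run on the GPU.
spark.range(5).selectExpr("regexp_replace(cast(id as string), '[0-4]', 'x')").show()
```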

Even when these expressions are enabled, there are instances where regular expression operations will fall back to
CPU when the RAPIDS Accelerator determines that a pattern is either unsupported or would produce incorrect results on the GPU.
@@ -504,12 +473,12 @@ Here are some examples of regular expression patterns that are not supported on
- Empty groups: `()`
- Regular expressions containing null characters (unless the pattern is a simple literal string)
- Beginning-of-line and end-of-line anchors (`^` and `$`) are not supported in some contexts, such as when combined
- with a choice (`^|a`) or when used anywhere in `regexp_replace` patterns.
with a choice (`^|a`) or when used anywhere in `regexp_replace` patterns.

In addition to these cases that can be detected, there is also one known issue that can cause incorrect results:
In addition to these cases that can be detected, there are also known issues that can cause incorrect results:

- `$` does not match the end of a string if the string ends with a line-terminator
([cuDF issue #9620](https://github.com/rapidsai/cudf/issues/9620))
- Character classes for negative matches have different behavior between CPU and GPU for multiline
strings. The pattern `[^a]` will match line-terminators on CPU but not on GPU.
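
A small, hedged illustration of the first issue using `java.util.regex`, which the Spark CPU implementation builds on:

```scala
import java.util.regex.Pattern

// On the CPU, `$` matches just before a trailing line terminator, so this finds a match.
// Per the cuDF issue referenced above, the GPU implementation does not match in this case.
val matched = Pattern.compile("foo$").matcher("foo\n").find()
println(matched)  // true
```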

Work is ongoing to increase the range of regular expressions that can run on the GPU.

@@ -725,7 +694,8 @@ This configuration setting is ignored when using Spark versions prior to 3.1.0.
### Float to String

The GPU will use different precision than Java's toString method when converting floating-point data
types to strings and this can produce results that differ from the default behavior in Spark.
types to strings. The GPU uses a lowercase `e` prefix for an exponent while Spark uses uppercase
`E`. As a result the computed string can differ from the default behavior in Spark.

To enable this operation on the GPU, set
[`spark.rapids.sql.castFloatToString.enabled`](configs.md#sql.castFloatToString.enabled) to `true`.
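
A hedged sketch of the setting in use (the value is illustrative, and the exact rendered strings depend on the Spark version and plugin build):

```scala
// Allow casting floating-point values to strings on the GPU.
spark.conf.set("spark.rapids.sql.castFloatToString.enabled", "true")

// Java's Float.toString renders 0.0001f as "1.0E-4" (uppercase E) on the CPU,
// while the GPU may render a lowercase exponent such as "1.0e-4".
spark.range(1).selectExpr("cast(cast(0.0001 as float) as string) as s").show(false)
```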