Move main part of udf-examples to the external repository spark-rapids-examples

Signed-off-by: Chong Gao <res_life@163.com>
NVIDIA · Jan 24, 2022 · a2ca9e0 · a2ca9e0
1 parent 4747182
commit a2ca9e0
Show file tree

Hide file tree

Showing 25 changed files with 8 additions and 1,569 deletions.
diff --git a/docs/additional-functionality/rapids-udfs.md b/docs/additional-functionality/rapids-udfs.md
@@ -154,9 +154,6 @@ decodes URL-encoded strings using the
 - [URLEncode](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLEncode.java)
 URL-encodes strings using the
 [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
-- [CosineSimilarity](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java)
-computes the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
-between two float vectors using [native code](../../udf-examples/src/main/cpp/src)
 
 ### Hive UDF Examples
 
@@ -168,9 +165,6 @@ to decode URL-encoded strings
 implements a Hive generic UDF using the
 [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
 to URL-encode strings
-- [StringWordCount](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/StringWordCount.java)
-implements a Hive simple UDF using
-[native code](../../udf-examples/src/main/cpp/src) to count words in strings
 
 
 ## GPU Support for Pandas UDF

diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -35,10 +35,6 @@ def pytest_addoption(parser):
     parser.addoption(
         "--cudf_udf", action='store_true', default=False, help="if true enable cudf_udf test"
     )
-    parser.addoption(
-        "--rapids_udf_example_native", action='store_true', default=False,
-        help="if true enable tests for RAPIDS UDF examples with native code"
-    )
     parser.addoption(
         "--test_type", action='store', default="developer",
         help="the type of tests that are being run to help check all the correct tests are run - developer, pre-commit, or nightly"

diff --git a/integration_tests/pytest.ini b/integration_tests/pytest.ini
@@ -1,4 +1,4 @@
-; Copyright (c) 2020-2021, NVIDIA CORPORATION.
+; Copyright (c) 2020-2022, NVIDIA CORPORATION.
 ;
 ; Licensed under the Apache License, Version 2.0 (the "License");
 ; you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@ markers =
     limit(num_rows): Limit the number of rows that will be check in a result
     qarun: Mark qa test
     cudf_udf: Mark udf cudf test
-    rapids_udf_example_native: test UDFs that require custom cuda compilation
     validate_execs_in_gpu_plan([execs]): Exec class names to validate they exist in the GPU plan.
     shuffle_test: Mark to include test in the RAPIDS Shuffle Manager
     premerge_ci_1: Mark test that will run in first k8s pod in case of parallel build premerge job

diff --git a/integration_tests/src/main/python/conftest.py b/integration_tests/src/main/python/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -321,10 +321,3 @@ def enable_cudf_udf(request):
     if not enable_udf_cudf:
         # cudf_udf tests are not required for any test runs
         pytest.skip("cudf_udf not configured to run")
-
-@pytest.fixture(scope="session")
-def enable_rapids_udf_example_native(request):
-    native_enabled = request.config.getoption("rapids_udf_example_native")
-    if not native_enabled:
-        # udf_example_native tests are not required for any test runs
-        pytest.skip("rapids_udf_example_native is not configured to run")
diff --git a/integration_tests/src/main/python/marks.py b/integration_tests/src/main/python/marks.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@
 limit = pytest.mark.limit
 qarun = pytest.mark.qarun
 cudf_udf = pytest.mark.cudf_udf
-rapids_udf_example_native = pytest.mark.rapids_udf_example_native
 shuffle_test = pytest.mark.shuffle_test
 nightly_gpu_mem_consuming_case = pytest.mark.nightly_gpu_mem_consuming_case
 nightly_host_mem_consuming_case = pytest.mark.nightly_host_mem_consuming_case
diff --git a/integration_tests/src/main/python/rapids_udf_test.py b/integration_tests/src/main/python/rapids_udf_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,7 +16,6 @@
 
 from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql
 from data_gen import *
-from marks import rapids_udf_example_native
 from spark_session import with_spark_session
 from pyspark.sql.utils import AnalysisException
 from conftest import skip_unless_precommit_tests
@@ -58,26 +57,6 @@ def evalfn(spark):
         "hive_generic_udf_test_table",
         "SELECT urlencode(s) FROM hive_generic_udf_test_table")
 
-    def evalfn_decimal(spark):
-        load_hive_udf_or_skip_test(spark, "fraction", "com.nvidia.spark.rapids.udf.hive.DecimalFraction")
-        return gen_df(spark, [["dec", DecimalGen(38, 18)]])
-    assert_gpu_and_cpu_are_equal_sql(
-        evalfn_decimal,
-        "hive_generic_udf_test_table",
-        "SELECT fraction(dec) FROM hive_generic_udf_test_table")
-
-@rapids_udf_example_native
-def test_hive_simple_udf_native(enable_rapids_udf_example_native):
-    with_spark_session(skip_if_no_hive)
-    data_gens = [["s", StringGen('.{0,30}')]]
-    def evalfn(spark):
-        load_hive_udf_or_skip_test(spark, "wordcount", "com.nvidia.spark.rapids.udf.hive.StringWordCount")
-        return gen_df(spark, data_gens)
-    assert_gpu_and_cpu_are_equal_sql(
-        evalfn,
-        "hive_native_udf_test_table",
-        "SELECT wordcount(s) FROM hive_native_udf_test_table")
-
 def load_java_udf_or_skip_test(spark, udfname, udfclass, udf_return_type=None):
     drop_udf(spark, udfname)
     try:
@@ -96,38 +75,3 @@ def evalfn(spark):
         load_java_udf_or_skip_test(spark, 'urlencode', 'com.nvidia.spark.rapids.udf.java.URLEncode')
         return unary_op_df(spark, StringGen('.{0,30}')).selectExpr("urlencode(a)")
     assert_gpu_and_cpu_are_equal_collect(evalfn)
-
-def test_java_decimal_fraction():
-    def evalfn(spark):
-        from pyspark.sql.types import DecimalType
-        load_java_udf_or_skip_test(spark, 'fraction',
-                                   'com.nvidia.spark.rapids.udf.java.DecimalFraction')
-        load_java_udf_or_skip_test(spark, 'fraction_dec64_s10',
-                                   'com.nvidia.spark.rapids.udf.java.DecimalFraction',
-                                   DecimalType(18, 10))
-        load_java_udf_or_skip_test(spark, 'fraction_dec32_s3',
-                                   'com.nvidia.spark.rapids.udf.java.DecimalFraction',
-                                   DecimalType(8, 3))
-        return three_col_df(spark, DecimalGen(38, 18), DecimalGen(18, 10), DecimalGen(8, 3)
-                            ).selectExpr("fraction(a)", "fraction_dec64_s10(b)", "fraction_dec32_s3(c)")
-    assert_gpu_and_cpu_are_equal_collect(evalfn)
-
-@rapids_udf_example_native
-def test_java_cosine_similarity_reasonable_range(enable_rapids_udf_example_native):
-    def evalfn(spark):
-        class RangeFloatGen(FloatGen):
-            def start(self, rand):
-                self._start(rand, lambda: rand.uniform(-1000.0, 1000.0))
-        load_java_udf_or_skip_test(spark, "cosine_similarity", "com.nvidia.spark.rapids.udf.java.CosineSimilarity")
-        arraygen = ArrayGen(RangeFloatGen(nullable=False, no_nans=True, special_cases=[]), min_length=8, max_length=8)
-        df = binary_op_df(spark, arraygen)
-        return df.selectExpr("cosine_similarity(a, b)")
-    assert_gpu_and_cpu_are_equal_collect(evalfn)
-
-@rapids_udf_example_native
-def test_java_cosine_similarity_with_nans(enable_rapids_udf_example_native):
-    def evalfn(spark):
-        load_java_udf_or_skip_test(spark, "cosine_similarity", "com.nvidia.spark.rapids.udf.java.CosineSimilarity")
-        arraygen = ArrayGen(FloatGen(nullable=False), min_length=8, max_length=8)
-        return binary_op_df(spark, arraygen).selectExpr("cosine_similarity(a, b)")
-    assert_gpu_and_cpu_are_equal_collect(evalfn)
diff --git a/udf-examples/README.md b/udf-examples/README.md
@@ -6,17 +6,4 @@ user-defined functions. See the
 on how RAPIDS accelerated UDFs work and guidelines for creating them.
 
 ## Building the Native Code Examples
-
-Some of the UDF examples use native code in their implementation.
-Building the native code requires a libcudf build environment, so these
-examples do not build by default. The `udf-native-examples` Maven profile
-can be used to include the native UDF examples in the build, i.e.: specify
- `-Pudf-native-examples` on the `mvn` command-line.
-
-## Creating a libcudf Build Environment
-
-The `Dockerfile` in this directory can be used to setup a Docker image that
-provides a libcudf build environment. This repository will either need to be
-cloned or mounted into a container using that Docker image.
-The `Dockerfile` contains build arguments to control the Linux version,
-CUDA version, and other settings. See the top of the `Dockerfile` for details.
+Please refer to [spark-rapids-examples](https://github.com/NVIDIA/spark-rapids-examples)
diff --git a/udf-examples/pom.xml b/udf-examples/pom.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
-  Copyright (c) 2020-2021, NVIDIA CORPORATION.
+  Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
@@ -32,13 +32,6 @@
   <version>22.02.0-SNAPSHOT</version>
 
   <properties>
-    <udf.native.build.path>${project.build.directory}/cpp-build</udf.native.build.path>
-    <BUILD_UDF_BENCHMARKS>OFF</BUILD_UDF_BENCHMARKS>
-    <CMAKE_CXX_FLAGS/>
-    <GPU_ARCHS>ALL</GPU_ARCHS>
-    <PER_THREAD_DEFAULT_STREAM>ON</PER_THREAD_DEFAULT_STREAM>
-    <CPP_PARALLEL_LEVEL>10</CPP_PARALLEL_LEVEL>
-    <CUDF_ENABLE_ARROW_S3>OFF</CUDF_ENABLE_ARROW_S3>
     <target.classifier/>
   </properties>
 
@@ -58,14 +51,8 @@
     </dependency>
     <dependency>
         <groupId>com.nvidia</groupId>
-        <!--
-        This should depend on rapids-4-spark_${scala.binary.version} instead, but that dependency
-        only exists after the package phase. External projects should depend on
-        rapids-4-spark_${scala.binary.version}.
-        -->
-        <artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
+        <artifactId>rapids-4-spark_${scala.binary.version}</artifactId>
         <version>${project.version}</version>
-        <classifier>${spark.version.classifier}</classifier>
         <scope>provided</scope>
     </dependency>
   </dependencies>
@@ -184,75 +171,5 @@
             </dependency>
         </dependencies>
     </profile>
-    <profile>
-      <id>udf-native-examples</id>
-      <build>
-        <resources>
-          <resource>
-            <directory>${project.build.directory}/native-deps/</directory>
-          </resource>
-        </resources>
-        <plugins>
-          <plugin>
-            <artifactId>maven-antrun-plugin</artifactId>
-            <executions>
-              <execution>
-                <id>cmake</id>
-                <phase>validate</phase>
-                <configuration>
-                  <target>
-                    <mkdir dir="${udf.native.build.path}"/>
-                    <exec dir="${udf.native.build.path}"
-                          failonerror="true"
-                          executable="cmake">
-                      <arg value="${basedir}/src/main/cpp"/>
-                      <arg value="-DBUILD_UDF_BENCHMARKS=${BUILD_UDF_BENCHMARKS}"/>
-                      <arg value="-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"/>
-                      <arg value="-DGPU_ARCHS=${GPU_ARCHS}"/>
-                      <arg value="-DPER_THREAD_DEFAULT_STREAM=${PER_THREAD_DEFAULT_STREAM}"/>
-                      <arg value="-DCUDF_ENABLE_ARROW_S3=${CUDF_ENABLE_ARROW_S3}"/>
-                    </exec>
-                    <exec failonerror="true"
-                          executable="cmake">
-                      <arg value="--build"/>
-                      <arg value="${udf.native.build.path}"/>
-                      <arg value="-j${CPP_PARALLEL_LEVEL}"/>
-                      <arg value="-v"/>
-                    </exec>
-                  </target>
-                </configuration>
-                <goals>
-                  <goal>run</goal>
-                </goals>
-              </execution>
-            </executions>
-          </plugin>
-          <plugin>
-            <artifactId>maven-resources-plugin</artifactId>
-            <executions>
-              <execution>
-                <id>copy-native-libs</id>
-                <phase>validate</phase>
-                <goals>
-                  <goal>copy-resources</goal>
-                </goals>
-                <configuration>
-                  <overwrite>true</overwrite>
-                  <outputDirectory>${project.build.directory}/native-deps/${os.arch}/${os.name}</outputDirectory>
-                  <resources>
-                    <resource>
-                      <directory>${udf.native.build.path}</directory>
-                      <includes>
-                        <include>libudfexamplesjni.so</include>
-                      </includes>
-                    </resource>
-                  </resources>
-                </configuration>
-              </execution>
-            </executions>
-          </plugin>
-        </plugins>
-      </build>
-    </profile>
   </profiles>
 </project>