From 7d1662efb24888d7ef490cacbd8968f7b62e1832 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 23 Feb 2022 12:51:14 +0800 Subject: [PATCH] Move the udf-examples module to the external repository spark-rapids-examples [databricks] (#4619) * Remove the udf-examples module and will add to external spark-rapids-examples repository Signed-off-by: Chong Gao * Remove all the udf-examples related code --- docs/additional-functionality/rapids-udfs.md | 40 +-- integration_tests/README.md | 8 +- integration_tests/conftest.py | 6 +- integration_tests/pom.xml | 46 ++- integration_tests/pytest.ini | 3 +- integration_tests/run_pyspark_from_build.sh | 4 +- integration_tests/src/main/python/conftest.py | 7 - integration_tests/src/main/python/marks.py | 3 +- .../src/main/python/rapids_udf_test.py | 133 --------- .../src/main/python/row-based_udf_test.py | 20 +- .../tests}/udf/hive/EmptyHiveGenericUDF.java | 4 +- .../tests}/udf/hive/EmptyHiveSimpleUDF.java | 4 +- jenkins/databricks/build.sh | 1 - jenkins/spark-tests.sh | 7 +- pom.xml | 14 - tests/pom.xml | 6 - .../nvidia/spark/rapids/ScalaUDFSuite.scala | 4 +- .../rapids/tests}/udf/scala/URLDecode.scala | 4 +- .../rapids/tests}/udf/scala/URLEncode.scala | 4 +- udf-examples/Dockerfile | 72 ----- udf-examples/README.md | 22 -- udf-examples/pom.xml | 264 ------------------ udf-examples/src/main/cpp/CMakeLists.txt | 175 ------------ .../src/main/cpp/benchmarks/CMakeLists.txt | 36 --- .../cosine_similarity_benchmark.cpp | 91 ------ .../benchmarks/fixture/benchmark_fixture.hpp | 91 ------ .../synchronization/synchronization.cpp | 60 ---- .../synchronization/synchronization.hpp | 101 ------- .../src/main/cpp/src/CosineSimilarityJni.cpp | 94 ------- .../src/main/cpp/src/StringWordCountJni.cpp | 80 ------ .../src/main/cpp/src/cosine_similarity.cu | 145 ---------- .../src/main/cpp/src/cosine_similarity.hpp | 35 --- .../src/main/cpp/src/string_word_count.cu | 93 ------ .../src/main/cpp/src/string_word_count.hpp | 28 -- .../rapids/udf/hive/DecimalFraction.java | 101 ------- .../rapids/udf/hive/StringWordCount.java | 83 ------ .../spark/rapids/udf/hive/URLDecode.java | 74 ----- .../spark/rapids/udf/hive/URLEncode.java | 110 -------- .../rapids/udf/java/CosineSimilarity.java | 79 ------ .../rapids/udf/java/DecimalFraction.java | 60 ---- .../udf/java/NativeUDFExamplesLoader.java | 38 --- .../spark/rapids/udf/java/URLDecode.java | 74 ----- .../spark/rapids/udf/java/URLEncode.java | 67 ----- 43 files changed, 77 insertions(+), 2314 deletions(-) delete mode 100644 integration_tests/src/main/python/rapids_udf_test.py rename {udf-examples/src/main/java/com/nvidia/spark/rapids => integration_tests/src/test/java/com/nvidia/spark/rapids/tests}/udf/hive/EmptyHiveGenericUDF.java (96%) rename {udf-examples/src/main/java/com/nvidia/spark/rapids => integration_tests/src/test/java/com/nvidia/spark/rapids/tests}/udf/hive/EmptyHiveSimpleUDF.java (89%) rename {udf-examples/src/main/scala/com/nvidia/spark/rapids => tests/src/test/scala/com/nvidia/spark/rapids/tests}/udf/scala/URLDecode.scala (96%) rename {udf-examples/src/main/scala/com/nvidia/spark/rapids => tests/src/test/scala/com/nvidia/spark/rapids/tests}/udf/scala/URLEncode.scala (94%) delete mode 100644 udf-examples/Dockerfile delete mode 100644 udf-examples/README.md delete mode 100644 udf-examples/pom.xml delete mode 100755 udf-examples/src/main/cpp/CMakeLists.txt delete mode 100644 udf-examples/src/main/cpp/benchmarks/CMakeLists.txt delete mode 100644 
udf-examples/src/main/cpp/benchmarks/cosine_similarity/cosine_similarity_benchmark.cpp delete mode 100644 udf-examples/src/main/cpp/benchmarks/fixture/benchmark_fixture.hpp delete mode 100644 udf-examples/src/main/cpp/benchmarks/synchronization/synchronization.cpp delete mode 100644 udf-examples/src/main/cpp/benchmarks/synchronization/synchronization.hpp delete mode 100644 udf-examples/src/main/cpp/src/CosineSimilarityJni.cpp delete mode 100644 udf-examples/src/main/cpp/src/StringWordCountJni.cpp delete mode 100644 udf-examples/src/main/cpp/src/cosine_similarity.cu delete mode 100644 udf-examples/src/main/cpp/src/cosine_similarity.hpp delete mode 100644 udf-examples/src/main/cpp/src/string_word_count.cu delete mode 100644 udf-examples/src/main/cpp/src/string_word_count.hpp delete mode 100644 udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/DecimalFraction.java delete mode 100644 udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/StringWordCount.java delete mode 100644 udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLDecode.java delete mode 100644 udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLEncode.java delete mode 100644 udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java delete mode 100644 udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/DecimalFraction.java delete mode 100644 udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/NativeUDFExamplesLoader.java delete mode 100644 udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLDecode.java delete mode 100644 udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLEncode.java diff --git a/docs/additional-functionality/rapids-udfs.md b/docs/additional-functionality/rapids-udfs.md index 13c6767f1c6..64cb92f76d7 100644 --- a/docs/additional-functionality/rapids-udfs.md +++ b/docs/additional-functionality/rapids-udfs.md @@ -134,44 +134,8 @@ type `DECIMAL64(scale=-2)`. ## RAPIDS Accelerated UDF Examples -Source code for examples of RAPIDS accelerated Hive UDFs is provided -in the [udf-examples](../../udf-examples) project. 
- -### Spark Scala UDF Examples - -- [URLDecode](../../udf-examples/src/main/scala/com/nvidia/spark/rapids/udf/scala/URLDecode.scala) -decodes URL-encoded strings using the -[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable) -- [URLEncode](../../udf-examples/src/main/scala/com/nvidia/spark/rapids/udf/scala/URLEncode.scala) -URL-encodes strings using the -[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable) - -### Spark Java UDF Examples - -- [URLDecode](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLDecode.java) -decodes URL-encoded strings using the -[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable) -- [URLEncode](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLEncode.java) -URL-encodes strings using the -[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable) -- [CosineSimilarity](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java) -computes the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) -between two float vectors using [native code](../../udf-examples/src/main/cpp/src) - -### Hive UDF Examples - -- [URLDecode](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLDecode.java) -implements a Hive simple UDF using the -[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable) -to decode URL-encoded strings -- [URLEncode](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLEncode.java) -implements a Hive generic UDF using the -[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable) -to URL-encode strings -- [StringWordCount](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/StringWordCount.java) -implements a Hive simple UDF using -[native code](../../udf-examples/src/main/cpp/src) to count words in strings - + +Source code for examples of RAPIDS accelerated UDFs is provided in the [udf-examples](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04/examples/RAPIDS-accelerated-UDFs) project. ## GPU Support for Pandas UDF diff --git a/integration_tests/README.md b/integration_tests/README.md index 772df557e43..a2966727dcc 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -245,7 +245,7 @@ The test files are everything under `./integration_tests/src/test/resources/` B where you placed them because you will need to tell the tests where they are. When running these tests you will need to include the test jar, the integration test jar, -the udf-examples jar, scala-test and scalactic. You can find scala-test and scalactic under +the scala-test and scalactic. You can find scala-test and scalactic under `~/.m2/repository`. 
It is recommended that you use `spark-shell` and the scalatest shell to run each test @@ -253,7 +253,7 @@ individually, so you don't risk running unit tests along with the integration te http://www.scalatest.org/user_guide/using_the_scalatest_shell ```shell -spark-shell --jars rapids-4-spark-tests_2.12-22.04.0-SNAPSHOT-tests.jar,rapids-4-spark-udf-examples_2.12-22.04.0-SNAPSHOT.jar,rapids-4-spark-integration-tests_2.12-22.04.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar +spark-shell --jars rapids-4-spark-tests_2.12-22.04.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-22.04.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar ``` First you import the `scalatest_shell` and tell the tests where they can find the test files you @@ -276,7 +276,7 @@ If you just want to verify the SQL replacement is working you will need to add t example assumes CUDA 11.0 is being used. ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar,rapids-4-spark-udf-examples_2.12-22.04.0-SNAPSHOT.jar,cudf-22.04.0-SNAPSHOT-cuda11.jar" ./runtests.py +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar,cudf-22.04.0-SNAPSHOT-cuda11.jar" ./runtests.py ``` You don't have to enable the plugin for this to work, the test framework will do that for you. @@ -375,7 +375,7 @@ To run cudf_udf tests, need following configuration changes: As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0: ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar,rapids-4-spark-udf-examples_2.12-22.04.0-SNAPSHOT.jar,cudf-22.04.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-22.04.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar" ./runtests.py --cudf_udf +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar,cudf-22.04.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-22.04.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar" ./runtests.py --cudf_udf ``` ## Writing tests diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py index 109e0477fdd..c453677013f 100644 --- a/integration_tests/conftest.py +++ b/integration_tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -35,10 +35,6 @@ def pytest_addoption(parser): parser.addoption( "--cudf_udf", action='store_true', default=False, help="if true enable cudf_udf test" ) - parser.addoption( - "--rapids_udf_example_native", action='store_true', default=False, - help="if true enable tests for RAPIDS UDF examples with native code" - ) parser.addoption( "--test_type", action='store', default="developer", help="the type of tests that are being run to help check all the correct tests are run - developer, pre-commit, or nightly" diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index 75713510974..eef75b9c1bc 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -60,17 +60,16 @@ ${project.version} provided - - com.nvidia - rapids-4-spark-udf-examples_${scala.binary.version} - ${project.version} - test - org.apache.spark spark-sql_${scala.binary.version} ${spark.test.version} + + + org.apache.spark + spark-hive_${scala.binary.version} + @@ -108,6 +107,17 @@ curator-recipes 4.3.0.7.2.7.0-184 + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark311cdh.version} + + + org.apache.spark + spark-core_${scala.binary.version} + + + @@ -178,6 +188,30 @@ ${spark.version} provided + + org.apache.hive + hive-exec + ${spark.version} + provided + + + org.apache.hive + hive-serde + ${spark.version} + provided + + + org.apache.commons + commons-io + ${spark.version} + provided + + + org.apache.hadoop + hadoop-common + ${spark.version} + provided + diff --git a/integration_tests/pytest.ini b/integration_tests/pytest.ini index 3b50202b1fa..53bca62da6b 100644 --- a/integration_tests/pytest.ini +++ b/integration_tests/pytest.ini @@ -1,4 +1,4 @@ -; Copyright (c) 2020-2021, NVIDIA CORPORATION. +; Copyright (c) 2020-2022, NVIDIA CORPORATION. ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. @@ -22,7 +22,6 @@ markers = limit(num_rows): Limit the number of rows that will be check in a result qarun: Mark qa test cudf_udf: Mark udf cudf test - rapids_udf_example_native: test UDFs that require custom cuda compilation validate_execs_in_gpu_plan([execs]): Exec class names to validate they exist in the GPU plan. 
shuffle_test: Mark to include test in the RAPIDS Shuffle Manager premerge_ci_1: Mark test that will run in first k8s pod in case of parallel build premerge job diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 5dd8947fcb9..217f2625bf4 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -42,14 +42,12 @@ else CUDF_JARS=$(echo "$LOCAL_JAR_PATH"/cudf-*.jar) PLUGIN_JARS=$(echo "$LOCAL_JAR_PATH"/rapids-4-spark_*.jar) TEST_JARS=$(echo "$LOCAL_JAR_PATH"/rapids-4-spark-integration-tests*-$SPARK_SHIM_VER*.jar) - UDF_EXAMPLE_JARS=$(echo "$LOCAL_JAR_PATH"/rapids-4-spark-udf-examples*.jar) else CUDF_JARS=$(echo "$SCRIPTPATH"/target/dependency/cudf-*.jar) PLUGIN_JARS=$(echo "$SCRIPTPATH"/../dist/target/rapids-4-spark_*.jar) TEST_JARS=$(echo "$SCRIPTPATH"/target/rapids-4-spark-integration-tests*-$SPARK_SHIM_VER*.jar) - UDF_EXAMPLE_JARS=$(echo "$SCRIPTPATH"/../udf-examples/target/rapids-4-spark-udf-examples*.jar) fi - ALL_JARS="$CUDF_JARS $PLUGIN_JARS $TEST_JARS $UDF_EXAMPLE_JARS" + ALL_JARS="$CUDF_JARS $PLUGIN_JARS $TEST_JARS" echo "AND PLUGIN JARS: $ALL_JARS" if [[ "${TEST}" != "" ]]; then diff --git a/integration_tests/src/main/python/conftest.py b/integration_tests/src/main/python/conftest.py index e398b579889..8033b886806 100644 --- a/integration_tests/src/main/python/conftest.py +++ b/integration_tests/src/main/python/conftest.py @@ -330,10 +330,3 @@ def enable_cudf_udf(request): if not enable_udf_cudf: # cudf_udf tests are not required for any test runs pytest.skip("cudf_udf not configured to run") - -@pytest.fixture(scope="session") -def enable_rapids_udf_example_native(request): - native_enabled = request.config.getoption("rapids_udf_example_native") - if not native_enabled: - # udf_example_native tests are not required for any test runs - pytest.skip("rapids_udf_example_native is not configured to run") diff --git a/integration_tests/src/main/python/marks.py b/integration_tests/src/main/python/marks.py index 1db8a82f7dd..2994f052170 100644 --- a/integration_tests/src/main/python/marks.py +++ b/integration_tests/src/main/python/marks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,7 +23,6 @@ limit = pytest.mark.limit qarun = pytest.mark.qarun cudf_udf = pytest.mark.cudf_udf -rapids_udf_example_native = pytest.mark.rapids_udf_example_native shuffle_test = pytest.mark.shuffle_test nightly_gpu_mem_consuming_case = pytest.mark.nightly_gpu_mem_consuming_case nightly_host_mem_consuming_case = pytest.mark.nightly_host_mem_consuming_case diff --git a/integration_tests/src/main/python/rapids_udf_test.py b/integration_tests/src/main/python/rapids_udf_test.py deleted file mode 100644 index 3926afe1f5d..00000000000 --- a/integration_tests/src/main/python/rapids_udf_test.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql -from data_gen import * -from marks import rapids_udf_example_native -from spark_session import with_spark_session -from pyspark.sql.utils import AnalysisException -from conftest import skip_unless_precommit_tests - -encoded_url_gen = StringGen('([^%]{0,1}(%[0-9A-F][0-9A-F]){0,1}){0,30}') - -def drop_udf(spark, udfname): - spark.sql("DROP TEMPORARY FUNCTION IF EXISTS {}".format(udfname)) - -def skip_if_no_hive(spark): - if spark.conf.get("spark.sql.catalogImplementation") != "hive": - skip_unless_precommit_tests('The Spark session does not have Hive support') - -def load_hive_udf_or_skip_test(spark, udfname, udfclass): - drop_udf(spark, udfname) - try: - spark.sql("CREATE TEMPORARY FUNCTION {} AS '{}'".format(udfname, udfclass)) - except AnalysisException: - skip_unless_precommit_tests("UDF {} failed to load, udf-examples jar is probably missing".format(udfname)) - -def test_hive_simple_udf(): - with_spark_session(skip_if_no_hive) - data_gens = [["i", int_gen], ["s", encoded_url_gen]] - def evalfn(spark): - load_hive_udf_or_skip_test(spark, "urldecode", "com.nvidia.spark.rapids.udf.hive.URLDecode") - return gen_df(spark, data_gens) - assert_gpu_and_cpu_are_equal_sql( - evalfn, - "hive_simple_udf_test_table", - "SELECT i, urldecode(s) FROM hive_simple_udf_test_table") - -def test_hive_generic_udf(): - with_spark_session(skip_if_no_hive) - def evalfn(spark): - load_hive_udf_or_skip_test(spark, "urlencode", "com.nvidia.spark.rapids.udf.hive.URLEncode") - return gen_df(spark, [["s", StringGen('.{0,30}')]]) - assert_gpu_and_cpu_are_equal_sql( - evalfn, - "hive_generic_udf_test_table", - "SELECT urlencode(s) FROM hive_generic_udf_test_table") - - def evalfn_decimal(spark): - load_hive_udf_or_skip_test(spark, "fraction", "com.nvidia.spark.rapids.udf.hive.DecimalFraction") - return gen_df(spark, [["dec", DecimalGen(38, 18)]]) - assert_gpu_and_cpu_are_equal_sql( - evalfn_decimal, - "hive_generic_udf_test_table", - "SELECT fraction(dec) FROM hive_generic_udf_test_table") - -@rapids_udf_example_native -def test_hive_simple_udf_native(enable_rapids_udf_example_native): - with_spark_session(skip_if_no_hive) - data_gens = [["s", StringGen('.{0,30}')]] - def evalfn(spark): - load_hive_udf_or_skip_test(spark, "wordcount", "com.nvidia.spark.rapids.udf.hive.StringWordCount") - return gen_df(spark, data_gens) - assert_gpu_and_cpu_are_equal_sql( - evalfn, - "hive_native_udf_test_table", - "SELECT wordcount(s) FROM hive_native_udf_test_table") - -def load_java_udf_or_skip_test(spark, udfname, udfclass, udf_return_type=None): - drop_udf(spark, udfname) - try: - spark.udf.registerJavaFunction(udfname, udfclass, udf_return_type) - except AnalysisException: - skip_unless_precommit_tests("UDF {} failed to load, udf-examples jar is probably missing".format(udfname)) - -def test_java_url_decode(): - def evalfn(spark): - load_java_udf_or_skip_test(spark, 'urldecode', 'com.nvidia.spark.rapids.udf.java.URLDecode') - return unary_op_df(spark, encoded_url_gen).selectExpr("urldecode(a)") - 
assert_gpu_and_cpu_are_equal_collect(evalfn) - -def test_java_url_encode(): - def evalfn(spark): - load_java_udf_or_skip_test(spark, 'urlencode', 'com.nvidia.spark.rapids.udf.java.URLEncode') - return unary_op_df(spark, StringGen('.{0,30}')).selectExpr("urlencode(a)") - assert_gpu_and_cpu_are_equal_collect(evalfn) - -def test_java_decimal_fraction(): - def evalfn(spark): - from pyspark.sql.types import DecimalType - load_java_udf_or_skip_test(spark, 'fraction', - 'com.nvidia.spark.rapids.udf.java.DecimalFraction') - load_java_udf_or_skip_test(spark, 'fraction_dec64_s10', - 'com.nvidia.spark.rapids.udf.java.DecimalFraction', - DecimalType(18, 10)) - load_java_udf_or_skip_test(spark, 'fraction_dec32_s3', - 'com.nvidia.spark.rapids.udf.java.DecimalFraction', - DecimalType(8, 3)) - return three_col_df(spark, DecimalGen(38, 18), DecimalGen(18, 10), DecimalGen(8, 3) - ).selectExpr("fraction(a)", "fraction_dec64_s10(b)", "fraction_dec32_s3(c)") - assert_gpu_and_cpu_are_equal_collect(evalfn) - -@rapids_udf_example_native -def test_java_cosine_similarity_reasonable_range(enable_rapids_udf_example_native): - def evalfn(spark): - class RangeFloatGen(FloatGen): - def start(self, rand): - self._start(rand, lambda: rand.uniform(-1000.0, 1000.0)) - load_java_udf_or_skip_test(spark, "cosine_similarity", "com.nvidia.spark.rapids.udf.java.CosineSimilarity") - arraygen = ArrayGen(RangeFloatGen(nullable=False, no_nans=True, special_cases=[]), min_length=8, max_length=8) - df = binary_op_df(spark, arraygen) - return df.selectExpr("cosine_similarity(a, b)") - assert_gpu_and_cpu_are_equal_collect(evalfn) - -@rapids_udf_example_native -def test_java_cosine_similarity_with_nans(enable_rapids_udf_example_native): - def evalfn(spark): - load_java_udf_or_skip_test(spark, "cosine_similarity", "com.nvidia.spark.rapids.udf.java.CosineSimilarity") - arraygen = ArrayGen(FloatGen(nullable=False), min_length=8, max_length=8) - return binary_op_df(spark, arraygen).selectExpr("cosine_similarity(a, b)") - assert_gpu_and_cpu_are_equal_collect(evalfn) diff --git a/integration_tests/src/main/python/row-based_udf_test.py b/integration_tests/src/main/python/row-based_udf_test.py index 5cf2cdfdd58..2b0c7e56368 100644 --- a/integration_tests/src/main/python/row-based_udf_test.py +++ b/integration_tests/src/main/python/row-based_udf_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,13 +17,25 @@ from asserts import assert_gpu_and_cpu_are_equal_sql from data_gen import * from spark_session import with_spark_session -from rapids_udf_test import skip_if_no_hive, load_hive_udf_or_skip_test +from conftest import skip_unless_precommit_tests + +def drop_udf(spark, udfname): + spark.sql("DROP TEMPORARY FUNCTION IF EXISTS {}".format(udfname)) + +def skip_if_no_hive(spark): + if spark.conf.get("spark.sql.catalogImplementation") != "hive": + skip_unless_precommit_tests('The Spark session does not have Hive support') + +def load_hive_udf(spark, udfname, udfclass): + drop_udf(spark, udfname) + # if UDF failed to load, throws AnalysisException, check if the udf class is in the class path + spark.sql("CREATE TEMPORARY FUNCTION {} AS '{}'".format(udfname, udfclass)) def test_hive_empty_simple_udf(): with_spark_session(skip_if_no_hive) data_gens = [["i", int_gen], ["s", string_gen]] def evalfn(spark): - load_hive_udf_or_skip_test(spark, "emptysimple", "com.nvidia.spark.rapids.udf.hive.EmptyHiveSimpleUDF") + load_hive_udf(spark, "emptysimple", "com.nvidia.spark.rapids.tests.udf.hive.EmptyHiveSimpleUDF") return gen_df(spark, data_gens) assert_gpu_and_cpu_are_equal_sql( evalfn, @@ -34,7 +46,7 @@ def evalfn(spark): def test_hive_empty_generic_udf(): with_spark_session(skip_if_no_hive) def evalfn(spark): - load_hive_udf_or_skip_test(spark, "emptygeneric", "com.nvidia.spark.rapids.udf.hive.EmptyHiveGenericUDF") + load_hive_udf(spark, "emptygeneric", "com.nvidia.spark.rapids.tests.udf.hive.EmptyHiveGenericUDF") return gen_df(spark, [["s", string_gen]]) assert_gpu_and_cpu_are_equal_sql( evalfn, diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/EmptyHiveGenericUDF.java b/integration_tests/src/test/java/com/nvidia/spark/rapids/tests/udf/hive/EmptyHiveGenericUDF.java similarity index 96% rename from udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/EmptyHiveGenericUDF.java rename to integration_tests/src/test/java/com/nvidia/spark/rapids/tests/udf/hive/EmptyHiveGenericUDF.java index b026936eb7e..fb589a35db8 100644 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/EmptyHiveGenericUDF.java +++ b/integration_tests/src/test/java/com/nvidia/spark/rapids/tests/udf/hive/EmptyHiveGenericUDF.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.nvidia.spark.rapids.udf.hive; +package com.nvidia.spark.rapids.tests.udf.hive; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.metadata.HiveException; diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/EmptyHiveSimpleUDF.java b/integration_tests/src/test/java/com/nvidia/spark/rapids/tests/udf/hive/EmptyHiveSimpleUDF.java similarity index 89% rename from udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/EmptyHiveSimpleUDF.java rename to integration_tests/src/test/java/com/nvidia/spark/rapids/tests/udf/hive/EmptyHiveSimpleUDF.java index f2c5a51a49f..337579c2f79 100644 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/EmptyHiveSimpleUDF.java +++ b/integration_tests/src/test/java/com/nvidia/spark/rapids/tests/udf/hive/EmptyHiveSimpleUDF.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.nvidia.spark.rapids.udf.hive; +package com.nvidia.spark.rapids.tests.udf.hive; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh index ef8521aea95..eb37f2b990e 100755 --- a/jenkins/databricks/build.sh +++ b/jenkins/databricks/build.sh @@ -59,7 +59,6 @@ SCALA_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=scala.binary.version - CUDA_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=cuda.version -DforceStdout` RAPIDS_BUILT_JAR=rapids-4-spark_$SCALA_VERSION-$SPARK_PLUGIN_JAR_VERSION.jar -RAPIDS_UDF_JAR=rapids-4-spark-udf-examples_$SCALA_VERSION-$SPARK_PLUGIN_JAR_VERSION.jar echo "Scala version is: $SCALA_VERSION" # export 'M2DIR' so that shims can get the correct cudf/spark dependnecy info diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh index a78838edcde..69e95f09330 100755 --- a/jenkins/spark-tests.sh +++ b/jenkins/spark-tests.sh @@ -32,8 +32,6 @@ $MVN_GET_CMD -DremoteRepositories=$CUDF_REPO \ -DgroupId=ai.rapids -DartifactId=cudf -Dversion=$CUDF_VER -Dclassifier=$CUDA_CLASSIFIER $MVN_GET_CMD -DremoteRepositories=$PROJECT_REPO \ -DgroupId=com.nvidia -DartifactId=rapids-4-spark_$SCALA_BINARY_VER -Dversion=$PROJECT_VER -$MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \ - -DgroupId=com.nvidia -DartifactId=rapids-4-spark-udf-examples_$SCALA_BINARY_VER -Dversion=$PROJECT_TEST_VER # TODO remove -Dtransitive=false workaround once pom is fixed $MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \ @@ -45,7 +43,6 @@ else CUDF_JAR="$ARTF_ROOT/cudf-$CUDF_VER-$CUDA_CLASSIFIER.jar" fi export RAPIDS_PLUGIN_JAR="$ARTF_ROOT/rapids-4-spark_${SCALA_BINARY_VER}-$PROJECT_VER.jar" -RAPIDS_UDF_JAR="$ARTF_ROOT/rapids-4-spark-udf-examples_${SCALA_BINARY_VER}-$PROJECT_TEST_VER.jar" RAPIDS_TEST_JAR="$ARTF_ROOT/rapids-4-spark-integration-tests_${SCALA_BINARY_VER}-$PROJECT_TEST_VER-$SHUFFLE_SPARK_SHIM.jar" # TODO remove -Dtransitive=false workaround once pom is fixed @@ -84,15 +81,13 @@ echo "-------------------- rapids-4-spark-integration-tests BUILD INFO --------- it_ver=$(getRevision $JARS_PATH/$RAPIDS_TEST_JAR rapids4spark-version-info.properties) echo "-------------------- rapids-4-spark-integration-tests pytest BUILD INFO --------------------" >> "$tmp_info" pt_ver=$(getRevision $JARS_PATH/$RAPIDS_INT_TESTS_TGZ integration_tests/rapids4spark-version-info.properties) -echo "-------------------- rapids-4-spark-udf-examples BUILD INFO --------------------" >> "$tmp_info" -u_ver=$(getRevision $JARS_PATH/$RAPIDS_UDF_JAR rapids4spark-version-info.properties) echo -e "\n==================== ARTIFACTS BUILD INFO ====================\n" >> "$tmp_info" set -x cat "$tmp_info" || true SKIP_REVISION_CHECK=${SKIP_REVISION_CHECK:-'false'} if [[ "$SKIP_REVISION_CHECK" != "true" && (-z "$c_ver" || -z "$p_ver"|| \ - "$p_ver" != "$it_ver" || "$p_ver" != "$pt_ver" || "$p_ver" != "$u_ver") ]]; then + "$p_ver" != "$it_ver" || "$p_ver" != "$pt_ver" ]]; then echo "Artifacts revisions are inconsistent!" 
exit 1 fi diff --git a/pom.xml b/pom.xml index a9aedf0aef1..37f446b9582 100644 --- a/pom.xml +++ b/pom.xml @@ -137,7 +137,6 @@ sql-plugin tests udf-compiler - udf-examples api_validation tools aggregator @@ -192,7 +191,6 @@ sql-plugin tests udf-compiler - udf-examples aggregator tools api_validation @@ -246,7 +244,6 @@ sql-plugin tests udf-compiler - udf-examples api_validation tools aggregator @@ -300,7 +297,6 @@ sql-plugin tests udf-compiler - udf-examples api_validation tools aggregator @@ -358,7 +354,6 @@ sql-plugin tests udf-compiler - udf-examples api_validation tools aggregator @@ -422,7 +417,6 @@ sql-plugin tests udf-compiler - udf-examples aggregator @@ -487,7 +481,6 @@ sql-plugin tests udf-compiler - udf-examples aggregator @@ -543,7 +536,6 @@ sql-plugin tests udf-compiler - udf-examples tools aggregator api_validation @@ -602,7 +594,6 @@ sql-plugin tests udf-compiler - udf-examples tools aggregator api_validation @@ -667,7 +658,6 @@ sql-plugin tests udf-compiler - udf-examples tools aggregator tests-spark310+ @@ -731,7 +721,6 @@ sql-plugin tests udf-compiler - udf-examples tools aggregator tests-spark310+ @@ -795,7 +784,6 @@ sql-plugin tests udf-compiler - udf-examples tools aggregator tests-spark310+ @@ -858,7 +846,6 @@ sql-plugin tests udf-compiler - udf-examples tools aggregator tests-spark310+ @@ -920,7 +907,6 @@ sql-plugin tests udf-compiler - udf-examples tools aggregator tests-spark310+ diff --git a/tests/pom.xml b/tests/pom.xml index 3329f0dee7c..7247c7b37d4 100644 --- a/tests/pom.xml +++ b/tests/pom.xml @@ -62,12 +62,6 @@ ${spark.version.classifier} test - - com.nvidia - rapids-4-spark-udf-examples_${scala.binary.version} - ${project.version} - test - org.mockito mockito-core diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ScalaUDFSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ScalaUDFSuite.scala index 1ebb5356032..ea7fc49eaaa 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ScalaUDFSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ScalaUDFSuite.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ package com.nvidia.spark.rapids -import com.nvidia.spark.rapids.udf.scala.{URLDecode, URLEncode} +import com.nvidia.spark.rapids.tests.udf.scala.{URLDecode, URLEncode} import org.apache.spark.sql.functions.col diff --git a/udf-examples/src/main/scala/com/nvidia/spark/rapids/udf/scala/URLDecode.scala b/tests/src/test/scala/com/nvidia/spark/rapids/tests/udf/scala/URLDecode.scala similarity index 96% rename from udf-examples/src/main/scala/com/nvidia/spark/rapids/udf/scala/URLDecode.scala rename to tests/src/test/scala/com/nvidia/spark/rapids/tests/udf/scala/URLDecode.scala index e99e1bdfa2a..2e812ff99da 100644 --- a/udf-examples/src/main/scala/com/nvidia/spark/rapids/udf/scala/URLDecode.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/tests/udf/scala/URLDecode.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package com.nvidia.spark.rapids.udf.scala +package com.nvidia.spark.rapids.tests.udf.scala import java.net.URLDecoder diff --git a/udf-examples/src/main/scala/com/nvidia/spark/rapids/udf/scala/URLEncode.scala b/tests/src/test/scala/com/nvidia/spark/rapids/tests/udf/scala/URLEncode.scala similarity index 94% rename from udf-examples/src/main/scala/com/nvidia/spark/rapids/udf/scala/URLEncode.scala rename to tests/src/test/scala/com/nvidia/spark/rapids/tests/udf/scala/URLEncode.scala index 95bae373645..3c91f2e62ac 100644 --- a/udf-examples/src/main/scala/com/nvidia/spark/rapids/udf/scala/URLEncode.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/tests/udf/scala/URLEncode.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.nvidia.spark.rapids.udf.scala +package com.nvidia.spark.rapids.tests.udf.scala import java.net.URLEncoder diff --git a/udf-examples/Dockerfile b/udf-examples/Dockerfile deleted file mode 100644 index f55f1016ee2..00000000000 --- a/udf-examples/Dockerfile +++ /dev/null @@ -1,72 +0,0 @@ -# -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# A container that can be used to build UDF native code against libcudf -ARG CUDA_VERSION=11.0 -ARG LINUX_VERSION=ubuntu18.04 - -FROM nvidia/cuda:${CUDA_VERSION}-devel-${LINUX_VERSION} - -ARG DEBIAN_FRONTEND=noninteractive -ARG PARALLEL_LEVEL=10 -ENV PARALLEL_LEVEL=10 - -RUN GCC_VERSION=$(bash -c '\ -CUDA_VERSION=$(nvcc --version | head -n4 | tail -n1 | cut -d" " -f5 | cut -d"," -f1); \ -CUDA_VERSION_MAJOR=$(echo $CUDA_VERSION | tr -d '.' | cut -c 1-2); \ -CUDA_VERSION_MINOR=$(echo $CUDA_VERSION | tr -d '.' 
| cut -c 3); \ - if [[ "$CUDA_VERSION_MAJOR" == 9 ]]; then echo "7"; \ - elif [[ "$CUDA_VERSION_MAJOR" == 10 ]]; then echo "8"; \ - elif [[ "$CUDA_VERSION_MAJOR" == 11 ]]; then echo "9"; \ - else echo "10"; \ - fi') \ -&& apt update -y \ -&& apt install -y software-properties-common \ -&& add-apt-repository -y ppa:git-core/ppa \ -&& add-apt-repository -y ppa:ubuntu-toolchain-r/test \ -&& add-apt-repository ppa:deadsnakes/ppa \ -&& apt update -y \ -&& apt install -y \ - build-essential git rsync wget \ - gcc-${GCC_VERSION} g++-${GCC_VERSION} \ - openjdk-8-jdk maven tzdata \ - # CMake dependencies - curl libssl-dev libcurl4-openssl-dev zlib1g-dev \ -&& apt autoremove -y \ -&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ -&& update-alternatives \ - --install /usr/bin/gcc gcc /usr/bin/gcc-${GCC_VERSION} 100 \ -&& update-alternatives \ - --install /usr/bin/g++ g++ /usr/bin/g++-${GCC_VERSION} 100 \ -# Set gcc-${GCC_VERSION} as the default gcc -&& update-alternatives --set gcc /usr/bin/gcc-${GCC_VERSION} \ -# Set gcc-${GCC_VERSION} as the default g++ -&& update-alternatives --set g++ /usr/bin/g++-${GCC_VERSION} \ -# Set JDK8 as the default Java -&& update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java - -ARG CMAKE_VERSION=3.20.5 - -# Install CMake -RUN cd /tmp \ - && curl -fsSLO --compressed "https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION.tar.gz" -o /tmp/cmake-$CMAKE_VERSION.tar.gz \ - && tar -xvzf /tmp/cmake-$CMAKE_VERSION.tar.gz && cd /tmp/cmake-$CMAKE_VERSION \ - && /tmp/cmake-$CMAKE_VERSION/bootstrap \ - --system-curl \ - --parallel=${PARALLEL_LEVEL} \ - && make install -j${PARALLEL_LEVEL} \ - && cd /tmp && rm -rf /tmp/cmake-$CMAKE_VERSION* - diff --git a/udf-examples/README.md b/udf-examples/README.md deleted file mode 100644 index 15e258b3f74..00000000000 --- a/udf-examples/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# RAPIDS Accelerated UDF Examples - -This project contains sample implementations of RAPIDS accelerated -user-defined functions. See the -[RAPIDS accelerated UDF documentation](../docs/additional-functionality/rapids-udfs.md) for details -on how RAPIDS accelerated UDFs work and guidelines for creating them. - -## Building the Native Code Examples - -Some of the UDF examples use native code in their implementation. -Building the native code requires a libcudf build environment, so these -examples do not build by default. The `udf-native-examples` Maven profile -can be used to include the native UDF examples in the build, i.e.: specify - `-Pudf-native-examples` on the `mvn` command-line. - -## Creating a libcudf Build Environment - -The `Dockerfile` in this directory can be used to setup a Docker image that -provides a libcudf build environment. This repository will either need to be -cloned or mounted into a container using that Docker image. -The `Dockerfile` contains build arguments to control the Linux version, -CUDA version, and other settings. See the top of the `Dockerfile` for details. 
diff --git a/udf-examples/pom.xml b/udf-examples/pom.xml deleted file mode 100644 index f44cce94e81..00000000000 --- a/udf-examples/pom.xml +++ /dev/null @@ -1,264 +0,0 @@ - - - - 4.0.0 - - - com.nvidia - rapids-4-spark-parent - 22.04.0-SNAPSHOT - - rapids-4-spark-udf-examples_2.12 - RAPIDS Accelerator for Apache Spark UDF Examples - Sample implementations of RAPIDS accelerated - user defined functions for use with the RAPIDS Accelerator - for Apache Spark - 22.04.0-SNAPSHOT - - - ${project.build.directory}/cpp-build - OFF - - ALL - ON - 10 - OFF - - - - - - ai.rapids - cudf - ${cuda.version} - - - org.apache.spark - spark-hive_${scala.binary.version} - - - org.scala-lang - scala-library - - - com.nvidia - - rapids-4-spark-sql_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - provided - - - - - - - - ${project.build.directory}/extra-resources - true - - - - - maven-assembly-plugin - - - jar - - - - - net.alchim31.maven - scala-maven-plugin - - - org.apache.rat - apache-rat-plugin - - - - - - - - dbdeps - - - databricks - - - - - org.apache.spark - spark-sql_${scala.binary.version} - provided - - - org.apache.hive - hive-exec - ${spark.version} - provided - - - org.apache.hive - hive-serde - ${spark.version} - provided - - - org.apache.commons - commons-io - ${spark.version} - provided - - - org.apache.hadoop - hadoop-common - ${spark.version} - provided - - - org.apache.hive - hive-storage-api - ${spark.version} - provided - - - - - release311cdh - - - buildver - 311cdh - - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark311cdh.version} - - - org.apache.curator - curator-recipes - - - provided - - - org.apache.spark - spark-hive_${scala.binary.version} - ${spark311cdh.version} - - - org.apache.spark - spark-core_${scala.binary.version} - - - - - org.apache.curator - curator-recipes - 4.3.0.7.2.7.0-184 - provided - - - - - udf-native-examples - - - - ${project.build.directory}/native-deps/ - - - - - maven-antrun-plugin - - - cmake - validate - - - - - - - - - - - - - - - - - - - - - run - - - - - - maven-resources-plugin - - - copy-native-libs - validate - - copy-resources - - - true - ${project.build.directory}/native-deps/${os.arch}/${os.name} - - - ${udf.native.build.path} - - libudfexamplesjni.so - - - - - - - - - - - - diff --git a/udf-examples/src/main/cpp/CMakeLists.txt b/udf-examples/src/main/cpp/CMakeLists.txt deleted file mode 100755 index c97dd926a50..00000000000 --- a/udf-examples/src/main/cpp/CMakeLists.txt +++ /dev/null @@ -1,175 +0,0 @@ -#============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= - -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) - -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.04/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake) -include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) - -include(rapids-cmake) -include(rapids-cpm) -include(rapids-cuda) -include(rapids-export) -include(rapids-find) - -# Use GPU_ARCHS if it is defined -if(DEFINED GPU_ARCHS) - set(CMAKE_CUDA_ARCHITECTURES "${GPU_ARCHS}") -endif() -rapids_cuda_init_architectures(UDFEXAMPLESJNI) - -project(UDFEXAMPLESJNI VERSION 22.04.0 LANGUAGES C CXX CUDA) - -option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) -option(BUILD_UDF_BENCHMARKS "Build the benchmarks" OFF) - -################################################################################################### -# - build type ------------------------------------------------------------------------------------ - -# Set a default build type if none was specified -set(DEFAULT_BUILD_TYPE "Release") - -################################################################################################### -# - compiler options ------------------------------------------------------------------------------ - -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_COMPILER $ENV{CXX}) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) - -if(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations") -endif(CMAKE_COMPILER_IS_GNUCXX) - -if(CMAKE_CUDA_COMPILER_VERSION) - # Compute the version. 
from CMAKE_CUDA_COMPILER_VERSION - string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${CMAKE_CUDA_COMPILER_VERSION}) - string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${CMAKE_CUDA_COMPILER_VERSION}) - set(CUDA_VERSION "${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}" CACHE STRING "Version of CUDA as computed from nvcc.") - mark_as_advanced(CUDA_VERSION) -endif() - -message(STATUS "CUDA_VERSION_MAJOR: ${CUDA_VERSION_MAJOR}") -message(STATUS "CUDA_VERSION_MINOR: ${CUDA_VERSION_MINOR}") -message(STATUS "CUDA_VERSION: ${CUDA_VERSION}") - -# Always set this convenience variable -set(CUDA_VERSION_STRING "${CUDA_VERSION}") - -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w --expt-extended-lambda --expt-relaxed-constexpr") - -#################################################################################################### -# - cudf ------------------------------------------------------------------------------------------- - -# Ensure CUDA runtime is dynamic despite statically linking Arrow in libcudf -set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) - -rapids_cpm_init() -rapids_cpm_find(cudf 22.04.00 - CPM_ARGS - GIT_REPOSITORY https://github.com/rapidsai/cudf.git - GIT_TAG branch-22.04 - GIT_SHALLOW TRUE - SOURCE_SUBDIR cpp - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "CUDF_USE_ARROW_STATIC ON" - "JITIFY_USE_CACHE ON" - "CUDA_STATIC_RUNTIME OFF" - "DISABLE_DEPRECATION_WARNING ON" - "AUTO_DETECT_CUDA_ARCHITECTURES OFF" - ) - -################################################################################################### -# - benchmarks ------------------------------------------------------------------------------------ - -if(BUILD_UDF_BENCHMARKS) - # Find or install GoogleBench - CPMFindPackage(NAME benchmark - VERSION 1.5.2 - GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.5.2 - GIT_SHALLOW TRUE - OPTIONS "BENCHMARK_ENABLE_TESTING OFF" - "BENCHMARK_ENABLE_INSTALL OFF") - add_subdirectory(benchmarks) -endif() - -################################################################################################### -# - find JNI ------------------------------------------------------------------------------------- - -find_package(JNI REQUIRED) -if(JNI_FOUND) - message(STATUS "JDK with JNI in ${JNI_INCLUDE_DIRS}") -else() - message(FATAL_ERROR "JDK with JNI not found, please check your settings.") -endif(JNI_FOUND) - -################################################################################################### -# - library paths --------------------------------------------------------------------------------- - -# CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc -link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" - "${CMAKE_BINARY_DIR}/lib") - - -################################################################################################### -# - library targets ------------------------------------------------------------------------------- - -set(SOURCE_FILES - "src/CosineSimilarityJni.cpp" - "src/StringWordCountJni.cpp" - "src/cosine_similarity.cu" - "src/string_word_count.cu") - -add_library(udfexamplesjni SHARED ${SOURCE_FILES}) - -#Override RPATH for udfexamplesjni -SET_TARGET_PROPERTIES(udfexamplesjni PROPERTIES BUILD_RPATH "\$ORIGIN") - -################################################################################################### -# - build options --------------------------------------------------------------------------------- - 
-option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) -if(PER_THREAD_DEFAULT_STREAM) - message(STATUS "Using per-thread default stream") - target_compile_definitions(udfexamplesjni PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM) -endif(PER_THREAD_DEFAULT_STREAM) - -target_include_directories(udfexamplesjni PRIVATE ${JNI_INCLUDE_DIRS}) - -################################################################################################### -# - rmm logging level ----------------------------------------------------------------------------- - -set(RMM_LOGGING_LEVEL "OFF" CACHE STRING "Choose the logging level.") -# Set the possible values of build type for cmake-gui -set_property(CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS - "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF") -message(STATUS "RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'.") - -target_compile_definitions(udfexamplesjni - PUBLIC SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}) - -################################################################################################### -# - link libraries -------------------------------------------------------------------------------- - -target_link_libraries(udfexamplesjni cudf::cudf) diff --git a/udf-examples/src/main/cpp/benchmarks/CMakeLists.txt b/udf-examples/src/main/cpp/benchmarks/CMakeLists.txt deleted file mode 100644 index 1fe05394422..00000000000 --- a/udf-examples/src/main/cpp/benchmarks/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -#============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -# Use an OBJECT library so we only compile these helper source files only once -add_library(udf_benchmark_common OBJECT - synchronization/synchronization.cpp) - -target_link_libraries(udf_benchmark_common PUBLIC benchmark::benchmark cudf) - -target_include_directories(udf_benchmark_common - PUBLIC "$" - "$" - "$/src") - -function(ConfigureBench CMAKE_BENCH_NAME) - add_executable(${CMAKE_BENCH_NAME} ${ARGN}) - set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") - target_link_libraries(${CMAKE_BENCH_NAME} - PRIVATE udf_benchmark_common udfexamplesjni benchmark::benchmark_main) -endfunction() - -ConfigureBench(COSINE_SIMILARITY_BENCH cosine_similarity/cosine_similarity_benchmark.cpp) diff --git a/udf-examples/src/main/cpp/benchmarks/cosine_similarity/cosine_similarity_benchmark.cpp b/udf-examples/src/main/cpp/benchmarks/cosine_similarity/cosine_similarity_benchmark.cpp deleted file mode 100644 index 5d852ab251a..00000000000 --- a/udf-examples/src/main/cpp/benchmarks/cosine_similarity/cosine_similarity_benchmark.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "benchmarks/fixture/benchmark_fixture.hpp" -#include "benchmarks/synchronization/synchronization.hpp" -#include "cosine_similarity.hpp" - -#include -#include -#include -#include - -static void cosine_similarity_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 0; - int const max_rowlen = 1 << 12; - int const len_mult = 8; - for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < std::numeric_limits::max()) { - b->Args({row_count, rowlen}); - } - } - } -} - -static void BM_cosine_similarity(benchmark::State& state) -{ - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const list_len{static_cast(state.range(1))}; - - auto val_start = cudf::make_fixed_width_scalar(1.0f); - auto val_step = cudf::make_fixed_width_scalar(-1.0f); - auto child_rows = n_rows * list_len; - auto col1_child = cudf::sequence(child_rows, *val_start); - auto col2_child = cudf::sequence(child_rows, *val_start, *val_step); - auto offset_start = cudf::make_fixed_width_scalar(static_cast(0)); - auto offset_step = cudf::make_fixed_width_scalar(list_len); - auto offsets = cudf::sequence(n_rows + 1, *offset_start, *offset_step); - - auto col1 = cudf::make_lists_column( - n_rows, - std::make_unique(*offsets), - std::move(col1_child), - 0, - cudf::create_null_mask(n_rows, cudf::mask_state::ALL_VALID)); - auto lcol1 = cudf::lists_column_view(*col1); - auto col2 = cudf::make_lists_column( - n_rows, - std::move(offsets), - std::move(col2_child), - 0, - cudf::create_null_mask(n_rows, cudf::mask_state::ALL_VALID)); - auto lcol2 = cudf::lists_column_view(*col2); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto output = cosine_similarity(lcol1, lcol2); - } - - state.SetBytesProcessed(state.iterations() * child_rows * sizeof(float)); -} - -class CosineSimilarity : public native_udf::benchmark { -}; - -BENCHMARK_DEFINE_F(CosineSimilarity, cosine_similarity) -(::benchmark::State& state) { BM_cosine_similarity(state); } - -BENCHMARK_REGISTER_F(CosineSimilarity, cosine_similarity) - ->Apply(cosine_similarity_bench_args) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); diff --git a/udf-examples/src/main/cpp/benchmarks/fixture/benchmark_fixture.hpp b/udf-examples/src/main/cpp/benchmarks/fixture/benchmark_fixture.hpp deleted file mode 100644 index 32766cf682a..00000000000 --- a/udf-examples/src/main/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace native_udf { - -namespace { -// memory resource factory helpers -inline auto make_cuda() { return std::make_shared(); } - -inline auto make_pool() -{ - return rmm::mr::make_owning_wrapper(make_cuda()); -} -} // namespace - -/** - * @brief Google Benchmark fixture for native UDF benchmarks - * - * Native UDF benchmarks should use a fixture derived from this fixture class to - * ensure that the RAPIDS Memory Manager pool mode is used in benchmarks, which - * eliminates memory allocation / deallocation performance overhead from the - * benchmark. - * - * The SetUp and TearDown methods of this fixture initialize RMM into pool mode - * and finalize it, respectively. These methods are called automatically by - * Google Benchmark - * - * Example: - * - * template - * class my_benchmark : public native_udf::benchmark { - * public: - * using TypeParam = T; - * }; - * - * Then: - * - * BENCHMARK_TEMPLATE_DEFINE_F(my_benchmark, my_test_name, int) - * (::benchmark::State& state) { - * for (auto _ : state) { - * // benchmark stuff - * } - * } - * - * BENCHMARK_REGISTER_F(my_benchmark, my_test_name)->Range(128, 512); - */ -class benchmark : public ::benchmark::Fixture { - public: - virtual void SetUp(const ::benchmark::State& state) - { - mr = make_pool(); - rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool - } - - virtual void TearDown(const ::benchmark::State& state) - { - // reset default resource to the initial resource - rmm::mr::set_current_device_resource(nullptr); - mr.reset(); - } - - // eliminate partial override warnings (see benchmark/benchmark.h) - virtual void SetUp(::benchmark::State& st) { SetUp(const_cast(st)); } - virtual void TearDown(::benchmark::State& st) - { - TearDown(const_cast(st)); - } - - std::shared_ptr mr; -}; - -} // namespace native_udf diff --git a/udf-examples/src/main/cpp/benchmarks/synchronization/synchronization.cpp b/udf-examples/src/main/cpp/benchmarks/synchronization/synchronization.cpp deleted file mode 100644 index 8802b409052..00000000000 --- a/udf-examples/src/main/cpp/benchmarks/synchronization/synchronization.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "synchronization.hpp" - -#include - -#include -#include - -cuda_event_timer::cuda_event_timer(benchmark::State& state, - bool flush_l2_cache, - rmm::cuda_stream_view stream) - : stream(stream), p_state(&state) -{ - // flush all of L2$ - if (flush_l2_cache) { - int current_device = 0; - CUDA_TRY(cudaGetDevice(¤t_device)); - - int l2_cache_bytes = 0; - CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); - - if (l2_cache_bytes > 0) { - const int memset_value = 0; - rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); - CUDA_TRY( - cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); - } - } - - CUDA_TRY(cudaEventCreate(&start)); - CUDA_TRY(cudaEventCreate(&stop)); - CUDA_TRY(cudaEventRecord(start, stream.value())); -} - -cuda_event_timer::~cuda_event_timer() -{ - CUDA_TRY(cudaEventRecord(stop, stream.value())); - CUDA_TRY(cudaEventSynchronize(stop)); - - float milliseconds = 0.0f; - CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop)); - p_state->SetIterationTime(milliseconds / (1000.0f)); - CUDA_TRY(cudaEventDestroy(start)); - CUDA_TRY(cudaEventDestroy(stop)); -} diff --git a/udf-examples/src/main/cpp/benchmarks/synchronization/synchronization.hpp b/udf-examples/src/main/cpp/benchmarks/synchronization/synchronization.hpp deleted file mode 100644 index 7a5fbb36a83..00000000000 --- a/udf-examples/src/main/cpp/benchmarks/synchronization/synchronization.hpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file synchronization.hpp - * @brief This is the header file for `cuda_event_timer`. - */ - -/** - * @brief This class serves as a wrapper for using `cudaEvent_t` as the user - * defined timer within the framework of google benchmark - * (https://github.com/google/benchmark). - * - * It is built on top of the idea of Resource acquisition is initialization - * (RAII). In the following we show a minimal example of how to use this class. - - #include - - static void sample_cuda_benchmark(benchmark::State& state) { - - for (auto _ : state){ - - rmm::cuda_stream_view stream{}; // default stream, could be another stream - - // Create (Construct) an object of this class. You HAVE to pass in the - // benchmark::State object you are using. It measures the time from its - // creation to its destruction that is spent on the specified CUDA stream. - // It also clears the L2 cache by cudaMemset'ing a device buffer that is of - // the size of the L2 cache (if flush_l2_cache is set to true and there is - // an L2 cache on the current device). - cuda_event_timer raii(state, true, stream); // flush_l2_cache = true - - // Now perform the operations that is to be benchmarked - sample_kernel<<<1, 256, 0, stream.value()>>>(); // Possibly launching a CUDA kernel - - } - } - - // Register the function as a benchmark. 
You will need to set the `UseManualTime()` - // flag in order to use the timer embedded in this class. - BENCHMARK(sample_cuda_benchmark)->UseManualTime(); - - - */ - -#ifndef UDF_BENCH_SYNCHRONIZATION_H -#define UDF_BENCH_SYNCHRONIZATION_H - -// Google Benchmark library -#include - -#include - -#include - -class cuda_event_timer { - public: - /** - * @brief This c'tor clears the L2$ by cudaMemset'ing a buffer of L2$ size - * and starts the timer. - * - * @param[in,out] state This is the benchmark::State whose timer we are going - * to update. - * @param[in] flush_l2_cache_ whether or not to flush the L2 cache before - * every iteration. - * @param[in] stream_ The CUDA stream we are measuring time on. - */ - cuda_event_timer(benchmark::State& state, - bool flush_l2_cache, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - // The user must provide a benchmark::State object to set - // the timer so we disable the default c'tor. - cuda_event_timer() = delete; - - // The d'tor stops the timer and performs a synchronization. - // Time of the benchmark::State object provided to the c'tor - // will be set to the value given by `cudaEventElapsedTime`. - ~cuda_event_timer(); - - private: - cudaEvent_t start; - cudaEvent_t stop; - rmm::cuda_stream_view stream; - benchmark::State* p_state; -}; - -#endif diff --git a/udf-examples/src/main/cpp/src/CosineSimilarityJni.cpp b/udf-examples/src/main/cpp/src/CosineSimilarityJni.cpp deleted file mode 100644 index e063d8dff16..00000000000 --- a/udf-examples/src/main/cpp/src/CosineSimilarityJni.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -#include "cosine_similarity.hpp" - -namespace { - -constexpr char const* RUNTIME_ERROR_CLASS = "java/lang/RuntimeException"; -constexpr char const* ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException"; - -/** - * @brief Throw a Java exception - * - * @param env The Java environment - * @param class_name The fully qualified Java class name of the exception - * @param msg The message string to associate with the exception - */ -void throw_java_exception(JNIEnv* env, char const* class_name, char const* msg) { - jclass ex_class = env->FindClass(class_name); - if (ex_class != NULL) { - env->ThrowNew(ex_class, msg); - } -} - -} // anonymous namespace - -extern "C" { - -/** - * @brief The native implementation of CosineSimilarity.cosineSimilarity which - * computes the cosine similarity between two LIST(FLOAT32) columns as a FLOAT32 - * columnar result. 
- * - * @param env The Java environment - * @param j_view1 The address of the cudf column view of the first LIST column - * @param j_view2 The address of the cudf column view of the second LIST column - * @return The address of the cudf column containing the FLOAT32 results - */ -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_udf_java_CosineSimilarity_cosineSimilarity(JNIEnv* env, jclass, - jlong j_view1, - jlong j_view2) { - // Use a try block to translate C++ exceptions into Java exceptions to avoid - // crashing the JVM if a C++ exception occurs. - try { - // turn the addresses into column_view pointers - auto v1 = reinterpret_cast(j_view1); - auto v2 = reinterpret_cast(j_view2); - if (v1->type().id() != v2->type().id() || v1->type().id() != cudf::type_id::LIST) { - throw_java_exception(env, ILLEGAL_ARG_CLASS, "inputs not list columns"); - return 0; - } - - // run the GPU kernel to compute the cosine similarity - auto lv1 = cudf::lists_column_view(*v1); - auto lv2 = cudf::lists_column_view(*v2); - std::unique_ptr result = cosine_similarity(lv1, lv2); - - // take ownership of the column and return the column address to Java - return reinterpret_cast(result.release()); - } catch (std::bad_alloc const& e) { - auto msg = std::string("Unable to allocate native memory: ") + - (e.what() == nullptr ? "" : e.what()); - throw_java_exception(env, RUNTIME_ERROR_CLASS, msg.c_str()); - } catch (std::invalid_argument const& e) { - throw_java_exception(env, ILLEGAL_ARG_CLASS, e.what() == nullptr ? "" : e.what()); - } catch (std::exception const& e) { - auto msg = e.what() == nullptr ? "" : e.what(); - throw_java_exception(env, RUNTIME_ERROR_CLASS, msg); - } - return 0; -} - -} diff --git a/udf-examples/src/main/cpp/src/StringWordCountJni.cpp b/udf-examples/src/main/cpp/src/StringWordCountJni.cpp deleted file mode 100644 index 678bd6af2da..00000000000 --- a/udf-examples/src/main/cpp/src/StringWordCountJni.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include - -#include "string_word_count.hpp" - -namespace { - -constexpr char const* RUNTIME_ERROR_CLASS = "java/lang/RuntimeException"; - -/** - * @brief Throw a Java exception - * - * @param env The Java environment - * @param class_name The fully qualified Java class name of the exception - * @param msg The message string to associate with the exception - */ -void throw_java_exception(JNIEnv* env, char const* class_name, char const* msg) { - jclass ex_class = env->FindClass(class_name); - if (ex_class != NULL) { - env->ThrowNew(ex_class, msg); - } -} - -} // anonymous namespace - -extern "C" { - -/** - * @brief The native implementation of StringWordCount.countWords which counts the - * number of words per string in a string column. 
- * - * @param env The Java environment - * @param j_strings The address of the cudf column view of the strings column - * @return The address of the cudf column containing the word counts - */ -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_udf_hive_StringWordCount_countWords(JNIEnv* env, jclass, - jlong j_strings) { - // Use a try block to translate C++ exceptions into Java exceptions to avoid - // crashing the JVM if a C++ exception occurs. - try { - // turn the addresses into column_view pointers - auto strs = reinterpret_cast(j_strings); - - // run the GPU kernel to compute the word counts - std::unique_ptr result = string_word_count(*strs); - - // take ownership of the column and return the column address to Java - return reinterpret_cast(result.release()); - } catch (std::bad_alloc const& e) { - auto msg = std::string("Unable to allocate native memory: ") + - (e.what() == nullptr ? "" : e.what()); - throw_java_exception(env, RUNTIME_ERROR_CLASS, msg.c_str()); - } catch (std::exception const& e) { - auto msg = e.what() == nullptr ? "" : e.what(); - throw_java_exception(env, RUNTIME_ERROR_CLASS, msg); - } - return 0; -} - -} diff --git a/udf-examples/src/main/cpp/src/cosine_similarity.cu b/udf-examples/src/main/cpp/src/cosine_similarity.cu deleted file mode 100644 index 3ad61bd43d6..00000000000 --- a/udf-examples/src/main/cpp/src/cosine_similarity.cu +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cosine_similarity.hpp" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include - -namespace { - -/** - * @brief Functor for computing the cosine similarity between two list of float columns - */ -struct cosine_similarity_functor { - float const* const v1; - float const* const v2; - int32_t const* const v1_offsets; - int32_t const* const v2_offsets; - - // This kernel executes thread-per-row which should be fine for relatively short lists - // but may need to be revisited for performance if operating on long lists. 
- __device__ float operator()(cudf::size_type row_idx) { - auto const v1_start_idx = v1_offsets[row_idx]; - auto const v1_num_elems = v1_offsets[row_idx + 1] - v1_start_idx; - auto const v2_start_idx = v2_offsets[row_idx]; - auto const v2_num_elems = v2_offsets[row_idx + 1] - v2_start_idx; - auto const num_elems = std::min(v1_num_elems, v2_num_elems); - double mag1 = 0; - double mag2 = 0; - double dot_product = 0; - for (auto i = 0; i < num_elems; i++) { - float const f1 = v1[v1_start_idx + i]; - mag1 += f1 * f1; - float const f2 = v2[v2_start_idx + i]; - mag2 += f2 * f2; - dot_product += f1 * f2; - } - mag1 = std::sqrt(mag1); - mag2 = std::sqrt(mag2); - return static_cast(dot_product / (mag1 * mag2)); - } -}; - -} // anonymous namespace - -/** - * @brief Compute the cosine similarity between two LIST of FLOAT32 columns - * - * The input vectors must have matching shapes, i.e.: same row count and same number of - * list elements per row. A null list row is supported, but null float entries within a - * list are not supported. - * - * @param lv1 The first LIST of FLOAT32 column view - * @param lv2 The second LIST of FLOAT32 column view - * @return A FLOAT32 column containing the cosine similarity corresponding to each input row - */ -std::unique_ptr cosine_similarity(cudf::lists_column_view const& lv1, - cudf::lists_column_view const& lv2) { - // sanity-check the input types - if (lv1.child().type().id() != lv2.child().type().id() || - lv1.child().type().id() != cudf::type_id::FLOAT32) { - throw std::invalid_argument("inputs are not lists of floats"); - } - - // sanity check the input shape - auto const row_count = lv1.size(); - if (row_count != lv2.size()) { - throw std::invalid_argument("input row counts do not match"); - } - if (row_count == 0) { - return cudf::make_empty_column(cudf::data_type{cudf::type_id::FLOAT32}); - } - if (lv1.child().null_count() != 0 || lv2.child().null_count() != 0) { - throw std::invalid_argument("null floats are not supported"); - } - - auto const stream = rmm::cuda_stream_default; - auto d_view1_ptr = cudf::column_device_view::create(lv1.parent()); - auto d_lists1 = cudf::detail::lists_column_device_view(*d_view1_ptr); - auto d_view2_ptr = cudf::column_device_view::create(lv2.parent()); - auto d_lists2 = cudf::detail::lists_column_device_view(*d_view2_ptr); - bool const are_offsets_equal = - thrust::all_of(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(row_count), - [d_lists1, d_lists2] __device__(cudf::size_type idx) { - auto ldv1 = cudf::list_device_view(d_lists1, idx); - auto ldv2 = cudf::list_device_view(d_lists2, idx); - return ldv1.is_null() || ldv2.is_null() || ldv1.size() == ldv2.size(); - }); - if (not are_offsets_equal) { - throw std::invalid_argument("input list lengths do not match for every row"); - } - - // allocate the vector of float results - rmm::device_uvector float_results(row_count, stream); - - // compute the cosine similarity - auto const lv1_data = lv1.child().data(); - auto const lv2_data = lv2.child().data(); - auto const lv1_offsets = lv1.offsets().data(); - auto const lv2_offsets = lv2.offsets().data(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(row_count), - float_results.data(), - cosine_similarity_functor({lv1_data, lv2_data, lv1_offsets, lv2_offsets})); - - // the validity of the output is the bitwise-and of the two input validity masks - auto [null_mask, null_count] = 
cudf::bitmask_and(cudf::table_view({lv1.parent(), lv2.parent()})); - - return std::make_unique(cudf::data_type{cudf::type_id::FLOAT32}, - row_count, - float_results.release(), - std::move(null_mask), - null_count); -} diff --git a/udf-examples/src/main/cpp/src/cosine_similarity.hpp b/udf-examples/src/main/cpp/src/cosine_similarity.hpp deleted file mode 100644 index 9e23d0ff654..00000000000 --- a/udf-examples/src/main/cpp/src/cosine_similarity.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -/** - * @brief Compute the cosine similarity between two LIST of FLOAT32 columns - * - * The input vectors must have matching shapes, i.e.: same row count and same number of - * list elements per row. A null list row is supported, but null float entries within a - * list are not supported. - * - * @param lv1 The first LIST of FLOAT32 column view - * @param lv2 The second LIST of FLOAT32 column view - * @return A FLOAT32 column containing the cosine similarity corresponding to each input row - */ -std::unique_ptr cosine_similarity(cudf::lists_column_view const& lv1, - cudf::lists_column_view const& lv2); diff --git a/udf-examples/src/main/cpp/src/string_word_count.cu b/udf-examples/src/main/cpp/src/string_word_count.cu deleted file mode 100644 index 8891ceb9c45..00000000000 --- a/udf-examples/src/main/cpp/src/string_word_count.cu +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "string_word_count.hpp" - -#include -#include -#include - -#include -#include -#include - -#include -#include - -namespace { - -// count the words separated by whitespace characters -__device__ cudf::size_type count_words(cudf::column_device_view const& d_strings, - cudf::size_type idx) { - if (d_strings.is_null(idx)) return 0; - cudf::string_view const d_str = d_strings.element(idx); - cudf::size_type word_count = 0; - // run of whitespace is considered a single delimiter - bool spaces = true; - auto itr = d_str.begin(); - while (itr != d_str.end()) { - cudf::char_utf8 ch = *itr; - if (spaces == (ch <= ' ')) { - itr++; - } else { - word_count += static_cast(spaces); - spaces = !spaces; - } - } - - return word_count; -} - - -} // anonymous namespace - -/** - * @brief Count the words in a string using whitespace as word boundaries - * - * @param strs The column containing the strings - * @param stream The CUDA stream to use - * @return The INT32 column containing the word count results per string - */ -std::unique_ptr string_word_count(cudf::column_view const& strs) { - auto strings_count = strs.size(); - if (strings_count == 0) { - return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); - } - - // the validity of the output matches the validity of the input - rmm::device_buffer null_mask = cudf::copy_bitmask(strs); - - // allocate the column that will contain the word count results - std::unique_ptr result = - cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT32}, - strs.size(), - std::move(null_mask), - strs.null_count()); - - // compute the word counts, writing into the result column data buffer - auto stream = rmm::cuda_stream_default; - auto strs_device_view = cudf::column_device_view::create(strs, stream); - auto d_strs_view = *strs_device_view; - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - result->mutable_view().data(), - [d_strs_view] __device__(cudf::size_type idx) { return count_words(d_strs_view, idx); }); - - return result; -} diff --git a/udf-examples/src/main/cpp/src/string_word_count.hpp b/udf-examples/src/main/cpp/src/string_word_count.hpp deleted file mode 100644 index ee9b31991e7..00000000000 --- a/udf-examples/src/main/cpp/src/string_word_count.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -/** - * @brief Count the words in a string separated by whitespace - * - * @param strs The column containing the strings to be examined - * @return The INT32 column containing the word count results for each string - */ -std::unique_ptr string_word_count(cudf::column_view const& strs); diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/DecimalFraction.java b/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/DecimalFraction.java deleted file mode 100644 index 4c9b48ccf60..00000000000 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/DecimalFraction.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.udf.hive; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.Scalar; -import com.nvidia.spark.RapidsUDF; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; - -import java.math.BigDecimal; - - -/** - * A simple HiveGenericUDF demo for DecimalType, which extracts and returns - * the fraction part of the input Decimal data. So, the output data has the - * same precision and scale as the input one. 
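As a point of reference for the behavior described above, the fraction extraction can be sketched with plain BigDecimal arithmetic, mirroring the row-by-row evaluate path that follows (class name and sample values are illustrative):

    import java.math.BigDecimal;

    // Minimal CPU-only sketch of the fraction-extraction rule: subtract the
    // integer part (truncated toward zero) so the input scale is preserved.
    public class DecimalFractionSketch {
      static BigDecimal fraction(BigDecimal dec) {
        if (dec == null) {
          return null;
        }
        BigDecimal integral = new BigDecimal(dec.toBigInteger());
        return dec.subtract(integral);
      }

      public static void main(String[] args) {
        System.out.println(fraction(new BigDecimal("123.456"))); // 0.456 (scale 3 preserved)
        System.out.println(fraction(new BigDecimal("-2.75")));   // -0.75 (sign follows the input)
        System.out.println(fraction(new BigDecimal("42")));      // 0 (no fractional digits)
      }
    }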
- */ -public class DecimalFraction extends GenericUDF implements RapidsUDF { - private transient PrimitiveObjectInspector inputOI; - - @Override - public String getDisplayString(String[] strings) { - return getStandardDisplayString("DecimalFraction", strings); - } - - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - if (arguments.length != 1) { - throw new UDFArgumentException("One argument is supported, found: " + arguments.length); - } - if (!(arguments[0] instanceof PrimitiveObjectInspector)) { - throw new UDFArgumentException("Unsupported argument type: " + arguments[0].getTypeName()); - } - - inputOI = (PrimitiveObjectInspector) arguments[0]; - if (inputOI.getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.DECIMAL) { - throw new UDFArgumentException("Unsupported primitive type: " + inputOI.getPrimitiveCategory()); - } - - DecimalTypeInfo inputTypeInfo = (DecimalTypeInfo) inputOI.getTypeInfo(); - - return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(inputTypeInfo); - } - - @Override - public Object evaluate(GenericUDF.DeferredObject[] arguments) throws HiveException { - if (arguments[0] == null || arguments[0].get() == null) { - return null; - } - - Object input = arguments[0].get(); - HiveDecimalWritable decimalWritable = (HiveDecimalWritable) inputOI.getPrimitiveWritableObject(input); - BigDecimal decimalInput = decimalWritable.getHiveDecimal().bigDecimalValue(); - BigDecimal decimalResult = decimalInput.subtract(new BigDecimal(decimalInput.toBigInteger())); - HiveDecimalWritable result = new HiveDecimalWritable(decimalWritable); - result.set(HiveDecimal.create(decimalResult)); - - return result; - } - - @Override - public ColumnVector evaluateColumnar(ColumnVector... args) { - if (args.length != 1) { - throw new IllegalArgumentException("Unexpected argument count: " + args.length); - } - ColumnVector input = args[0]; - if (!input.getType().isDecimalType()) { - throw new IllegalArgumentException("Argument type is not a decimal column: " + - input.getType()); - } - - try (Scalar nullScalar = Scalar.fromNull(input.getType()); - ColumnVector nullPredicate = input.isNull(); - ColumnVector integral = input.floor(); - ColumnVector fraction = input.sub(integral, input.getType())) { - return nullPredicate.ifElse(nullScalar, fraction); - } - } -} diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/StringWordCount.java b/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/StringWordCount.java deleted file mode 100644 index 89cbfdcaa6a..00000000000 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/StringWordCount.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-package com.nvidia.spark.rapids.udf.hive;
-
-import ai.rapids.cudf.ColumnVector;
-import ai.rapids.cudf.DType;
-import ai.rapids.cudf.NativeDepsLoader;
-import com.nvidia.spark.RapidsUDF;
-import com.nvidia.spark.rapids.udf.java.NativeUDFExamplesLoader;
-import org.apache.hadoop.hive.ql.exec.UDF;
-
-import java.io.IOException;
-
-/**
- * A user-defined function (UDF) that counts the words in a string.
- * This avoids the materialization of intermediate results required when
- * splitting the string on whitespace and counting the split results.
- *
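The counting rule used by both the CPU and GPU paths treats any character at or below the space character as whitespace, and a run of whitespace as a single delimiter. A minimal stand-alone sketch with a few expected values (class name and inputs are illustrative):

    // Illustrative word-count rule: runs of characters <= ' ' separate words.
    public class WordCountSketch {
      static Integer countWords(String str) {
        if (str == null) {
          return null;
        }
        int numWords = 0;
        boolean spaces = true;
        for (int idx = 0; idx < str.length(); idx++) {
          boolean isSpace = str.charAt(idx) <= ' ';
          if (spaces && !isSpace) {
            numWords++;        // first character of a new word
          }
          spaces = isSpace;
        }
        return numWords;
      }

      public static void main(String[] args) {
        System.out.println(countWords("  hello   world  ")); // 2
        System.out.println(countWords("\tone\ntwo three"));  // 3
        System.out.println(countWords(""));                  // 0
        System.out.println(countWords(null));                // null
      }
    }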

- * This class demonstrates how to implement a Hive UDF with a RAPIDS - * implementation that uses custom native code. - */ -public class StringWordCount extends UDF implements RapidsUDF { - private volatile boolean isNativeCodeLoaded = false; - - /** Row-by-row implementation that executes on the CPU */ - public Integer evaluate(String str) { - if (str == null) { - return null; - } - - int numWords = 0; - // run of whitespace is considered a single delimiter - boolean spaces = true; - for (int idx = 0; idx < str.length(); idx++) { - char ch = str.charAt(idx); - if (spaces != (ch <= ' ')) { - if (spaces) { - numWords++; - } - spaces = !spaces; - } - } - return numWords; - } - - /** Columnar implementation that runs on the GPU */ - @Override - public ColumnVector evaluateColumnar(ColumnVector... args) { - // The CPU implementation takes a single string argument, so similarly - // there should only be one column argument of type STRING. - if (args.length != 1) { - throw new IllegalArgumentException("Unexpected argument count: " + args.length); - } - ColumnVector strs = args[0]; - if (!strs.getType().equals(DType.STRING)) { - throw new IllegalArgumentException("type mismatch, expected strings but found " + - strs.getType()); - } - - // Load the native code if it has not been already loaded. This is done here - // rather than in a static code block since the driver may not have the - // required CUDA environment. - NativeUDFExamplesLoader.ensureLoaded(); - - return new ColumnVector(countWords(strs.getNativeView())); - } - - private static native long countWords(long stringsView); -} diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLDecode.java b/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLDecode.java deleted file mode 100644 index 4aa1dae0af7..00000000000 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLDecode.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.udf.hive; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.DType; -import ai.rapids.cudf.Scalar; -import com.nvidia.spark.RapidsUDF; -import org.apache.hadoop.hive.ql.exec.UDF; - -import java.io.UnsupportedEncodingException; -import java.net.URLDecoder; - -/** - * A Hive user-defined function (UDF) that decodes URL-encoded strings. - * This class demonstrates how to implement a simple Hive UDF that also - * provides a RAPIDS implementation that can run on the GPU when the query - * is executed with the RAPIDS Accelerator for Apache Spark. 
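A Hive UDF such as this is registered by class name and then invoked from SQL. A minimal sketch of that flow from a Java SparkSession, assuming Hive support is enabled and the UDF jar is on the classpath (the function name and session setup are illustrative):

    import org.apache.spark.sql.SparkSession;

    // Illustrative registration and use of the Hive UDF above.
    public class UrlDecodeUsage {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("urldecode-example")
            .enableHiveSupport() // Hive UDFs go through the Hive function registry
            .getOrCreate();

        spark.sql("CREATE TEMPORARY FUNCTION urldecode AS " +
            "'com.nvidia.spark.rapids.udf.hive.URLDecode'");

        // decoded -> "Hello RAPIDS!"
        spark.sql("SELECT urldecode('Hello%20RAPIDS%21') AS decoded").show();
      }
    }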
- */ -public class URLDecode extends UDF implements RapidsUDF { - - /** Row-by-row implementation that executes on the CPU */ - public String evaluate(String s) { - String result = null; - if (s != null) { - try { - result = URLDecoder.decode(s, "utf-8"); - } catch (IllegalArgumentException ignored) { - result = s; - } catch (UnsupportedEncodingException e) { - // utf-8 is a builtin, standard encoding, so this should never happen - throw new RuntimeException(e); - } - } - return result; - } - - /** Columnar implementation that runs on the GPU */ - @Override - public ColumnVector evaluateColumnar(ColumnVector... args) { - // The CPU implementation takes a single string argument, so similarly - // there should only be one column argument of type STRING. - if (args.length != 1) { - throw new IllegalArgumentException("Unexpected argument count: " + args.length); - } - ColumnVector input = args[0]; - if (!input.getType().equals(DType.STRING)) { - throw new IllegalArgumentException("Argument type is not a string column: " + - input.getType()); - } - - // The cudf urlDecode does not convert '+' to a space, so do that as a pre-pass first. - // All intermediate results are closed to avoid leaking GPU resources. - try (Scalar plusScalar = Scalar.fromString("+"); - Scalar spaceScalar = Scalar.fromString(" "); - ColumnVector replaced = input.stringReplace(plusScalar, spaceScalar)) { - return replaced.urlDecode(); - } - } -} diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLEncode.java b/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLEncode.java deleted file mode 100644 index fbf302440c6..00000000000 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLEncode.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.udf.hive; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.DType; -import com.nvidia.spark.RapidsUDF; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorConverter; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.io.Text; - -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; - -/** - * A Hive user-defined function (UDF) that URL-encodes strings. - * This class demonstrates how to implement a Hive GenericUDF that also - * provides a RAPIDS implementation that can run on the GPU when the query - * is executed with the RAPIDS Accelerator for Apache Spark. 
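The row-by-row path below post-processes the output of java.net.URLEncoder because URLEncoder performs HTML-form encoding: spaces become '+', '*' is left unescaped, and '~' is escaped. A small stand-alone illustration of that adjustment (class name is illustrative):

    import java.io.UnsupportedEncodingException;
    import java.net.URLEncoder;

    // Shows why the UDF rewrites "+" -> "%20", "*" -> "%2A" and "%7E" -> "~"
    // after calling java.net.URLEncoder.
    public class UrlEncodeAdjustment {
      public static void main(String[] args) throws UnsupportedEncodingException {
        String raw = URLEncoder.encode("a b*~", "utf-8");
        System.out.println(raw); // a+b*%7E (form encoding)

        String adjusted = raw
            .replace("+", "%20")
            .replace("*", "%2A")
            .replace("%7E", "~");
        System.out.println(adjusted); // a%20b%2A~
      }
    }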
- */ -public class URLEncode extends GenericUDF implements RapidsUDF { - private transient PrimitiveObjectInspectorConverter.TextConverter converter; - private final Text textResult = new Text(); - - /** Standard getDisplayString method for implementing GenericUDF */ - @Override - public String getDisplayString(String[] children) { - return getStandardDisplayString("urlencode", children); - } - - /** Standard initialize method for implementing GenericUDF for a single string parameter */ - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - if (arguments.length != 1) { - throw new UDFArgumentException("One argument is supported, found: " + arguments.length); - } - if (!(arguments[0] instanceof PrimitiveObjectInspector)) { - throw new UDFArgumentException("Unsupported argument type: " + arguments[0].getTypeName()); - } - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) arguments[0]; - switch (poi.getPrimitiveCategory()) { - case STRING: - case CHAR: - case VARCHAR: - break; - default: - throw new UDFArgumentException("Unsupported primitive type: " + poi.getPrimitiveCategory()); - } - - converter = new PrimitiveObjectInspectorConverter.TextConverter(poi); - return PrimitiveObjectInspectorFactory.writableStringObjectInspector; - } - - /** Row-by-row implementation that executes on the CPU */ - @Override - public Object evaluate(GenericUDF.DeferredObject[] arguments) throws HiveException { - Text text = converter.convert(arguments[0].get()); - if (text == null) { - return null; - } - String encoded; - try { - encoded = URLEncoder.encode(text.toString(), "utf-8") - .replace("+", "%20") - .replace("*", "%2A") - .replace("%7E", "~"); - } catch (UnsupportedEncodingException e) { - // utf-8 is a builtin, standard encoding, so this should never happen - throw new RuntimeException(e); - } - textResult.set(encoded); - return textResult; - } - - /** Columnar implementation that runs on the GPU */ - @Override - public ColumnVector evaluateColumnar(ColumnVector... args) { - // The CPU implementation takes a single string argument, so similarly - // there should only be one column argument of type STRING. - if (args.length != 1) { - throw new IllegalArgumentException("Unexpected argument count: " + args.length); - } - ColumnVector input = args[0]; - if (!input.getType().equals(DType.STRING)) { - throw new IllegalArgumentException("Argument type is not a string column: " + - input.getType()); - } - - return input.urlEncode(); - } -} diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java b/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java deleted file mode 100644 index 1702b087d84..00000000000 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.udf.java; - -import ai.rapids.cudf.ColumnVector; -import com.nvidia.spark.RapidsUDF; -import org.apache.spark.sql.api.java.UDF2; -import scala.collection.mutable.WrappedArray; - -/** - * A Spark Java UDF that computes the cosine similarity between two float vectors. - * The input vectors must have matching shapes, i.e.: same number of elements. - * A null vector is supported, but null entries within the vector are not supported. - */ -public class CosineSimilarity - implements UDF2, WrappedArray, Float>, RapidsUDF { - - /** Row-by-row implementation that executes on the CPU */ - @Override - public Float call(WrappedArray v1, WrappedArray v2) { - if (v1 == null || v2 == null) { - return null; - } - if (v1.length() != v2.length()) { - throw new IllegalArgumentException("Array lengths must match: " + - v1.length() + " != " + v2.length()); - } - - double dotProduct = 0; - for (int i = 0; i < v1.length(); i++) { - float f1 = v1.apply(i); - float f2 = v2.apply(i); - dotProduct += f1 * f2; - } - double magProduct = magnitude(v1) * magnitude(v2); - return (float) (dotProduct / magProduct); - } - - private double magnitude(WrappedArray v) { - double sum = 0; - for (int i = 0; i < v.length(); i++) { - float x = v.apply(i); - sum += x * x; - } - return Math.sqrt(sum); - } - - /** Columnar implementation that processes data on the GPU */ - @Override - public ColumnVector evaluateColumnar(ColumnVector... args) { - if (args.length != 2) { - throw new IllegalArgumentException("Unexpected argument count: " + args.length); - } - - // Load the native code if it has not been already loaded. This is done here - // rather than in a static code block since the driver may not have the - // required CUDA environment. - NativeUDFExamplesLoader.ensureLoaded(); - - return new ColumnVector(cosineSimilarity(args[0].getNativeView(), args[1].getNativeView())); - } - - /** Native implementation that computes on the GPU */ - private static native long cosineSimilarity(long vectorView1, long vectorView2); -} diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/DecimalFraction.java b/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/DecimalFraction.java deleted file mode 100644 index 2b6644841dc..00000000000 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/DecimalFraction.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.udf.java; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.Scalar; -import com.nvidia.spark.RapidsUDF; -import org.apache.spark.sql.api.java.UDF1; - -import java.math.BigDecimal; - -/** - * A simple Java UDF demo for DecimalType, which extracts and returns the - * fraction part of the input Decimal data. So, the output data has the - * same precision and scale as the input one. 
- */ -public class DecimalFraction implements UDF1, RapidsUDF { - - @Override - public BigDecimal call(BigDecimal dec) throws Exception { - if (dec == null) { - return null; - } - BigDecimal integral = new BigDecimal(dec.toBigInteger()); - return dec.subtract(integral); - } - - @Override - public ColumnVector evaluateColumnar(ColumnVector... args) { - if (args.length != 1) { - throw new IllegalArgumentException("Unexpected argument count: " + args.length); - } - ColumnVector input = args[0]; - if (!input.getType().isDecimalType()) { - throw new IllegalArgumentException("Argument type is not a decimal column: " + - input.getType()); - } - - try (Scalar nullScalar = Scalar.fromNull(input.getType()); - ColumnVector nullPredicate = input.isNull(); - ColumnVector integral = input.floor(); - ColumnVector fraction = input.sub(integral, input.getType())) { - return nullPredicate.ifElse(nullScalar, fraction); - } - } -} diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/NativeUDFExamplesLoader.java b/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/NativeUDFExamplesLoader.java deleted file mode 100644 index 844f9a0d5ef..00000000000 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/NativeUDFExamplesLoader.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.udf.java; - -import ai.rapids.cudf.NativeDepsLoader; - -import java.io.IOException; - -/** Loads the native dependencies for UDF examples with a native implementation */ -public class NativeUDFExamplesLoader { - private static boolean isLoaded; - - /** Loads native UDF code if necessary */ - public static synchronized void ensureLoaded() { - if (!isLoaded) { - try { - NativeDepsLoader.loadNativeDeps(new String[]{"udfexamplesjni"}); - isLoaded = true; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - } -} diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLDecode.java b/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLDecode.java deleted file mode 100644 index b5dbfd30b91..00000000000 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLDecode.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.udf.java; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.DType; -import ai.rapids.cudf.Scalar; -import com.nvidia.spark.RapidsUDF; -import org.apache.spark.sql.api.java.UDF1; - -import java.io.UnsupportedEncodingException; -import java.net.URLDecoder; - -/** - * A Java user-defined function (UDF) that decodes URL-encoded strings. - * This class demonstrates how to implement a Java UDF that also - * provides a RAPIDS implementation that can run on the GPU when the query - * is executed with the RAPIDS Accelerator for Apache Spark. - */ -public class URLDecode implements UDF1, RapidsUDF { - /** Row-by-row implementation that executes on the CPU */ - @Override - public String call(String s) { - String result = null; - if (s != null) { - try { - result = URLDecoder.decode(s, "utf-8"); - } catch (IllegalArgumentException ignored) { - result = s; - } catch (UnsupportedEncodingException e) { - // utf-8 is a builtin, standard encoding, so this should never happen - throw new RuntimeException(e); - } - } - return result; - } - - /** Columnar implementation that runs on the GPU */ - @Override - public ColumnVector evaluateColumnar(ColumnVector... args) { - // The CPU implementation takes a single string argument, so similarly - // there should only be one column argument of type STRING. - if (args.length != 1) { - throw new IllegalArgumentException("Unexpected argument count: " + args.length); - } - ColumnVector input = args[0]; - if (!input.getType().equals(DType.STRING)) { - throw new IllegalArgumentException("Argument type is not a string column: " + - input.getType()); - } - - // The cudf urlDecode does not convert '+' to a space, so do that as a pre-pass first. - // All intermediate results are closed to avoid leaking GPU resources. - try (Scalar plusScalar = Scalar.fromString("+"); - Scalar spaceScalar = Scalar.fromString(" "); - ColumnVector replaced = input.stringReplace(plusScalar, spaceScalar)) { - return replaced.urlDecode(); - } - } -} diff --git a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLEncode.java b/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLEncode.java deleted file mode 100644 index 810d99aec5f..00000000000 --- a/udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLEncode.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.udf.java; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.DType; -import com.nvidia.spark.RapidsUDF; -import org.apache.spark.sql.api.java.UDF1; - -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; - -/** - * A Java user-defined function (UDF) that URL-encodes strings. - * This class demonstrates how to implement a Java UDF that also - * provides a RAPIDS implementation that can run on the GPU when the query - * is executed with the RAPIDS Accelerator for Apache Spark. 
- */
-public class URLEncode implements UDF1<String, String>, RapidsUDF {
-  /** Row-by-row implementation that executes on the CPU */
-  @Override
-  public String call(String s) {
-    if (s == null) {
-      return null;
-    }
-    try {
-      return URLEncoder.encode(s, "utf-8")
-          .replace("+", "%20")
-          .replace("*", "%2A")
-          .replace("%7E", "~");
-    } catch (UnsupportedEncodingException e) {
-      // utf-8 is a builtin, standard encoding, so this should never happen
-      throw new RuntimeException(e);
-    }
-  }
-
-  /** Columnar implementation that runs on the GPU */
-  @Override
-  public ColumnVector evaluateColumnar(ColumnVector... args) {
-    // The CPU implementation takes a single string argument, so similarly
-    // there should only be one column argument of type STRING.
-    if (args.length != 1) {
-      throw new IllegalArgumentException("Unexpected argument count: " + args.length);
-    }
-    ColumnVector input = args[0];
-    if (!input.getType().equals(DType.STRING)) {
-      throw new IllegalArgumentException("Argument type is not a string column: " +
-          input.getType());
-    }
-
-    return input.urlEncode();
-  }
-}
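For completeness, the Java UDF variants above are meant to be registered through Spark's UDF registration API rather than through Hive DDL. A minimal sketch of that usage, assuming the UDF jar is on the classpath (function names, session setup, and the sample query are illustrative):

    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.DataTypes;

    import com.nvidia.spark.rapids.udf.java.CosineSimilarity;
    import com.nvidia.spark.rapids.udf.java.URLDecode;
    import com.nvidia.spark.rapids.udf.java.URLEncode;

    // Illustrative registration of the Java UDF examples.
    public class JavaUdfRegistration {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("java-udf-examples")
            .getOrCreate();

        spark.udf().register("urldecode", new URLDecode(), DataTypes.StringType);
        spark.udf().register("urlencode", new URLEncode(), DataTypes.StringType);
        spark.udf().register("cosine_similarity", new CosineSimilarity(), DataTypes.FloatType);

        // With the RAPIDS Accelerator installed, the plugin can pick up the
        // evaluateColumnar implementations; otherwise Spark uses the CPU call() path.
        spark.sql("SELECT urlencode('Hello World!') AS encoded, " +
                  "urldecode('Hello%20World%21') AS decoded").show();
      }
    }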