Skip to content

Commit

Permalink
Move main part of udf-examples to the external repository spark-rapids-examples
Browse files Browse the repository at this point in the history

Signed-off-by: Chong Gao <res_life@163.com>
  • Loading branch information
Chong Gao committed Jan 24, 2022
1 parent 4747182 commit a2ca9e0
Show file tree
Hide file tree
Showing 25 changed files with 8 additions and 1,569 deletions.
6 changes: 0 additions & 6 deletions docs/additional-functionality/rapids-udfs.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,6 @@ decodes URL-encoded strings using the
- [URLEncode](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLEncode.java)
URL-encodes strings using the
[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
- [CosineSimilarity](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java)
computes the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
between two float vectors using [native code](../../udf-examples/src/main/cpp/src)

### Hive UDF Examples

Expand All @@ -168,9 +165,6 @@ to decode URL-encoded strings
implements a Hive generic UDF using the
[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
to URL-encode strings
- [StringWordCount](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/StringWordCount.java)
implements a Hive simple UDF using
[native code](../../udf-examples/src/main/cpp/src) to count words in strings


## GPU Support for Pandas UDF
Expand Down
6 changes: 1 addition & 5 deletions integration_tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -35,10 +35,6 @@ def pytest_addoption(parser):
parser.addoption(
"--cudf_udf", action='store_true', default=False, help="if true enable cudf_udf test"
)
parser.addoption(
"--rapids_udf_example_native", action='store_true', default=False,
help="if true enable tests for RAPIDS UDF examples with native code"
)
parser.addoption(
"--test_type", action='store', default="developer",
help="the type of tests that are being run to help check all the correct tests are run - developer, pre-commit, or nightly"
Expand Down
3 changes: 1 addition & 2 deletions integration_tests/pytest.ini
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
; Copyright (c) 2020-2021, NVIDIA CORPORATION.
; Copyright (c) 2020-2022, NVIDIA CORPORATION.
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
Expand All @@ -22,7 +22,6 @@ markers =
limit(num_rows): Limit the number of rows that will be checked in a result
qarun: Mark qa test
cudf_udf: Mark udf cudf test
rapids_udf_example_native: test UDFs that require custom cuda compilation
validate_execs_in_gpu_plan([execs]): Exec class names to validate they exist in the GPU plan.
shuffle_test: Mark to include test in the RAPIDS Shuffle Manager
premerge_ci_1: Mark test that will run in first k8s pod in case of parallel build premerge job
Expand Down
9 changes: 1 addition & 8 deletions integration_tests/src/main/python/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -321,10 +321,3 @@ def enable_cudf_udf(request):
if not enable_udf_cudf:
# cudf_udf tests are not required for any test runs
pytest.skip("cudf_udf not configured to run")

@pytest.fixture(scope="session")
def enable_rapids_udf_example_native(request):
native_enabled = request.config.getoption("rapids_udf_example_native")
if not native_enabled:
# udf_example_native tests are not required for any test runs
pytest.skip("rapids_udf_example_native is not configured to run")
3 changes: 1 addition & 2 deletions integration_tests/src/main/python/marks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -23,7 +23,6 @@
limit = pytest.mark.limit
qarun = pytest.mark.qarun
cudf_udf = pytest.mark.cudf_udf
rapids_udf_example_native = pytest.mark.rapids_udf_example_native
shuffle_test = pytest.mark.shuffle_test
nightly_gpu_mem_consuming_case = pytest.mark.nightly_gpu_mem_consuming_case
nightly_host_mem_consuming_case = pytest.mark.nightly_host_mem_consuming_case
58 changes: 1 addition & 57 deletions integration_tests/src/main/python/rapids_udf_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -16,7 +16,6 @@

from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql
from data_gen import *
from marks import rapids_udf_example_native
from spark_session import with_spark_session
from pyspark.sql.utils import AnalysisException
from conftest import skip_unless_precommit_tests
Expand Down Expand Up @@ -58,26 +57,6 @@ def evalfn(spark):
"hive_generic_udf_test_table",
"SELECT urlencode(s) FROM hive_generic_udf_test_table")

def evalfn_decimal(spark):
load_hive_udf_or_skip_test(spark, "fraction", "com.nvidia.spark.rapids.udf.hive.DecimalFraction")
return gen_df(spark, [["dec", DecimalGen(38, 18)]])
assert_gpu_and_cpu_are_equal_sql(
evalfn_decimal,
"hive_generic_udf_test_table",
"SELECT fraction(dec) FROM hive_generic_udf_test_table")

@rapids_udf_example_native
def test_hive_simple_udf_native(enable_rapids_udf_example_native):
with_spark_session(skip_if_no_hive)
data_gens = [["s", StringGen('.{0,30}')]]
def evalfn(spark):
load_hive_udf_or_skip_test(spark, "wordcount", "com.nvidia.spark.rapids.udf.hive.StringWordCount")
return gen_df(spark, data_gens)
assert_gpu_and_cpu_are_equal_sql(
evalfn,
"hive_native_udf_test_table",
"SELECT wordcount(s) FROM hive_native_udf_test_table")

def load_java_udf_or_skip_test(spark, udfname, udfclass, udf_return_type=None):
drop_udf(spark, udfname)
try:
Expand All @@ -96,38 +75,3 @@ def evalfn(spark):
load_java_udf_or_skip_test(spark, 'urlencode', 'com.nvidia.spark.rapids.udf.java.URLEncode')
return unary_op_df(spark, StringGen('.{0,30}')).selectExpr("urlencode(a)")
assert_gpu_and_cpu_are_equal_collect(evalfn)

def test_java_decimal_fraction():
def evalfn(spark):
from pyspark.sql.types import DecimalType
load_java_udf_or_skip_test(spark, 'fraction',
'com.nvidia.spark.rapids.udf.java.DecimalFraction')
load_java_udf_or_skip_test(spark, 'fraction_dec64_s10',
'com.nvidia.spark.rapids.udf.java.DecimalFraction',
DecimalType(18, 10))
load_java_udf_or_skip_test(spark, 'fraction_dec32_s3',
'com.nvidia.spark.rapids.udf.java.DecimalFraction',
DecimalType(8, 3))
return three_col_df(spark, DecimalGen(38, 18), DecimalGen(18, 10), DecimalGen(8, 3)
).selectExpr("fraction(a)", "fraction_dec64_s10(b)", "fraction_dec32_s3(c)")
assert_gpu_and_cpu_are_equal_collect(evalfn)

@rapids_udf_example_native
def test_java_cosine_similarity_reasonable_range(enable_rapids_udf_example_native):
def evalfn(spark):
class RangeFloatGen(FloatGen):
def start(self, rand):
self._start(rand, lambda: rand.uniform(-1000.0, 1000.0))
load_java_udf_or_skip_test(spark, "cosine_similarity", "com.nvidia.spark.rapids.udf.java.CosineSimilarity")
arraygen = ArrayGen(RangeFloatGen(nullable=False, no_nans=True, special_cases=[]), min_length=8, max_length=8)
df = binary_op_df(spark, arraygen)
return df.selectExpr("cosine_similarity(a, b)")
assert_gpu_and_cpu_are_equal_collect(evalfn)

@rapids_udf_example_native
def test_java_cosine_similarity_with_nans(enable_rapids_udf_example_native):
def evalfn(spark):
load_java_udf_or_skip_test(spark, "cosine_similarity", "com.nvidia.spark.rapids.udf.java.CosineSimilarity")
arraygen = ArrayGen(FloatGen(nullable=False), min_length=8, max_length=8)
return binary_op_df(spark, arraygen).selectExpr("cosine_similarity(a, b)")
assert_gpu_and_cpu_are_equal_collect(evalfn)
15 changes: 1 addition & 14 deletions udf-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,4 @@ user-defined functions. See the
on how RAPIDS accelerated UDFs work and guidelines for creating them.

## Building the Native Code Examples

Some of the UDF examples use native code in their implementation.
Building the native code requires a libcudf build environment, so these
examples do not build by default. The `udf-native-examples` Maven profile
can be used to include the native UDF examples in the build, i.e.: specify
`-Pudf-native-examples` on the `mvn` command-line.

## Creating a libcudf Build Environment

The `Dockerfile` in this directory can be used to setup a Docker image that
provides a libcudf build environment. This repository will either need to be
cloned or mounted into a container using that Docker image.
The `Dockerfile` contains build arguments to control the Linux version,
CUDA version, and other settings. See the top of the `Dockerfile` for details.
Please refer to the [spark-rapids-examples](https://github.com/NVIDIA/spark-rapids-examples) repository.
87 changes: 2 additions & 85 deletions udf-examples/pom.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright (c) 2020-2021, NVIDIA CORPORATION.
Copyright (c) 2020-2022, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,13 +32,6 @@
<version>22.02.0-SNAPSHOT</version>

<properties>
<udf.native.build.path>${project.build.directory}/cpp-build</udf.native.build.path>
<BUILD_UDF_BENCHMARKS>OFF</BUILD_UDF_BENCHMARKS>
<CMAKE_CXX_FLAGS/>
<GPU_ARCHS>ALL</GPU_ARCHS>
<PER_THREAD_DEFAULT_STREAM>ON</PER_THREAD_DEFAULT_STREAM>
<CPP_PARALLEL_LEVEL>10</CPP_PARALLEL_LEVEL>
<CUDF_ENABLE_ARROW_S3>OFF</CUDF_ENABLE_ARROW_S3>
<target.classifier/>
</properties>

Expand All @@ -58,14 +51,8 @@
</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<!--
This should depend on rapids-4-spark_${scala.binary.version} instead, but that dependency
only exists after the package phase. External projects should depend on
rapids-4-spark_${scala.binary.version}.
-->
<artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
<artifactId>rapids-4-spark_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
<scope>provided</scope>
</dependency>
</dependencies>
Expand Down Expand Up @@ -184,75 +171,5 @@
</dependency>
</dependencies>
</profile>
<profile>
<id>udf-native-examples</id>
<build>
<resources>
<resource>
<directory>${project.build.directory}/native-deps/</directory>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>maven-antrun-plugin</artifactId>
<executions>
<execution>
<id>cmake</id>
<phase>validate</phase>
<configuration>
<target>
<mkdir dir="${udf.native.build.path}"/>
<exec dir="${udf.native.build.path}"
failonerror="true"
executable="cmake">
<arg value="${basedir}/src/main/cpp"/>
<arg value="-DBUILD_UDF_BENCHMARKS=${BUILD_UDF_BENCHMARKS}"/>
<arg value="-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"/>
<arg value="-DGPU_ARCHS=${GPU_ARCHS}"/>
<arg value="-DPER_THREAD_DEFAULT_STREAM=${PER_THREAD_DEFAULT_STREAM}"/>
<arg value="-DCUDF_ENABLE_ARROW_S3=${CUDF_ENABLE_ARROW_S3}"/>
</exec>
<exec failonerror="true"
executable="cmake">
<arg value="--build"/>
<arg value="${udf.native.build.path}"/>
<arg value="-j${CPP_PARALLEL_LEVEL}"/>
<arg value="-v"/>
</exec>
</target>
</configuration>
<goals>
<goal>run</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<executions>
<execution>
<id>copy-native-libs</id>
<phase>validate</phase>
<goals>
<goal>copy-resources</goal>
</goals>
<configuration>
<overwrite>true</overwrite>
<outputDirectory>${project.build.directory}/native-deps/${os.arch}/${os.name}</outputDirectory>
<resources>
<resource>
<directory>${udf.native.build.path}</directory>
<includes>
<include>libudfexamplesjni.so</include>
</includes>
</resource>
</resources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>
Loading

0 comments on commit a2ca9e0

Please sign in to comment.