Wrap scalar generation into spark session in integration test #9405

Merged · 7 commits · Oct 18, 2023
6 changes: 6 additions & 0 deletions integration_tests/README.md
@@ -454,6 +454,12 @@ The marks you care about are all in marks.py
For the most part you can ignore this file. It provides the underlying Spark session to operations that need it, but most tests should interact with
it through `asserts.py`.

All data generation and Spark function calls should occur within a Spark session. Typically
this is done by passing a lambda to functions in `asserts.py` such as
`assert_gpu_and_cpu_are_equal_collect`. However, scalar generation such as `gen_scalars` may
need to be wrapped in `with_cpu_session`, because negative-scale decimals can hit problems
when `f.lit` is called outside of `with_spark_session`.
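
As a minimal sketch of the pattern (assuming the `DecimalGen`, `gen_scalar`, and
`with_cpu_session` helpers from `data_gen.py` and `spark_session.py`; the negative-scale
decimal generator here is illustrative):

```python
from data_gen import DecimalGen, gen_scalar
from spark_session import with_cpu_session

# f.lit on a negative-scale decimal needs an active Spark session, so
# generate the scalar inside with_cpu_session rather than at import time.
dec_gen = DecimalGen(precision=7, scale=-3)
scalar = with_cpu_session(lambda spark: gen_scalar(dec_gen, force_no_nulls=True))
```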

## Guidelines for Testing

When support for a new operator is added to the Rapids Accelerator for Spark, or when an existing operator is extended
6 changes: 4 additions & 2 deletions integration_tests/src/main/python/arithmetic_ops_test.py
@@ -941,7 +941,8 @@ def test_columnar_pow(data_gen):
@pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn)
def test_least(data_gen):
num_cols = 20
s1 = gen_scalar(data_gen, force_no_nulls=not isinstance(data_gen, NullGen))
s1 = with_cpu_session(
lambda spark: gen_scalar(data_gen, force_no_nulls=not isinstance(data_gen, NullGen)))
# we want lots of nulls
gen = StructGen([('_c' + str(x), data_gen.copy_special_case(None, weight=100.0))
for x in range(0, num_cols)], nullable=False)
@@ -956,7 +957,8 @@ def test_least(data_gen):
@pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn)
def test_greatest(data_gen):
num_cols = 20
s1 = gen_scalar(data_gen, force_no_nulls=not isinstance(data_gen, NullGen))
s1 = with_cpu_session(
lambda spark: gen_scalar(data_gen, force_no_nulls=not isinstance(data_gen, NullGen)))
# we want lots of nulls
gen = StructGen([('_c' + str(x), data_gen.copy_special_case(None, weight=100.0))
for x in range(0, num_cols)], nullable=False)
5 changes: 3 additions & 2 deletions integration_tests/src/main/python/array_test.py
@@ -172,7 +172,8 @@ def test_array_item_ansi_not_fail_all_null_data():
StructGen([['child0', StructGen([['child01', IntegerGen()]])], ['child1', string_gen], ['child2', float_gen]], nullable=False),
StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]], nullable=False)], ids=idfn)
def test_make_array(data_gen):
(s1, s2) = gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).selectExpr(
'array(null)',
@@ -213,7 +214,7 @@ def test_orderby_array_of_structs(data_gen):
string_gen, boolean_gen, date_gen, timestamp_gen], ids=idfn)
def test_array_contains(data_gen):
arr_gen = ArrayGen(data_gen)
literal = gen_scalar(data_gen, force_no_nulls=True)
literal = with_cpu_session(lambda spark: gen_scalar(data_gen, force_no_nulls=True))

def get_input(spark):
return two_col_df(spark, arr_gen, data_gen)
16 changes: 8 additions & 8 deletions integration_tests/src/main/python/ast_test.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -75,7 +75,7 @@ def test_literal(spark_tmp_path, data_gen):
# Write data to Parquet so Spark generates a plan using just the count of the data.
data_path = spark_tmp_path + '/AST_TEST_DATA'
with_cpu_session(lambda spark: gen_df(spark, [("a", IntegerGen())]).write.parquet(data_path))
scalar = gen_scalar(data_gen, force_no_nulls=True)
scalar = with_cpu_session(lambda spark: gen_scalar(data_gen, force_no_nulls=True))
assert_gpu_ast(is_supported=True,
func=lambda spark: spark.read.parquet(data_path).select(scalar))

@@ -234,7 +234,7 @@ def test_expm1(data_descr):

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
def test_eq(data_descr):
(s1, s2) = gen_scalars(data_descr[0], 2)
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
lambda df: df.select(
f.col('a') == s1,
@@ -243,7 +243,7 @@ def test_ne(data_descr):

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
def test_ne(data_descr):
(s1, s2) = gen_scalars(data_descr[0], 2)
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
lambda df: df.select(
f.col('a') != s1,
@@ -252,7 +252,7 @@ def test_lt(data_descr):

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
def test_lt(data_descr):
(s1, s2) = gen_scalars(data_descr[0], 2)
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
lambda df: df.select(
f.col('a') < s1,
@@ -261,7 +261,7 @@ def test_lte(data_descr):

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
def test_lte(data_descr):
(s1, s2) = gen_scalars(data_descr[0], 2)
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
lambda df: df.select(
f.col('a') <= s1,
@@ -270,7 +270,7 @@ def test_gt(data_descr):

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
def test_gt(data_descr):
(s1, s2) = gen_scalars(data_descr[0], 2)
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
lambda df: df.select(
f.col('a') > s1,
@@ -279,7 +279,7 @@ def test_gte(data_descr):

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
def test_gte(data_descr):
(s1, s2) = gen_scalars(data_descr[0], 2)
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
lambda df: df.select(
f.col('a') >= s1,
42 changes: 28 additions & 14 deletions integration_tests/src/main/python/cmp_test.py
@@ -22,7 +22,8 @@

@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn)
def test_eq(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -35,7 +36,8 @@ def test_eq(data_gen):
@pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0')
def test_eq_for_interval():
def test_func(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -53,7 +55,8 @@ def test_func(data_gen):

@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn)
def test_eq_ns(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -66,7 +69,8 @@ def test_eq_ns(data_gen):
@pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0')
def test_eq_ns_for_interval():
data_gen = DayTimeIntervalGen()
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -78,7 +82,8 @@ def test_eq_ns_for_interval():

@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn)
def test_ne(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -91,7 +96,8 @@ def test_ne(data_gen):
@pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0')
def test_ne_for_interval():
def test_func(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -109,7 +115,8 @@ def test_func(data_gen):

@pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn)
def test_lt(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -122,7 +129,8 @@ def test_lt(data_gen):
@pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0')
def test_lt_for_interval():
def test_func(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -140,7 +148,8 @@ def test_func(data_gen):

@pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn)
def test_lte(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -153,7 +162,8 @@ def test_lte(data_gen):
@pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0')
def test_lte_for_interval():
def test_func(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -171,7 +181,8 @@ def test_func(data_gen):

@pytest.mark.parametrize('data_gen', orderable_gens, ids=idfn)
def test_gt(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -184,7 +195,8 @@ def test_gt(data_gen):
@pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0')
def test_gt_interval():
def test_func(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -202,7 +214,8 @@ def test_func(data_gen):

@pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn)
def test_gte(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
@@ -215,7 +228,8 @@ def test_gte(data_gen):
@pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0')
def test_gte_for_interval():
def test_func(data_gen):
(s1, s2) = gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).select(
16 changes: 8 additions & 8 deletions integration_tests/src/main/python/collection_ops_test.py
@@ -45,8 +45,8 @@ def test_concat_list(data_gen):
@pytest.mark.parametrize('dg', non_nested_array_gens, ids=idfn)
def test_concat_double_list_with_lit(dg):
data_gen = ArrayGen(dg, max_length=2)
array_lit = gen_scalar(data_gen)
array_lit2 = gen_scalar(data_gen)
array_lit = with_cpu_session(lambda spark: gen_scalar(data_gen))
array_lit2 = with_cpu_session(lambda spark: gen_scalar(data_gen))
assert_gpu_and_cpu_are_equal_collect(
lambda spark: binary_op_df(spark, data_gen).select(
f.concat(f.col('a'),
@@ -67,8 +67,8 @@ def test_concat_double_list_with_lit(dg):

@pytest.mark.parametrize('data_gen', non_nested_array_gens, ids=idfn)
def test_concat_list_with_lit(data_gen):
lit_col1 = f.lit(gen_scalar(data_gen)).cast(data_gen.data_type)
lit_col2 = f.lit(gen_scalar(data_gen)).cast(data_gen.data_type)
lit_col1 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type)
lit_col2 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type)

assert_gpu_and_cpu_are_equal_collect(
lambda spark: binary_op_df(spark, data_gen).select(
@@ -78,7 +78,7 @@ def test_concat_list_with_lit(data_gen):

def test_concat_string():
gen = mk_str_gen('.{0,5}')
(s1, s2) = gen_scalars(gen, 2, force_no_nulls=True)
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(gen, 2, force_no_nulls=True))
assert_gpu_and_cpu_are_equal_collect(
lambda spark: binary_op_df(spark, gen).select(
f.concat(),
@@ -106,8 +106,8 @@ def test_map_concat(data_gen):

@pytest.mark.parametrize('data_gen', map_gens_sample + decimal_64_map_gens + decimal_128_map_gens, ids=idfn)
def test_map_concat_with_lit(data_gen):
lit_col1 = f.lit(gen_scalar(data_gen)).cast(data_gen.data_type)
lit_col2 = f.lit(gen_scalar(data_gen)).cast(data_gen.data_type)
lit_col1 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type)
lit_col2 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type)
assert_gpu_and_cpu_are_equal_collect(
lambda spark: binary_op_df(spark, data_gen).select(
f.map_concat(f.col('a'), f.col('b'), lit_col1),
@@ -150,7 +150,7 @@ def test_sort_array(data_gen):

@pytest.mark.parametrize('data_gen', _sort_array_gens, ids=idfn)
def test_sort_array_lit(data_gen):
array_lit = gen_scalar(data_gen)
array_lit = with_cpu_session(lambda spark: gen_scalar(data_gen))
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, data_gen, length=10).select(
f.sort_array(f.lit(array_lit), True),