Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add NaNs to Data Generators In Floating-Point Testing [databricks] #9334

Merged
merged 9 commits into from
Oct 2, 2023
1 change: 1 addition & 0 deletions integration_tests/src/main/python/array_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
DecimalGen(precision=12, scale=2, nullable=False),
DecimalGen(precision=20, scale=2, nullable=False)]

# This non-nans version is only used for Spark version < 3.1.3
no_neg_zero_all_basic_gens_no_nans = [byte_gen, short_gen, int_gen, long_gen,
# -0.0 cannot work because of -0.0 == 0.0 in cudf for distinct
FloatGen(special_cases=[], no_nans=True),
Expand Down
4 changes: 0 additions & 4 deletions integration_tests/src/main/python/data_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1067,12 +1067,8 @@ def gen_scalars_for_sql(data_gen, count, seed=0, force_no_nulls=False):
# all of the basic types in a single struct
all_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens)])

all_basic_struct_gen_no_nan = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens_no_nan)])

struct_array_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(single_level_array_gens)])

struct_array_gen_no_nans = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(single_level_array_gens_no_nan)])

# Some struct gens, but not all because of nesting
nonempty_struct_gens_sample = [all_basic_struct_gen,
StructGen([['child0', byte_gen], ['child1', all_basic_struct_gen]]),
Expand Down
85 changes: 21 additions & 64 deletions integration_tests/src/main/python/hash_aggregate_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,25 +167,16 @@
_grpkey_floats_with_nulls_and_nans
]

# List of schemas with no NaNs
_init_list_no_nans = [
_longs_with_nulls,
_longs_with_no_nulls,
_grpkey_longs_with_nulls,
_grpkey_dbls_with_nulls,
_grpkey_floats_with_nulls,
_grpkey_strings_with_nulls,
_grpkey_nulls,
_grpkey_strings_with_extra_nulls]

# List of schemas with NaNs included
_init_list_with_nans_and_no_nans = [
_init_list = [
_longs_with_nulls,
_longs_with_no_nulls,
_grpkey_longs_with_nulls,
_grpkey_dbls_with_nulls,
_grpkey_floats_with_nulls,
_grpkey_strings_with_nulls,
_grpkey_strings_with_extra_nulls,
_grpkey_nulls,
_grpkey_floats_with_nulls_and_nans]

# grouping decimals with nulls
Expand All @@ -197,7 +188,7 @@
('b', DecimalGen(nullable=False)),
('c', DecimalGen(nullable=False))]

_init_list_with_nans_and_no_nans_with_decimals = _init_list_with_nans_and_no_nans + [
_init_list_with_decimals = _init_list + [
_decimals_with_nulls, _decimals_with_no_nulls]

# Used to test ANSI-mode fallback
Expand Down Expand Up @@ -303,15 +294,7 @@ def get_params(init_list, marked_params=[]):
('c', _decimal_gen_sum_38_neg10)]


_init_list_no_nans_with_decimal = _init_list_no_nans + [
_grpkey_small_decimals]

_init_list_no_nans_with_decimalbig = _init_list_no_nans + [
_grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals,
_grpkey_short_big_decimals, _grpkey_short_very_big_decimals,
_grpkey_short_very_big_neg_scale_decimals]

_init_list_with_nans_and_no_nans_with_decimalbig = _init_list_with_nans_and_no_nans + [
_init_list_with_decimalbig = _init_list + [
_grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals,
_grpkey_short_big_decimals, _grpkey_short_very_big_decimals,
_grpkey_short_very_big_neg_scale_decimals]
Expand Down Expand Up @@ -378,7 +361,7 @@ def test_computation_in_grpby_columns():
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_grpby_sum(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
Expand Down Expand Up @@ -420,7 +403,7 @@ def test_hash_reduction_sum_full_decimal(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans + [_grpkey_short_mid_decimals,
@pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals,
_grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_sum_full_decimals], ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_grpby_avg(data_gen, conf):
Expand Down Expand Up @@ -451,15 +434,15 @@ def test_hash_avg_nulls_partial_only(data_gen):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn)
def test_intersectAll(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, length=100)))

@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn)
def test_exceptAll(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b')))
Expand All @@ -477,7 +460,7 @@ def test_exceptAll(data_gen):
('b', _pivot_gen_128bit),
('c', decimal_gen_128bit)]

_pivot_gens_with_decimals = _init_list_with_nans_and_no_nans + [
_pivot_gens_with_decimals = _init_list + [
_grpkey_small_decimals, _pivot_big_decimals, _grpkey_short_mid_decimals,
_pivot_short_big_decimals, _grpkey_short_very_big_decimals,
_grpkey_short_very_big_neg_scale_decimals]
Expand All @@ -497,20 +480,7 @@ def test_hash_grpby_pivot(data_gen, conf):
@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_grpby_pivot_without_nans(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: gen_df(spark, data_gen, length=100)
.groupby('a')
.pivot('b')
.agg(f.sum('c')),
conf=conf)

@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_multiple_grpby_pivot(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
Expand All @@ -523,22 +493,9 @@ def test_hash_multiple_grpby_pivot(data_gen, conf):
@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_reduction_pivot_without_nans(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: gen_df(spark, data_gen, length=100)
.groupby()
.pivot('b')
.agg(f.sum('c')),
conf=conf)

@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_reduction_pivot_with_nans(data_gen, conf):
def test_hash_reduction_pivot(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: gen_df(spark, data_gen, length=100)
.groupby()
Expand Down Expand Up @@ -943,7 +900,7 @@ def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback()
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_multiple_mode_query(data_gen, conf):
print_params(data_gen)
Expand All @@ -965,7 +922,7 @@ def test_hash_multiple_mode_query(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs),
ids=idfn)
def test_hash_multiple_mode_query_avg_distincts(data_gen, conf):
Expand All @@ -978,7 +935,7 @@ def test_hash_multiple_mode_query_avg_distincts(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf):
local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'})
Expand All @@ -1001,7 +958,7 @@ def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_query_max_with_multiple_distincts(data_gen, conf):
local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'})
Expand All @@ -1015,7 +972,7 @@ def test_hash_query_max_with_multiple_distincts(data_gen, conf):
conf=local_conf)

@ignore_order
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_count_with_filter(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
Expand All @@ -1027,7 +984,7 @@ def test_hash_count_with_filter(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_multiple_filters(data_gen, conf):
assert_gpu_and_cpu_are_equal_sql(
Expand Down Expand Up @@ -1784,7 +1741,7 @@ def do_it(spark):
@ignore_order(local=True)
@approximate_float
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans_with_decimals, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimals, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_std_variance(data_gen, conf):
local_conf = copy_and_update(conf, {
Expand Down Expand Up @@ -1850,7 +1807,7 @@ def test_std_variance_nulls(data_gen, conf, ansi_enabled):
'StddevPop', 'StddevSamp', 'VariancePop', 'VarianceSamp',
'SortArray', 'Alias', 'Literal', 'Count',
'AggregateExpression', 'ProjectExec')
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
@pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn)
@pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn)
Expand Down
2 changes: 1 addition & 1 deletion integration_tests/src/main/python/hashing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

_xxhash_fallback_gens = single_level_array_gens + nested_array_gens_sample + [
all_basic_struct_gen,
struct_array_gen_no_nans,
struct_array_gen,
_struct_of_xxhash_gens]
if is_before_spark_320():
_xxhash_fallback_gens += [float_gen, double_gen]
Expand Down
6 changes: 3 additions & 3 deletions integration_tests/src/main/python/window_function_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,15 @@
('a', IntegerGen()),
('b', LongGen(nullable=True))]

part_and_order_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]),
part_and_order_gens = [long_gen, DoubleGen(special_cases=[]),
jlowe marked this conversation as resolved.
Show resolved Hide resolved
string_gen, boolean_gen, timestamp_gen, DecimalGen(precision=18, scale=1),
DecimalGen(precision=38, scale=1)]

running_part_and_order_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]),
running_part_and_order_gens = [long_gen, DoubleGen(special_cases=[]),
string_gen, byte_gen, timestamp_gen, DecimalGen(precision=18, scale=1),
DecimalGen(precision=38, scale=1)]

lead_lag_data_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]),
lead_lag_data_gens = [long_gen, DoubleGen(special_cases=[]),
boolean_gen, timestamp_gen, string_gen, DecimalGen(precision=18, scale=3),
DecimalGen(precision=38, scale=4),
StructGen(children=[
Expand Down