Skip to content

Commit

Permalink
Add NaNs to Data Generators In Floating-Point Testing [databricks] (#9334)
Browse files Browse the repository at this point in the history

* part_and_order_gens with nans

* running_part_and_order_gens

* lead_lag_data_gens

* removed no_nans gens

* renamed generator from with_nans_with_decimal* to with_nan_and_decimal*

* added missing gens to init_list

* addressed review comments

* Signing off

Signed-off-by: Raza Jafri <rjafri@nvidia.com>

---------

Signed-off-by: Raza Jafri <rjafri@nvidia.com>
  • Loading branch information
razajafri authored Oct 2, 2023
1 parent 15511b6 commit 3ce1e9e
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 72 deletions.
1 change: 1 addition & 0 deletions integration_tests/src/main/python/array_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
DecimalGen(precision=12, scale=2, nullable=False),
DecimalGen(precision=20, scale=2, nullable=False)]

# This non-nans version is only used for Spark version < 3.1.3
no_neg_zero_all_basic_gens_no_nans = [byte_gen, short_gen, int_gen, long_gen,
# -0.0 cannot work because of -0.0 == 0.0 in cudf for distinct
FloatGen(special_cases=[], no_nans=True),
Expand Down
4 changes: 0 additions & 4 deletions integration_tests/src/main/python/data_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1067,12 +1067,8 @@ def gen_scalars_for_sql(data_gen, count, seed=0, force_no_nulls=False):
# all of the basic types in a single struct
all_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens)])

all_basic_struct_gen_no_nan = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens_no_nan)])

struct_array_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(single_level_array_gens)])

struct_array_gen_no_nans = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(single_level_array_gens_no_nan)])

# Some struct gens, but not all because of nesting
nonempty_struct_gens_sample = [all_basic_struct_gen,
StructGen([['child0', byte_gen], ['child1', all_basic_struct_gen]]),
Expand Down
85 changes: 21 additions & 64 deletions integration_tests/src/main/python/hash_aggregate_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,25 +167,16 @@
_grpkey_floats_with_nulls_and_nans
]

# List of schemas with no NaNs
_init_list_no_nans = [
_longs_with_nulls,
_longs_with_no_nulls,
_grpkey_longs_with_nulls,
_grpkey_dbls_with_nulls,
_grpkey_floats_with_nulls,
_grpkey_strings_with_nulls,
_grpkey_nulls,
_grpkey_strings_with_extra_nulls]

# List of schemas with NaNs included
_init_list_with_nans_and_no_nans = [
_init_list = [
_longs_with_nulls,
_longs_with_no_nulls,
_grpkey_longs_with_nulls,
_grpkey_dbls_with_nulls,
_grpkey_floats_with_nulls,
_grpkey_strings_with_nulls,
_grpkey_strings_with_extra_nulls,
_grpkey_nulls,
_grpkey_floats_with_nulls_and_nans]

# grouping decimals with nulls
Expand All @@ -197,7 +188,7 @@
('b', DecimalGen(nullable=False)),
('c', DecimalGen(nullable=False))]

_init_list_with_nans_and_no_nans_with_decimals = _init_list_with_nans_and_no_nans + [
_init_list_with_decimals = _init_list + [
_decimals_with_nulls, _decimals_with_no_nulls]

# Used to test ANSI-mode fallback
Expand Down Expand Up @@ -303,15 +294,7 @@ def get_params(init_list, marked_params=[]):
('c', _decimal_gen_sum_38_neg10)]


_init_list_no_nans_with_decimal = _init_list_no_nans + [
_grpkey_small_decimals]

_init_list_no_nans_with_decimalbig = _init_list_no_nans + [
_grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals,
_grpkey_short_big_decimals, _grpkey_short_very_big_decimals,
_grpkey_short_very_big_neg_scale_decimals]

_init_list_with_nans_and_no_nans_with_decimalbig = _init_list_with_nans_and_no_nans + [
_init_list_with_decimalbig = _init_list + [
_grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals,
_grpkey_short_big_decimals, _grpkey_short_very_big_decimals,
_grpkey_short_very_big_neg_scale_decimals]
Expand Down Expand Up @@ -378,7 +361,7 @@ def test_computation_in_grpby_columns():
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_grpby_sum(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
Expand Down Expand Up @@ -420,7 +403,7 @@ def test_hash_reduction_sum_full_decimal(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans + [_grpkey_short_mid_decimals,
@pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals,
_grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_sum_full_decimals], ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_grpby_avg(data_gen, conf):
Expand Down Expand Up @@ -451,15 +434,15 @@ def test_hash_avg_nulls_partial_only(data_gen):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn)
def test_intersectAll(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, length=100)))

@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn)
def test_exceptAll(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b')))
Expand All @@ -477,7 +460,7 @@ def test_exceptAll(data_gen):
('b', _pivot_gen_128bit),
('c', decimal_gen_128bit)]

_pivot_gens_with_decimals = _init_list_with_nans_and_no_nans + [
_pivot_gens_with_decimals = _init_list + [
_grpkey_small_decimals, _pivot_big_decimals, _grpkey_short_mid_decimals,
_pivot_short_big_decimals, _grpkey_short_very_big_decimals,
_grpkey_short_very_big_neg_scale_decimals]
Expand All @@ -497,20 +480,7 @@ def test_hash_grpby_pivot(data_gen, conf):
@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_grpby_pivot_without_nans(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: gen_df(spark, data_gen, length=100)
.groupby('a')
.pivot('b')
.agg(f.sum('c')),
conf=conf)

@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_multiple_grpby_pivot(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
Expand All @@ -523,22 +493,9 @@ def test_hash_multiple_grpby_pivot(data_gen, conf):
@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_reduction_pivot_without_nans(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: gen_df(spark, data_gen, length=100)
.groupby()
.pivot('b')
.agg(f.sum('c')),
conf=conf)

@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_reduction_pivot_with_nans(data_gen, conf):
def test_hash_reduction_pivot(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: gen_df(spark, data_gen, length=100)
.groupby()
Expand Down Expand Up @@ -943,7 +900,7 @@ def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback()
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_multiple_mode_query(data_gen, conf):
print_params(data_gen)
Expand All @@ -965,7 +922,7 @@ def test_hash_multiple_mode_query(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs),
ids=idfn)
def test_hash_multiple_mode_query_avg_distincts(data_gen, conf):
Expand All @@ -978,7 +935,7 @@ def test_hash_multiple_mode_query_avg_distincts(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf):
local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'})
Expand All @@ -1001,7 +958,7 @@ def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_query_max_with_multiple_distincts(data_gen, conf):
local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'})
Expand All @@ -1015,7 +972,7 @@ def test_hash_query_max_with_multiple_distincts(data_gen, conf):
conf=local_conf)

@ignore_order
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_count_with_filter(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
Expand All @@ -1027,7 +984,7 @@ def test_hash_count_with_filter(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_multiple_filters(data_gen, conf):
assert_gpu_and_cpu_are_equal_sql(
Expand Down Expand Up @@ -1784,7 +1741,7 @@ def do_it(spark):
@ignore_order(local=True)
@approximate_float
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans_with_decimals, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimals, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_std_variance(data_gen, conf):
local_conf = copy_and_update(conf, {
Expand Down Expand Up @@ -1850,7 +1807,7 @@ def test_std_variance_nulls(data_gen, conf, ansi_enabled):
'StddevPop', 'StddevSamp', 'VariancePop', 'VarianceSamp',
'SortArray', 'Alias', 'Literal', 'Count',
'AggregateExpression', 'ProjectExec')
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
@pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn)
@pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn)
Expand Down
2 changes: 1 addition & 1 deletion integration_tests/src/main/python/hashing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

_xxhash_fallback_gens = single_level_array_gens + nested_array_gens_sample + [
all_basic_struct_gen,
struct_array_gen_no_nans,
struct_array_gen,
_struct_of_xxhash_gens]
if is_before_spark_320():
_xxhash_fallback_gens += [float_gen, double_gen]
Expand Down
6 changes: 3 additions & 3 deletions integration_tests/src/main/python/window_function_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,15 @@
('a', IntegerGen()),
('b', LongGen(nullable=True))]

part_and_order_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]),
part_and_order_gens = [long_gen, DoubleGen(special_cases=[]),
string_gen, boolean_gen, timestamp_gen, DecimalGen(precision=18, scale=1),
DecimalGen(precision=38, scale=1)]

running_part_and_order_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]),
running_part_and_order_gens = [long_gen, DoubleGen(special_cases=[]),
string_gen, byte_gen, timestamp_gen, DecimalGen(precision=18, scale=1),
DecimalGen(precision=38, scale=1)]

lead_lag_data_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]),
lead_lag_data_gens = [long_gen, DoubleGen(special_cases=[]),
boolean_gen, timestamp_gen, string_gen, DecimalGen(precision=18, scale=3),
DecimalGen(precision=38, scale=4),
StructGen(children=[
Expand Down

0 comments on commit 3ce1e9e

Please sign in to comment.