Skip to content

Commit

Permalink
Add NaNs to Data Generators In Floating-Point Testing [databricks] (#9334)
Browse files Browse the repository at this point in the history

* part_and_order_gens with nans

* running_part_and_order_gens

* lead_lag_data_gens

* removed no_nans gens

* renamed generator from with_nans_with_decimal* to with_nan_and_decimal*

* added missing gens to init_list

* addressed review comments

* Signing off

Signed-off-by: Raza Jafri <rjafri@nvidia.com>

---------

Signed-off-by: Raza Jafri <rjafri@nvidia.com>
  • Loading branch information
razajafri authored Oct 2, 2023
1 parent 15511b6 commit 3ce1e9e
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 72 deletions.
1 change: 1 addition & 0 deletions integration_tests/src/main/python/array_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
DecimalGen(precision=12, scale=2, nullable=False),
DecimalGen(precision=20, scale=2, nullable=False)]

# This non-nans version is only used for Spark version < 3.1.3
no_neg_zero_all_basic_gens_no_nans = [byte_gen, short_gen, int_gen, long_gen,
# -0.0 cannot work because of -0.0 == 0.0 in cudf for distinct
FloatGen(special_cases=[], no_nans=True),
Expand Down
4 changes: 0 additions & 4 deletions integration_tests/src/main/python/data_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1067,12 +1067,8 @@ def gen_scalars_for_sql(data_gen, count, seed=0, force_no_nulls=False):
# all of the basic types in a single struct
all_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens)])

all_basic_struct_gen_no_nan = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens_no_nan)])

struct_array_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(single_level_array_gens)])

struct_array_gen_no_nans = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(single_level_array_gens_no_nan)])

# Some struct gens, but not all because of nesting
nonempty_struct_gens_sample = [all_basic_struct_gen,
StructGen([['child0', byte_gen], ['child1', all_basic_struct_gen]]),
Expand Down
85 changes: 21 additions & 64 deletions integration_tests/src/main/python/hash_aggregate_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,25 +167,16 @@
_grpkey_floats_with_nulls_and_nans
]

# List of schemas with no NaNs
_init_list_no_nans = [
_longs_with_nulls,
_longs_with_no_nulls,
_grpkey_longs_with_nulls,
_grpkey_dbls_with_nulls,
_grpkey_floats_with_nulls,
_grpkey_strings_with_nulls,
_grpkey_nulls,
_grpkey_strings_with_extra_nulls]

# List of schemas with NaNs included
_init_list_with_nans_and_no_nans = [
_init_list = [
_longs_with_nulls,
_longs_with_no_nulls,
_grpkey_longs_with_nulls,
_grpkey_dbls_with_nulls,
_grpkey_floats_with_nulls,
_grpkey_strings_with_nulls,
_grpkey_strings_with_extra_nulls,
_grpkey_nulls,
_grpkey_floats_with_nulls_and_nans]

# grouping decimals with nulls
Expand All @@ -197,7 +188,7 @@
('b', DecimalGen(nullable=False)),
('c', DecimalGen(nullable=False))]

_init_list_with_nans_and_no_nans_with_decimals = _init_list_with_nans_and_no_nans + [
_init_list_with_decimals = _init_list + [
_decimals_with_nulls, _decimals_with_no_nulls]

# Used to test ANSI-mode fallback
Expand Down Expand Up @@ -303,15 +294,7 @@ def get_params(init_list, marked_params=[]):
('c', _decimal_gen_sum_38_neg10)]


_init_list_no_nans_with_decimal = _init_list_no_nans + [
_grpkey_small_decimals]

_init_list_no_nans_with_decimalbig = _init_list_no_nans + [
_grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals,
_grpkey_short_big_decimals, _grpkey_short_very_big_decimals,
_grpkey_short_very_big_neg_scale_decimals]

_init_list_with_nans_and_no_nans_with_decimalbig = _init_list_with_nans_and_no_nans + [
_init_list_with_decimalbig = _init_list + [
_grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals,
_grpkey_short_big_decimals, _grpkey_short_very_big_decimals,
_grpkey_short_very_big_neg_scale_decimals]
Expand Down Expand Up @@ -378,7 +361,7 @@ def test_computation_in_grpby_columns():
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_grpby_sum(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
Expand Down Expand Up @@ -420,7 +403,7 @@ def test_hash_reduction_sum_full_decimal(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans + [_grpkey_short_mid_decimals,
@pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals,
_grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_sum_full_decimals], ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_grpby_avg(data_gen, conf):
Expand Down Expand Up @@ -451,15 +434,15 @@ def test_hash_avg_nulls_partial_only(data_gen):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn)
def test_intersectAll(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, length=100)))

@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn)
def test_exceptAll(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b')))
Expand All @@ -477,7 +460,7 @@ def test_exceptAll(data_gen):
('b', _pivot_gen_128bit),
('c', decimal_gen_128bit)]

_pivot_gens_with_decimals = _init_list_with_nans_and_no_nans + [
_pivot_gens_with_decimals = _init_list + [
_grpkey_small_decimals, _pivot_big_decimals, _grpkey_short_mid_decimals,
_pivot_short_big_decimals, _grpkey_short_very_big_decimals,
_grpkey_short_very_big_neg_scale_decimals]
Expand All @@ -497,20 +480,7 @@ def test_hash_grpby_pivot(data_gen, conf):
@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_grpby_pivot_without_nans(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: gen_df(spark, data_gen, length=100)
.groupby('a')
.pivot('b')
.agg(f.sum('c')),
conf=conf)

@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_multiple_grpby_pivot(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
Expand All @@ -523,22 +493,9 @@ def test_hash_multiple_grpby_pivot(data_gen, conf):
@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_reduction_pivot_without_nans(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: gen_df(spark, data_gen, length=100)
.groupby()
.pivot('b')
.agg(f.sum('c')),
conf=conf)

@approximate_float
@ignore_order(local=True)
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_reduction_pivot_with_nans(data_gen, conf):
def test_hash_reduction_pivot(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: gen_df(spark, data_gen, length=100)
.groupby()
Expand Down Expand Up @@ -943,7 +900,7 @@ def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback()
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_multiple_mode_query(data_gen, conf):
print_params(data_gen)
Expand All @@ -965,7 +922,7 @@ def test_hash_multiple_mode_query(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs),
ids=idfn)
def test_hash_multiple_mode_query_avg_distincts(data_gen, conf):
Expand All @@ -978,7 +935,7 @@ def test_hash_multiple_mode_query_avg_distincts(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf):
local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'})
Expand All @@ -1001,7 +958,7 @@ def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_query_max_with_multiple_distincts(data_gen, conf):
local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'})
Expand All @@ -1015,7 +972,7 @@ def test_hash_query_max_with_multiple_distincts(data_gen, conf):
conf=local_conf)

@ignore_order
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_count_with_filter(data_gen, conf):
assert_gpu_and_cpu_are_equal_collect(
Expand All @@ -1027,7 +984,7 @@ def test_hash_count_with_filter(data_gen, conf):
@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_hash_multiple_filters(data_gen, conf):
assert_gpu_and_cpu_are_equal_sql(
Expand Down Expand Up @@ -1784,7 +1741,7 @@ def do_it(spark):
@ignore_order(local=True)
@approximate_float
@incompat
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans_with_decimals, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list_with_decimals, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
def test_std_variance(data_gen, conf):
local_conf = copy_and_update(conf, {
Expand Down Expand Up @@ -1850,7 +1807,7 @@ def test_std_variance_nulls(data_gen, conf, ansi_enabled):
'StddevPop', 'StddevSamp', 'VariancePop', 'VarianceSamp',
'SortArray', 'Alias', 'Literal', 'Count',
'AggregateExpression', 'ProjectExec')
@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn)
@pytest.mark.parametrize('data_gen', _init_list, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
@pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn)
@pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn)
Expand Down
2 changes: 1 addition & 1 deletion integration_tests/src/main/python/hashing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

_xxhash_fallback_gens = single_level_array_gens + nested_array_gens_sample + [
all_basic_struct_gen,
struct_array_gen_no_nans,
struct_array_gen,
_struct_of_xxhash_gens]
if is_before_spark_320():
_xxhash_fallback_gens += [float_gen, double_gen]
Expand Down
6 changes: 3 additions & 3 deletions integration_tests/src/main/python/window_function_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,15 @@
('a', IntegerGen()),
('b', LongGen(nullable=True))]

part_and_order_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]),
part_and_order_gens = [long_gen, DoubleGen(special_cases=[]),
string_gen, boolean_gen, timestamp_gen, DecimalGen(precision=18, scale=1),
DecimalGen(precision=38, scale=1)]

running_part_and_order_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]),
running_part_and_order_gens = [long_gen, DoubleGen(special_cases=[]),
string_gen, byte_gen, timestamp_gen, DecimalGen(precision=18, scale=1),
DecimalGen(precision=38, scale=1)]

lead_lag_data_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]),
lead_lag_data_gens = [long_gen, DoubleGen(special_cases=[]),
boolean_gen, timestamp_gen, string_gen, DecimalGen(precision=18, scale=3),
DecimalGen(precision=38, scale=4),
StructGen(children=[
Expand Down

0 comments on commit 3ce1e9e

Please sign in to comment.