diff --git a/integration_tests/src/main/python/array_test.py b/integration_tests/src/main/python/array_test.py index 6cbe4382338..99b68ccfba1 100644 --- a/integration_tests/src/main/python/array_test.py +++ b/integration_tests/src/main/python/array_test.py @@ -82,6 +82,7 @@ DecimalGen(precision=12, scale=2, nullable=False), DecimalGen(precision=20, scale=2, nullable=False)] +# This non-nans version is only used for Spark version < 3.1.3 no_neg_zero_all_basic_gens_no_nans = [byte_gen, short_gen, int_gen, long_gen, # -0.0 cannot work because of -0.0 == 0.0 in cudf for distinct FloatGen(special_cases=[], no_nans=True), diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index 9f549adfa46..2a9d7e5e6f0 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -1067,12 +1067,8 @@ def gen_scalars_for_sql(data_gen, count, seed=0, force_no_nulls=False): # all of the basic types in a single struct all_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens)]) -all_basic_struct_gen_no_nan = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens_no_nan)]) - struct_array_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(single_level_array_gens)]) -struct_array_gen_no_nans = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(single_level_array_gens_no_nan)]) - # Some struct gens, but not all because of nesting nonempty_struct_gens_sample = [all_basic_struct_gen, StructGen([['child0', byte_gen], ['child1', all_basic_struct_gen]]), diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index 288cf3ebc07..4f58278360c 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -167,25 +167,16 @@ _grpkey_floats_with_nulls_and_nans ] -# List of schemas with no NaNs -_init_list_no_nans = [ - _longs_with_nulls, - _longs_with_no_nulls, - _grpkey_longs_with_nulls, - _grpkey_dbls_with_nulls, - _grpkey_floats_with_nulls, - _grpkey_strings_with_nulls, - _grpkey_nulls, - _grpkey_strings_with_extra_nulls] - # List of schemas with NaNs included -_init_list_with_nans_and_no_nans = [ +_init_list = [ _longs_with_nulls, _longs_with_no_nulls, _grpkey_longs_with_nulls, _grpkey_dbls_with_nulls, _grpkey_floats_with_nulls, _grpkey_strings_with_nulls, + _grpkey_strings_with_extra_nulls, + _grpkey_nulls, _grpkey_floats_with_nulls_and_nans] # grouping decimals with nulls @@ -197,7 +188,7 @@ ('b', DecimalGen(nullable=False)), ('c', DecimalGen(nullable=False))] -_init_list_with_nans_and_no_nans_with_decimals = _init_list_with_nans_and_no_nans + [ +_init_list_with_decimals = _init_list + [ _decimals_with_nulls, _decimals_with_no_nulls] # Used to test ANSI-mode fallback @@ -303,15 +294,7 @@ def get_params(init_list, marked_params=[]): ('c', _decimal_gen_sum_38_neg10)] -_init_list_no_nans_with_decimal = _init_list_no_nans + [ - _grpkey_small_decimals] - -_init_list_no_nans_with_decimalbig = _init_list_no_nans + [ - _grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals, - _grpkey_short_big_decimals, _grpkey_short_very_big_decimals, - _grpkey_short_very_big_neg_scale_decimals] - -_init_list_with_nans_and_no_nans_with_decimalbig = _init_list_with_nans_and_no_nans + [ +_init_list_with_decimalbig = _init_list + [ _grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals, _grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_very_big_neg_scale_decimals] @@ -378,7 +361,7 @@ def test_computation_in_grpby_columns(): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_sum(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -420,7 +403,7 @@ def test_hash_reduction_sum_full_decimal(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans + [_grpkey_short_mid_decimals, +@pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_sum_full_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_avg(data_gen, conf): @@ -451,7 +434,7 @@ def test_hash_avg_nulls_partial_only(data_gen): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) def test_intersectAll(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, length=100))) @@ -459,7 +442,7 @@ def test_intersectAll(data_gen): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) def test_exceptAll(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b'))) @@ -477,7 +460,7 @@ def test_exceptAll(data_gen): ('b', _pivot_gen_128bit), ('c', decimal_gen_128bit)] -_pivot_gens_with_decimals = _init_list_with_nans_and_no_nans + [ +_pivot_gens_with_decimals = _init_list + [ _grpkey_small_decimals, _pivot_big_decimals, _grpkey_short_mid_decimals, _pivot_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_very_big_neg_scale_decimals] @@ -497,20 +480,7 @@ def test_hash_grpby_pivot(data_gen, conf): @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) -@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_grpby_pivot_without_nans(data_gen, conf): - assert_gpu_and_cpu_are_equal_collect( - lambda spark: gen_df(spark, data_gen, length=100) - .groupby('a') - .pivot('b') - .agg(f.sum('c')), - conf=conf) - -@approximate_float -@ignore_order(local=True) -@incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_grpby_pivot(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -523,22 +493,9 @@ def test_hash_multiple_grpby_pivot(data_gen, conf): @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) -@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_reduction_pivot_without_nans(data_gen, conf): - assert_gpu_and_cpu_are_equal_collect( - lambda spark: gen_df(spark, data_gen, length=100) - .groupby() - .pivot('b') - .agg(f.sum('c')), - conf=conf) - -@approximate_float -@ignore_order(local=True) -@incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_reduction_pivot_with_nans(data_gen, conf): +def test_hash_reduction_pivot(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .groupby() @@ -943,7 +900,7 @@ def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback() @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_mode_query(data_gen, conf): print_params(data_gen) @@ -965,7 +922,7 @@ def test_hash_multiple_mode_query(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_mode_query_avg_distincts(data_gen, conf): @@ -978,7 +935,7 @@ def test_hash_multiple_mode_query_avg_distincts(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf): local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) @@ -1001,7 +958,7 @@ def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_query_max_with_multiple_distincts(data_gen, conf): local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) @@ -1015,7 +972,7 @@ def test_hash_query_max_with_multiple_distincts(data_gen, conf): conf=local_conf) @ignore_order -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_count_with_filter(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -1027,7 +984,7 @@ def test_hash_count_with_filter(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_filters(data_gen, conf): assert_gpu_and_cpu_are_equal_sql( @@ -1784,7 +1741,7 @@ def do_it(spark): @ignore_order(local=True) @approximate_float @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans_with_decimals, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_decimals, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_std_variance(data_gen, conf): local_conf = copy_and_update(conf, { @@ -1850,7 +1807,7 @@ def test_std_variance_nulls(data_gen, conf, ansi_enabled): 'StddevPop', 'StddevSamp', 'VariancePop', 'VarianceSamp', 'SortArray', 'Alias', 'Literal', 'Count', 'AggregateExpression', 'ProjectExec') -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) @pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn) @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) diff --git a/integration_tests/src/main/python/hashing_test.py b/integration_tests/src/main/python/hashing_test.py index 107c3a4576e..6bd56da933d 100644 --- a/integration_tests/src/main/python/hashing_test.py +++ b/integration_tests/src/main/python/hashing_test.py @@ -39,7 +39,7 @@ _xxhash_fallback_gens = single_level_array_gens + nested_array_gens_sample + [ all_basic_struct_gen, - struct_array_gen_no_nans, + struct_array_gen, _struct_of_xxhash_gens] if is_before_spark_320(): _xxhash_fallback_gens += [float_gen, double_gen] diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index 5a5347c70bf..b4708b89668 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -134,15 +134,15 @@ ('a', IntegerGen()), ('b', LongGen(nullable=True))] -part_and_order_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]), +part_and_order_gens = [long_gen, DoubleGen(special_cases=[]), string_gen, boolean_gen, timestamp_gen, DecimalGen(precision=18, scale=1), DecimalGen(precision=38, scale=1)] -running_part_and_order_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]), +running_part_and_order_gens = [long_gen, DoubleGen(special_cases=[]), string_gen, byte_gen, timestamp_gen, DecimalGen(precision=18, scale=1), DecimalGen(precision=38, scale=1)] -lead_lag_data_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]), +lead_lag_data_gens = [long_gen, DoubleGen(special_cases=[]), boolean_gen, timestamp_gen, string_gen, DecimalGen(precision=18, scale=3), DecimalGen(precision=38, scale=4), StructGen(children=[