diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py
index a79f1a76d03..9da2a7ccbeb 100644
--- a/integration_tests/src/main/python/cast_test.py
+++ b/integration_tests/src/main/python/cast_test.py
@@ -65,7 +65,7 @@ def test_cast_string_date_valid_format():
     # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format.
     # This provides values that are valid in all of those formats.
     assert_gpu_and_cpu_are_equal_collect(
-        lambda spark : unary_op_df(spark, StringGen(date_start_1_2_1)).select(f.col('a').cast(DateType())),
+        lambda spark : unary_op_df(spark, StringGen(date_start_1_1_1)).select(f.col('a').cast(DateType())),
         conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'})

 invalid_values_string_to_date = ['200', ' 1970A', '1970 A', '1970T',  # not conform to "yyyy" after trim
@@ -146,9 +146,9 @@ def test_cast_string_date_non_ansi():
         lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())),
         conf={'spark.rapids.sql.hasExtendedYearValues': 'false'})

-@pytest.mark.parametrize('data_gen', [StringGen(date_start_1_2_1),
-                                      StringGen(date_start_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'),
-                                      StringGen(date_start_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?')
+@pytest.mark.parametrize('data_gen', [StringGen(date_start_1_1_1),
+                                      StringGen(date_start_1_1_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'),
+                                      StringGen(date_start_1_1_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?')
                                       ], ids=idfn)
 @allow_non_gpu(*non_utc_allow)
diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py
index c83b08cb03a..dd2819a7832 100644
--- a/integration_tests/src/main/python/data_gen.py
+++ b/integration_tests/src/main/python/data_gen.py
@@ -1219,3 +1219,9 @@ def get_25_partitions_df(spark):

 # regexp to generate year from 0002, format is yyyy
 yyyy_start_0002 = '([0-9]{3}[2-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])'
+
+# regexp to generate year from 0001, format is yyyy
+yyyy_start_0001 = '([0-9]{3}[1-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])'
+
+# regexp to generate date from 0001-01-01, format is yyyy-MM-dd
+date_start_1_1_1 = yyyy_start_0001 + '-[0-9]{1,2}-[0-9]{1,2}'
diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py
index 1787ec81cee..db5c6f1d070 100644
--- a/integration_tests/src/main/python/date_time_test.py
+++ b/integration_tests/src/main/python/date_time_test.py
@@ -573,8 +573,7 @@ def test_unsupported_fallback_to_date():
 # (-62135510400, 253402214400) is the range of seconds that can be represented by timestamp_seconds
 # considering the influence of time zone.
 ts_float_gen = SetValuesGen(FloatType(), [0.0, -0.0, 1.0, -1.0, 1.234567, -1.234567, 16777215.0, float('inf'), float('-inf'), float('nan')])
-# FIXME: min_val is changed to -62135410400 bypassing "ValueError: year 0 is out of range" from pySpark. It can be fixed after https://github.com/NVIDIA/spark-rapids/issues/9747
-seconds_gens = [LongGen(min_val=-62135410400, max_val=253402214400), IntegerGen(), ShortGen(), ByteGen(),
+seconds_gens = [LongGen(min_val=-62135510400, max_val=253402214400), IntegerGen(), ShortGen(), ByteGen(),
                 DoubleGen(min_exp=0, max_exp=32), ts_float_gen, DecimalGen(16, 6), DecimalGen(13, 3), DecimalGen(10, 0), DecimalGen(7, -3), DecimalGen(6, 6)]
 @pytest.mark.parametrize('data_gen', seconds_gens, ids=idfn)
 @allow_non_gpu(*non_utc_allow)
@@ -609,7 +608,6 @@ def test_timestamp_seconds_decimal_overflow(data_gen):
         conf={},
         error_message='Overflow')

-# FIXME: min_val is changed to -62135410400000 bypassing "ValueError: year 0 is out of range" from pySpark. It can be fixed after https://github.com/NVIDIA/spark-rapids/issues/9747
 millis_gens = [LongGen(min_val=-62135410400000, max_val=253402214400000), IntegerGen(), ShortGen(), ByteGen()]
 @pytest.mark.parametrize('data_gen', millis_gens, ids=idfn)
 @allow_non_gpu(*non_utc_allow)
@@ -624,8 +622,7 @@ def test_timestamp_millis_long_overflow():
         conf={},
         error_message='long overflow')

-# FIXME: min_val is changed to -62135410400 bypassing "ValueError: year 0 is out of range" from pySpark. It can be fixed after https://github.com/NVIDIA/spark-rapids/issues/9747
-micros_gens = [LongGen(min_val=-62135410400000000, max_val=253402214400000000), IntegerGen(), ShortGen(), ByteGen()]
+micros_gens = [LongGen(min_val=-62135510400000000, max_val=253402214400000000), IntegerGen(), ShortGen(), ByteGen()]
 @pytest.mark.parametrize('data_gen', micros_gens, ids=idfn)
 @allow_non_gpu(*non_utc_allow)
 def test_timestamp_micros(data_gen):
diff --git a/integration_tests/src/main/python/delta_lake_write_test.py b/integration_tests/src/main/python/delta_lake_write_test.py
index 0c02b8be0a2..2ca87f7df38 100644
--- a/integration_tests/src/main/python/delta_lake_write_test.py
+++ b/integration_tests/src/main/python/delta_lake_write_test.py
@@ -424,7 +424,7 @@ def setup_tables(spark):
 @pytest.mark.parametrize("ts_write", ["INT96", "TIMESTAMP_MICROS", "TIMESTAMP_MILLIS"], ids=idfn)
 @pytest.mark.skipif(is_before_spark_320(), reason="Delta Lake writes are not supported before Spark 3.2.x")
 def test_delta_write_legacy_timestamp(spark_tmp_path, ts_write):
-    gen = TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc),
+    gen = TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc),
                        end=datetime(2000, 1, 1, tzinfo=timezone.utc)).with_special_case(
         datetime(1000, 1, 1, tzinfo=timezone.utc), weight=10.0)
     data_path = spark_tmp_path + "/DELTA_DATA"
diff --git a/integration_tests/src/main/python/fastparquet_compatibility_test.py b/integration_tests/src/main/python/fastparquet_compatibility_test.py
index 66f069edeff..53a99d32bd2 100644
--- a/integration_tests/src/main/python/fastparquet_compatibility_test.py
+++ b/integration_tests/src/main/python/fastparquet_compatibility_test.py
@@ -209,7 +209,7 @@ def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path):
                                  reason="fastparquet interprets timestamps in UTC timezone, regardless "
                                         "of timezone settings")),  # Vanilla case.
         pytest.param(TimestampGen(nullable=False,
-                                  start=datetime(1, 2, 1, tzinfo=timezone.utc),
+                                  start=datetime(1, 1, 1, tzinfo=timezone.utc),
                                   end=pandas_min_datetime),
                      marks=pytest.mark.xfail(reason="fastparquet reads timestamps preceding 1900 incorrectly.")),
     ], ids=idfn)
diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py
index def66df6cab..69de6a326c3 100644
--- a/integration_tests/src/main/python/json_test.py
+++ b/integration_tests/src/main/python/json_test.py
@@ -637,7 +637,7 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format
     # "yyyy-MM"
     "\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?[1-8]{1}[0-9]{3}-[0-3]{1,2}[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"",
     # "yyyy"
-    "\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?" + yyyy_start_0002 + "[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"",
+    "\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?" + yyyy_start_0001 + "[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"",
     # "dd/MM/yyyy"
     "\"[0-9]{2}/[0-9]{2}/[1-8]{1}[0-9]{3}\"",
     # special constant values
diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py
index 94e396c97e3..99a2d4241e8 100644
--- a/integration_tests/src/main/python/parquet_write_test.py
+++ b/integration_tests/src/main/python/parquet_write_test.py
@@ -72,7 +72,7 @@

 parquet_datetime_gen_simple = [DateGen(start=date(1, 1, 1), end=date(2000, 1, 1))
                                .with_special_case(date(1000, 1, 1), weight=10.0),
-                               TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc),
+                               TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc),
                                             end=datetime(2000, 1, 1, tzinfo=timezone.utc))
                                .with_special_case(datetime(1000, 1, 1, tzinfo=timezone.utc), weight=10.0)]
 parquet_datetime_in_struct_gen = [
@@ -289,8 +289,8 @@ def writeParquetUpgradeCatchException(spark, df, data_path, spark_tmp_table_fact

 @pytest.mark.parametrize('ts_write_data_gen', [('INT96', TimestampGen()),
-                          ('TIMESTAMP_MICROS', TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc))),
-                          ('TIMESTAMP_MILLIS', TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc)))])
+                          ('TIMESTAMP_MICROS', TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc))),
+                          ('TIMESTAMP_MILLIS', TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc)))])
 @pytest.mark.parametrize('rebase', ["CORRECTED","EXCEPTION"])
 @allow_non_gpu(*non_utc_allow)
 def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write_data_gen, spark_tmp_table_factory, rebase):
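
Reviewer note (not part of the patch): a quick sanity check of the new data_gen.py patterns. This is an illustrative snippet that assumes only the standard-library re module; the pattern strings are copied verbatim from the data_gen.py hunk above.

    import re

    # Copied from integration_tests/src/main/python/data_gen.py (see diff above).
    yyyy_start_0002 = '([0-9]{3}[2-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])'
    yyyy_start_0001 = '([0-9]{3}[1-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])'
    date_start_1_1_1 = yyyy_start_0001 + '-[0-9]{1,2}-[0-9]{1,2}'

    assert re.fullmatch(yyyy_start_0001, '0001')          # new minimum year is accepted
    assert not re.fullmatch(yyyy_start_0001, '0000')      # year 0 is still rejected
    assert not re.fullmatch(yyyy_start_0002, '0001')      # the old pattern started at 0002
    assert re.fullmatch(date_start_1_1_1, '0001-01-01')   # new minimum date is accepted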
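
Reviewer note (not part of the patch): the TimestampGen start values move from datetime(1, 2, 1) to datetime(1, 1, 1), which is datetime.min, so the generators now cover the first day of the proleptic Gregorian calendar. A minimal check, assuming only the standard library:

    from datetime import datetime, timezone

    # 0001-01-01 00:00:00 is the smallest value Python's datetime can represent.
    assert datetime(1, 1, 1, tzinfo=timezone.utc) == datetime.min.replace(tzinfo=timezone.utc)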
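
Reviewer note (not part of the patch): the restored LongGen bounds in date_time_test.py line up with the existing "(-62135510400, 253402214400)" comment: -62135510400 is 0001-01-02 00:00:00 UTC and 253402214400 is 9999-12-31 00:00:00 UTC, roughly one day inside each end of the datetime range, presumably so that time-zone shifts cannot push a value into year 0 or year 10000. A sketch of the arithmetic, assuming only the standard library:

    from datetime import datetime, timezone

    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    # Lower bound used by seconds_gens: 0001-01-02 00:00:00 UTC.
    assert int((datetime(1, 1, 2, tzinfo=timezone.utc) - epoch).total_seconds()) == -62135510400
    # Upper bound used by seconds_gens: 9999-12-31 00:00:00 UTC.
    assert int((datetime(9999, 12, 31, tzinfo=timezone.utc) - epoch).total_seconds()) == 253402214400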