From 244ceab14390466091e9d43d4be1971608b3fc88 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Thu, 16 Nov 2023 19:33:12 -0800
Subject: [PATCH] Fix `TimestampGen` to generate values not too close to the
 minimum allowed timestamp [databricks] (#9736)

* Add check for nested types
* Add check for nested types
* Recursively check for rebasing
* Extract common code
* Allow nested type in rebase check
* Enable nested timestamp in roundtrip test
* Fix another test

Signed-off-by: Nghia Truong

* Enable `LEGACY` rebase in read
* Remove comment
* Change function/class signatures
* Complete modification
* Misc

Signed-off-by: Nghia Truong

* Add explicit type

Signed-off-by: Nghia Truong

* Rename file and add some stuff in DateTimeRebaseHelpers.scala
* Move file and rename class
* Adopt new enum type

Signed-off-by: Nghia Truong

* Add name for the enum classes
* Change exception messages
* Does not yet support legacy rebase in read

Signed-off-by: Nghia Truong

* Change legacy to corrected mode

Signed-off-by: Nghia Truong

* Extract common code

Signed-off-by: Nghia Truong

* Rename functions

Signed-off-by: Nghia Truong

* Reformat

Signed-off-by: Nghia Truong

* Make classes serializable

Signed-off-by: Nghia Truong

* Revert "Support rebase checking for nested dates and timestamps (#9617)"

This reverts commit 401d0d89d27f69831fbd132540424573e51ad296.

Signed-off-by: Nghia Truong

# Conflicts:
#	sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala

* Implement date time rebase
* Optimize rebase op
* Change comment

Signed-off-by: Nghia Truong

* Move tests
* Add test for datetime rebase

Signed-off-by: Nghia Truong

* Various changes

Signed-off-by: Nghia Truong

* Various changes

Signed-off-by: Nghia Truong

# Conflicts:
#	sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala

* Fix compile errors

Signed-off-by: Nghia Truong

* Fix comments

Signed-off-by: Nghia Truong

* Fix indentations

Signed-off-by: Nghia Truong

* Change comments and indentations

Signed-off-by: Nghia Truong

* Allow nested check for rebase
* Write different timestamp types in test
* Fix conversion if timestamp is not micros
* Rename var
* Don't have to down cast after up cast

Signed-off-by: Nghia Truong

* Change comment

Signed-off-by: Nghia Truong

* Still cast timestamp to the old type after rebasing

Signed-off-by: Nghia Truong

* Rename test

Signed-off-by: Nghia Truong

* Should not transform non-datetime types

Signed-off-by: Nghia Truong

* Fix test
* Update tests

Signed-off-by: Nghia Truong

* Enable int96 rebase in write
* Change tests

Signed-off-by: Nghia Truong

* Complete tests

Signed-off-by: Nghia Truong

* Revert unrelated changes

Signed-off-by: Nghia Truong

* Change configs

Signed-off-by: Nghia Truong

* Merge tests

Signed-off-by: Nghia Truong

* Simplify test data

Signed-off-by: Nghia Truong

* Add a new write test

Signed-off-by: Nghia Truong

* Add a mixed rebase test

Signed-off-by: Nghia Truong

* Change tests

Signed-off-by: Nghia Truong

* Fix `seed` in tests

Signed-off-by: Nghia Truong

* Rename tests

Signed-off-by: Nghia Truong

* Remove seed override
* Change TimestampGen

Signed-off-by: Nghia Truong

* Remove default seed

Signed-off-by: Nghia Truong

* Add default seed

Signed-off-by: Nghia Truong

* Remove default seed

Signed-off-by: Nghia Truong

---------

Signed-off-by: Nghia Truong
---
 integration_tests/src/main/python/csv_test.py            | 1 -
 integration_tests/src/main/python/data_gen.py            | 4 ++--
 integration_tests/src/main/python/parquet_test.py        | 1 -
 integration_tests/src/main/python/parquet_write_test.py | 2 --
 4 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py
index 5ea3d7c0478..19ad8d29151 100644
--- a/integration_tests/src/main/python/csv_test.py
+++ b/integration_tests/src/main/python/csv_test.py
@@ -402,7 +402,6 @@ def test_read_valid_and_invalid_dates(std_input_path, filename, v1_enabled_list,
         "'T'HH:mm[:ss]",
         "'T'HH:mm"]
 
-@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9701')
 @pytest.mark.parametrize('ts_part', csv_supported_ts_parts)
 @pytest.mark.parametrize('date_format', csv_supported_date_formats)
 @pytest.mark.parametrize('v1_enabled_list', ["", "csv"])
diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py
index 3498066c086..64696aace4e 100644
--- a/integration_tests/src/main/python/data_gen.py
+++ b/integration_tests/src/main/python/data_gen.py
@@ -578,9 +578,9 @@ def __init__(self, start=None, end=None, nullable=True, tzinfo=timezone.utc):
             # Spark supports times starting at
             # "0001-01-01 00:00:00.000000"
             # but it has issues if you get really close to that because it tries to do things
-            # in a different format which causes roundoff, so we have to add a few days,
+            # in a different format which causes roundoff, so we have to add a few days, even a month,
             # just to be sure
-            start = datetime(1, 1, 3, tzinfo=tzinfo)
+            start = datetime(1, 2, 1, tzinfo=tzinfo)
         elif not isinstance(start, datetime):
             raise RuntimeError('Unsupported type passed in for start {}'.format(start))
 
diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py
index dc959fe64cb..8efacc18d3e 100644
--- a/integration_tests/src/main/python/parquet_test.py
+++ b/integration_tests/src/main/python/parquet_test.py
@@ -311,7 +311,6 @@ def test_parquet_pred_push_round_trip(spark_tmp_path, parquet_gen, read_func, v1
             lambda spark: rf(spark).select(f.col('a') >= s0),
             conf=all_confs)
 
-@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9701')
 @pytest.mark.parametrize('parquet_gens', [parquet_nested_datetime_gen], ids=idfn)
 @pytest.mark.parametrize('ts_type', parquet_ts_write_options)
 @pytest.mark.parametrize('ts_rebase_write', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')])
diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py
index 3e9a8d90f39..bd330b569bb 100644
--- a/integration_tests/src/main/python/parquet_write_test.py
+++ b/integration_tests/src/main/python/parquet_write_test.py
@@ -458,7 +458,6 @@ def generate_map_with_empty_validity(spark, path):
             lambda spark, path: spark.read.parquet(path),
             data_path)
 
-@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9701')
 @pytest.mark.parametrize('data_gen', parquet_nested_datetime_gen, ids=idfn)
 @pytest.mark.parametrize('ts_write', parquet_ts_write_options)
 @pytest.mark.parametrize('ts_rebase_write', ['EXCEPTION'])
@@ -475,7 +474,6 @@ def writeParquetCatchException(spark, data_gen, data_path):
             lambda spark: writeParquetCatchException(spark, data_gen, data_path),
             conf=all_confs)
 
-@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9701')
 @pytest.mark.parametrize('data_gen', parquet_nested_datetime_gen, ids=idfn)
 @pytest.mark.parametrize('ts_write', parquet_ts_write_options)
 @pytest.mark.parametrize('ts_rebase_write', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')])
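
Editor's note: the substantive change above is the new lower bound in `TimestampGen.__init__`. The old floor, `datetime(1, 1, 3)`, sat only two days above Spark's minimum supported timestamp of "0001-01-01 00:00:00.000000", close enough for the round-off described in the comment; the patch moves the floor a full month clear. A minimal sketch of the headroom each floor leaves (plain Python, no Spark needed; `SPARK_MIN_TS` is this note's own name for the documented minimum):

    from datetime import datetime, timezone

    # Spark's minimum supported timestamp, per the comment in data_gen.py.
    SPARK_MIN_TS = datetime(1, 1, 1, tzinfo=timezone.utc)

    old_start = datetime(1, 1, 3, tzinfo=timezone.utc)  # previous TimestampGen floor
    new_start = datetime(1, 2, 1, tzinfo=timezone.utc)  # floor after this patch

    print(old_start - SPARK_MIN_TS)  # 2 days, 0:00:00
    print(new_start - SPARK_MIN_TS)  # 31 days, 0:00:00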
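Relatedly, the removed `@datagen_overrides(seed=0, ...)` markers existed only to pin these tests to a seed that avoided issue #9701; once the generator can no longer land near the boundary, the pins are unnecessary. To see why the floor is reachable at all, here is a hedged sketch of range-based generation. This is not the actual `TimestampGen` implementation; `random_timestamp`, the pinned `Random(0)`, and the upper bound are illustrative assumptions:

    import random
    from datetime import datetime, timedelta, timezone

    def random_timestamp(rng, start, end):
        # Draw a uniform microsecond offset across [start, end]; `start`
        # itself is a possible outcome, which is why it must sit safely
        # above Spark's minimum timestamp.
        span_us = (end - start) // timedelta(microseconds=1)
        return start + timedelta(microseconds=rng.randint(0, span_us))

    rng = random.Random(0)  # a pinned seed, like the removed seed=0 overrides
    floor = datetime(1, 2, 1, tzinfo=timezone.utc)     # new default floor
    upper = datetime(2023, 1, 1, tzinfo=timezone.utc)  # arbitrary bound for the sketch
    print(random_timestamp(rng, floor, upper))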