Add large_data_test mark in pytest; Add a case to test ORC writing with lots of nulls (#8825)

Signed-off-by: Chong Gao <res_life@163.com>
res-life authored Jul 28, 2023
1 parent a215eab commit 55b75f4
Showing 6 changed files with 43 additions and 1 deletion.
4 changes: 4 additions & 0 deletions integration_tests/README.md
@@ -398,6 +398,10 @@ properly without it. These tests assume Delta Lake is not configured and are dis
If Spark has been configured to support Delta Lake then these tests can be enabled by adding the
`--delta_lake` option to the command.

### Enabling large data tests
Some tests exercise large amounts of data and take a long time to run, so they are disabled by default.
These tests can be enabled by adding the `--large_data_test` option to the command.
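For example, assuming the standard `run_pyspark_from_build.sh` launcher described elsewhere in this README (substitute whatever command you normally use to run the integration tests): `./run_pyspark_from_build.sh --large_data_test`.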

## Writing tests

There are a number of libraries provided to help someone write new tests.
Expand Down
3 changes: 3 additions & 0 deletions integration_tests/conftest.py
@@ -56,3 +56,6 @@ def pytest_addoption(parser):
"--force_parquet_testing_tests", action="store_true", default=False,
help="if true forces parquet-testing tests to fail if input data cannot be found"
)
parser.addoption(
"--large_data_test", action='store_true', default=False, help="if enable tests with large data"
)
1 change: 1 addition & 0 deletions integration_tests/pytest.ini
@@ -33,5 +33,6 @@ markers =
    iceberg: Mark a test that requires Iceberg has been configured, skipping if tests are not configured for Iceberg
    delta_lake: Mark a test that requires Delta Lake has been configured, skipping if tests are not configured for Delta Lake
    regexp: Mark a test that tests regular expressions on the GPU (only works when UTF-8 is enabled)
    large_data_test: Mark a test that uses large data, skipping unless large data tests are enabled with --large_data_test
filterwarnings =
    ignore:.*pytest.mark.order.*:_pytest.warning_types.PytestUnknownMarkWarning
4 changes: 4 additions & 0 deletions integration_tests/src/main/python/conftest.py
@@ -218,6 +218,10 @@ def pytest_runtest_setup(item):
        if not item.config.getoption('delta_lake'):
            pytest.skip('delta lake tests not configured to run')

    if item.get_closest_marker('large_data_test'):
        if not item.config.getoption('large_data_test'):
            pytest.skip('tests for large data not configured to run')

def pytest_configure(config):
    global _runtime_env
    _runtime_env = config.getoption('runtime_env')
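Taken together, the new `--large_data_test` option, the marker registration in `pytest.ini`, and the `pytest_runtest_setup` check above follow the standard pytest pattern for gating tests behind a command-line flag. A minimal self-contained sketch of that pattern (a hypothetical `slow_test` marker, not from this commit; it registers the marker with `addinivalue_line` instead of `pytest.ini` so the sketch fits in one file):

```python
# conftest.py -- minimal sketch of gating a marker behind a CLI option
import pytest

def pytest_addoption(parser):
    # Register the opt-in flag; it is off by default.
    parser.addoption(
        "--slow_test", action="store_true", default=False,
        help="if set, enables tests marked with slow_test")

def pytest_configure(config):
    # Register the marker so pytest does not warn about an unknown mark.
    config.addinivalue_line("markers", "slow_test: mark a test as slow")

def pytest_runtest_setup(item):
    # Skip any marked test unless the flag was passed on the command line.
    if item.get_closest_marker("slow_test") and not item.config.getoption("slow_test"):
        pytest.skip("slow tests not configured to run")
```

A test opts in with `@pytest.mark.slow_test` and runs only under `pytest --slow_test`.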
1 change: 1 addition & 0 deletions integration_tests/src/main/python/marks.py
@@ -30,3 +30,4 @@
fuzz_test = pytest.mark.fuzz_test
iceberg = pytest.mark.iceberg
delta_lake = pytest.mark.delta_lake
large_data_test = pytest.mark.large_data_test
31 changes: 30 additions & 1 deletion integration_tests/src/main/python/orc_test.py
@@ -15,7 +15,7 @@
import pytest

from asserts import assert_cpu_and_gpu_are_equal_sql_with_capture, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_collect, \
    assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect
    assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_and_cpu_are_equal_sql
from data_gen import *
from marks import *
from pyspark.sql.types import *
@@ -902,3 +902,32 @@ def gen_null_df(spark):
    gpu_file_path = data_path + "/GPU"
    reader = read_orc_df(gpu_file_path)
    assert_gpu_and_cpu_are_equal_collect(lambda spark: reader(spark), conf=all_confs)

@ignore_order
@large_data_test
@pytest.mark.parametrize("reader_confs", reader_opt_confs, ids=idfn)
def test_orc_with_null_column_with_1m_rows(spark_tmp_path, reader_confs):
    data_path = spark_tmp_path + "/ORC_DATA"
    all_confs = reader_confs
    # 1,000,000 rows in which every column except c1 is entirely null
    data = [(i, None, None, None, None) for i in range(1000000)]
    def gen_null_df(spark):
        return spark.createDataFrame(
            data,
            "c1 int, c2 long, c3 float, c4 double, c5 boolean")
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: gen_null_df(spark).write.orc(path),
        lambda spark, path: spark.read.orc(path),
        data_path,
        conf=all_confs)
    # The write assertion above produces <data_path>/CPU and <data_path>/GPU;
    # read the CPU-written files back and compare CPU vs GPU results for each query.
    cpu_file_path = data_path + "/CPU"
    sqls = ["SELECT * FROM my_large_table",
            "SELECT * FROM my_large_table WHERE c2 = 5",
            "SELECT COUNT(*) FROM my_large_table WHERE c3 IS NOT NULL",
            "SELECT * FROM my_large_table WHERE c4 IS NULL",
            "SELECT * FROM my_large_table WHERE c5 IS NULL",
            ]
    for sql in sqls:
        assert_gpu_and_cpu_are_equal_sql(
            lambda spark: spark.read.orc(cpu_file_path),
            "my_large_table",
            sql)
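Because of the `@large_data_test` mark, this test is skipped unless the new flag is passed, e.g. `./run_pyspark_from_build.sh -k test_orc_with_null_column_with_1m_rows --large_data_test` (a hypothetical invocation, assuming the launcher forwards extra arguments to pytest).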
