Skip to content

Commit

Permalink
test(python): Use pytest marker for slow tests (#6642)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored Feb 3, 2023
1 parent 5ba53a4 commit 66dae4d
Show file tree
Hide file tree
Showing 15 changed files with 128 additions and 156 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test-python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ jobs:
maturin develop
- name: Run tests and report coverage
run: pytest --cov --numprocesses=auto
run: pytest --cov -n auto -m "slow or not slow"

- name: Run doctests
run: python tests/docs/run_doc_examples.py
Expand Down Expand Up @@ -129,7 +129,7 @@ jobs:
pip install target/wheels/polars-*.whl
- name: Run tests
run: pytest --numprocesses=auto
run: pytest -n auto -m "slow or not slow"

- name: Check import without optional dependencies
run: |
Expand Down
2 changes: 1 addition & 1 deletion py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions py-polars/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,20 @@ pre-commit: fmt clippy ## Run all code quality checks

.PHONY: test
test: venv build ## Run fast unittests
$(VENV_BIN)/pytest tests/unit/ --numprocesses=auto
$(VENV_BIN)/pytest -n auto

.PHONY: doctest
doctest: venv build ## Run doctests
$(VENV_BIN)/python tests/docs/run_doc_examples.py

.PHONY: test-all
test-all: venv build ## Run all tests
$(VENV_BIN)/pytest --numprocesses=auto
$(VENV_BIN)/pytest -n auto -m "slow or not slow"
$(VENV_BIN)/python tests/docs/run_doc_examples.py

.PHONY: coverage
coverage: venv build ## Run tests and report coverage
$(VENV_BIN)/pytest --cov --numprocesses=auto
$(VENV_BIN)/pytest --cov -n auto -m "slow or not slow"

.PHONY: clean
clean: ## Clean up caches and build artifacts
Expand Down
4 changes: 4 additions & 0 deletions py-polars/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ warn_return_any = false
[tool.pytest.ini_options]
addopts = [
"--import-mode=importlib",
"-m not slow and not hypothesis", # Default to running fast tests only. To run ALL tests, run: pytest -m ""
]
markers = [
"slow: Tests with a longer than average runtime.",
]

[tool.coverage.run]
Expand Down
Empty file removed py-polars/tests/slow/__init__.py
Empty file.
14 changes: 0 additions & 14 deletions py-polars/tests/slow/conftest.py

This file was deleted.

7 changes: 0 additions & 7 deletions py-polars/tests/slow/read_csv.py

This file was deleted.

14 changes: 0 additions & 14 deletions py-polars/tests/slow/test_overflow.py

This file was deleted.

50 changes: 0 additions & 50 deletions py-polars/tests/slow/test_parquet.py

This file was deleted.

32 changes: 0 additions & 32 deletions py-polars/tests/slow/test_streaming.py

This file was deleted.

7 changes: 7 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1126,3 +1126,10 @@ def test_read_csv_chunked() -> None:

# The next value should always be higher if monotonically increasing.
assert df.filter(pl.col("count") < pl.col("count").shift(1)).is_empty()


@pytest.mark.slow()
def test_read_web_file() -> None:
    """Read a CSV file straight from a remote URL (requires network access)."""
    # NOTE(review): the host read "raw.githubusercontent.com", a scrape-mangled mirror
    # domain; restored the canonical GitHub raw-content host that serves the
    # polars example datasets.
    url = "https://raw.githubusercontent.com/pola-rs/polars/master/examples/datasets/foods1.csv"
    df = pl.read_csv(url)
    # foods1.csv ships with 27 rows and 4 columns.
    assert df.shape == (27, 4)
39 changes: 39 additions & 0 deletions py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pytest

Expand Down Expand Up @@ -402,3 +404,40 @@ def test_fetch_union() -> None:

expected = pl.DataFrame({"a": [0, 3], "b": [1, 4]})
assert_frame_equal(result_glob, expected)


@pytest.mark.slow()
@typing.no_type_check
@pytest.mark.xfail(sys.platform == "win32", reason="Does not work on Windows")
def test_struct_pyarrow_dataset_5796() -> None:
    """Round-trip a struct column through a pyarrow dataset (issue #5796).

    Uses 2**17 + 1 rows so the frame crosses a chunk-size boundary.
    """
    num_rows = 2**17 + 1

    # Dict literals instead of dict() calls (flake8-comprehensions C408),
    # which also makes the `# noqa: C408` suppression unnecessary.
    df = pl.from_records([{"id": i, "nested": {"a": i}} for i in range(num_rows)])
    with tempfile.TemporaryDirectory() as temp_dir:
        file_path = Path(temp_dir) / "out.parquet"
        df.write_parquet(file_path, use_pyarrow=True)
        tbl = ds.dataset(file_path).to_table()
        result = pl.from_arrow(tbl)

    # Reading via a pyarrow dataset must reproduce the original frame exactly.
    assert_frame_equal(result, df)


@pytest.mark.slow()
@pytest.mark.parametrize("case", [1048576, 1048577])
def test_parquet_chunks_545(case: int) -> None:
    """Read parquet data at and just past the 2**20-row chunk boundary."""
    # Build a pandas frame with `case` identical rows: one float column and
    # one datetime column.
    pdf = pd.DataFrame(
        np.tile([1.0, pd.to_datetime("2010-10-10")], [case, 1]),
        columns=["floats", "dates"],
    )

    # Serialize to an in-memory parquet buffer and rewind for reading.
    buffer = io.BytesIO()
    pdf.to_parquet(buffer)
    buffer.seek(0)

    # Polars must read back exactly what pandas wrote.
    result = pl.read_parquet(buffer)
    assert_frame_equal(pl.DataFrame(pdf), result)
14 changes: 14 additions & 0 deletions py-polars/tests/unit/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,3 +491,17 @@ def test_groupby_dynamic_iter() -> None:
((2, datetime(2020, 1, 1, 11)), (1, 3)),
]
assert result2 == expected2


@pytest.mark.slow()
@pytest.mark.parametrize("dtype", [pl.Int32, pl.UInt32])
def test_overflow_mean_partitioned_groupby_5194(dtype: pl.PolarsDataType) -> None:
    """Mean over a partitioned groupby must not overflow 32-bit sums (issue #5194).

    Summing 10_000_000 across 50_000 rows per group exceeds the i32/u32 range,
    so a correct implementation must widen before accumulating.
    """
    # Was written as `10_00_00_00` — same value, but a non-standard digit
    # grouping; normalized to conventional thousands grouping.
    df = pl.DataFrame(
        [
            pl.Series("data", [10_000_000] * 100_000, dtype=dtype),
            pl.Series("group", [1, 2] * 50_000, dtype=dtype),
        ]
    )
    assert df.groupby("group").agg(pl.col("data").mean()).sort(by="group").to_dict(
        False
    ) == {"group": [1, 2], "data": [10000000.0, 10000000.0]}
61 changes: 29 additions & 32 deletions py-polars/tests/unit/test_joins.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,38 +74,35 @@ def test_join_same_cat_src() -> None:
}


def test_sorted_merge_joins() -> None:
    """Sorted merge joins must match hash joins for ascending and descending keys."""
    for reverse in [False, True]:
        n = 30
        # Sorted random keys with duplicates (values drawn from [0, n // 2)).
        df_a = pl.DataFrame(
            {"a": np.sort(np.random.randint(0, n // 2, n))}
        ).with_row_count("row_a")

        df_b = pl.DataFrame(
            {"a": np.sort(np.random.randint(0, n // 2, n // 2))}
        ).with_row_count("row_b")

        if reverse:
            # Flip every column so the key column is sorted descending instead.
            df_a = df_a.select(pl.all().reverse())
            df_b = df_b.select(pl.all().reverse())

        join_strategies: list[JoinStrategy] = ["left", "inner"]
        # Exercise each strategy across int, str and float key dtypes.
        for cast_to in [int, str, float]:
            for how in join_strategies:
                df_a_ = df_a.with_columns(pl.col("a").cast(cast_to))
                df_b_ = df_b.with_columns(pl.col("a").cast(cast_to))

                # Baseline: default hash join.
                out_hash_join = df_a_.join(df_b_, on="a", how=how)

                # Marking both sides as sorted selects the merge-join path.
                out_sorted_merge_join = df_a_.with_columns(
                    pl.col("a").set_sorted(reverse)
                ).join(
                    df_b_.with_columns(pl.col("a").set_sorted(reverse)), on="a", how=how
                )

                assert_frame_equal(out_hash_join, out_sorted_merge_join)
@pytest.mark.parametrize("reverse", [False, True])
def test_sorted_merge_joins(reverse: bool) -> None:
    """Sorted merge joins must produce the same result as hash joins."""
    n = 30
    # Sorted random keys with duplicates (values drawn from [0, n // 2)).
    left = pl.DataFrame({"a": np.sort(np.random.randint(0, n // 2, n))}).with_row_count(
        "row_a"
    )
    right = pl.DataFrame(
        {"a": np.sort(np.random.randint(0, n // 2, n // 2))}
    ).with_row_count("row_b")

    if reverse:
        # Flip every column so the key column is sorted descending instead.
        left = left.select(pl.all().reverse())
        right = right.select(pl.all().reverse())

    join_strategies: list[JoinStrategy] = ["left", "inner"]
    # Exercise each strategy across int, str and float key dtypes.
    for cast_to in [int, str, float]:
        for how in join_strategies:
            lhs = left.with_columns(pl.col("a").cast(cast_to))
            rhs = right.with_columns(pl.col("a").cast(cast_to))

            # Baseline: default hash join.
            hash_result = lhs.join(rhs, on="a", how=how)

            # Marking both sides as sorted selects the merge-join path.
            merge_result = lhs.with_columns(pl.col("a").set_sorted(reverse)).join(
                rhs.with_columns(pl.col("a").set_sorted(reverse)), on="a", how=how
            )

            assert_frame_equal(hash_result, merge_result)


def test_join_negative_integers() -> None:
Expand Down
30 changes: 29 additions & 1 deletion py-polars/tests/unit/test_streaming.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import time
from datetime import date
from typing import Any

import numpy as np
import pytest

import polars as pl
from polars.testing import assert_frame_equal
from polars.testing import assert_frame_equal, assert_series_equal


def test_streaming_groupby_types() -> None:
Expand Down Expand Up @@ -215,3 +216,30 @@ def test_streaming_streamable_functions(monkeypatch: Any, capfd: Any) -> None:

(_, err) = capfd.readouterr()
assert "df -> function -> ordered_sink" in err


@pytest.mark.slow()
def test_cross_join_stack() -> None:
    """A streamed cross join followed by head() must short-circuit quickly
    instead of first materializing all matches of a chunk on the stack."""
    a = pl.Series(np.arange(100_000)).to_frame().lazy()
    # perf_counter is a monotonic high-resolution clock; time.time() can jump
    # (e.g. NTP adjustments), which would make the elapsed-time check unreliable.
    t0 = time.perf_counter()
    # this should be instant if directly pushed into sink
    # if not the cross join will first fill the stack with all matches of a single chunk
    assert a.join(a, how="cross").head().collect(streaming=True).shape == (5, 2)
    t1 = time.perf_counter()
    assert (t1 - t0) < 0.5


@pytest.mark.slow()
def test_ooc_sort(monkeypatch: Any) -> None:
    """Forced out-of-core sort must match an ordinary in-memory sort."""
    # Force the out-of-core sort code path regardless of data size.
    monkeypatch.setenv("POLARS_FORCE_OOC_SORT", "1")

    s = pl.arange(0, 100_000, eager=True).rename("idx")
    df = s.shuffle().to_frame()

    # Check both sort directions.
    for reverse in (True, False):
        result = (
            df.lazy().sort("idx", reverse=reverse).collect(streaming=True).to_series()
        )
        assert_series_equal(result, s.sort(reverse=reverse))

0 comments on commit 66dae4d

Please sign in to comment.