From 03daf6b8e6019fcba7c472506483a14f94127c95 Mon Sep 17 00:00:00 2001
From: Tom White
Date: Mon, 2 Sep 2024 14:20:15 +0100
Subject: [PATCH] Numpy 2 testing (#1237)

* Fix type

* Removing the case '0.0005' as it was previously passing by accident.

* Fix test_ld on numpy 2

* Fix test_hash_array

* Add GitHub Actions workflow to run using NumPy 2

* Restrict to numpy<2.1 for numba compatibility

* Don't run NumPy 2 on Python 3.9 due to scikit-allel incompatibility
---
 .github/workflows/build-numpy-2.yml         | 38 ++++++++++++++
 sgkit/tests/io/vcf/test_vcf_writer_utils.py |  1 -
 sgkit/tests/test_ld.py                      | 58 ++++-----------------
 sgkit/tests/test_popgen.py                  |  3 ++
 sgkit/utils.py                              |  2 +
 5 files changed, 53 insertions(+), 49 deletions(-)
 create mode 100644 .github/workflows/build-numpy-2.yml

diff --git a/.github/workflows/build-numpy-2.yml b/.github/workflows/build-numpy-2.yml
new file mode 100644
index 000000000..f7a2e49e8
--- /dev/null
+++ b/.github/workflows/build-numpy-2.yml
@@ -0,0 +1,38 @@
+name: Build NumPy 2
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  build:
+    # Scheduled runs only on the origin org
+    if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule')
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11"]
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt -r requirements-dev.txt
+          pip install -U 'numpy<2.1'
+      - name: Run pre-commit
+        uses: pre-commit/action@v2.0.0
+      - name: Test with pytest (numba jit disabled)
+        env:
+          NUMBA_DISABLE_JIT: 1
+        run: |
+          # avoid guvectorized functions #1194
+          pytest -v sgkit/tests/test_pedigree.py
+          pytest -v sgkit/tests/io/vcf/test_vcf_writer_utils.py
+      - name: Test with pytest and coverage
+        run: |
+          pytest -v --cov=sgkit --cov-report=term-missing
diff --git a/sgkit/tests/io/vcf/test_vcf_writer_utils.py b/sgkit/tests/io/vcf/test_vcf_writer_utils.py
index 0155cbcdf..f9459ebe3 100644
--- a/sgkit/tests/io/vcf/test_vcf_writer_utils.py
+++ b/sgkit/tests/io/vcf/test_vcf_writer_utils.py
@@ -66,7 +66,6 @@ def test_itoa_out_of_range():
     [
         (0.0, "0"),
         (0.0001, "0"),
-        (0.0005, "0.001"),
         (0.3, "0.3"),
         (0.32, "0.32"),
         (0.329, "0.329"),
diff --git a/sgkit/tests/test_ld.py b/sgkit/tests/test_ld.py
index 3fb08b01a..bdf0f3390 100644
--- a/sgkit/tests/test_ld.py
+++ b/sgkit/tests/test_ld.py
@@ -1,12 +1,9 @@
 from typing import Optional
 
-import allel
-import dask.array as da
 import numpy as np
 import numpy.testing as npt
 import pytest
 from dask.dataframe import DataFrame
-from hypothesis import Phase, example, given, settings
 from hypothesis import strategies as st
 from hypothesis.extra.numpy import arrays
 
@@ -27,40 +24,27 @@ def test_rogers_huff_r_between():
     gnb = np.array([[0, 1, 2]])
     npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
     npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
-    npt.assert_allclose(
-        allel.rogers_huff_r_between(gna, gnb),
-        rogers_huff_r_between(gna[0], gnb[0]),
-        rtol=1e-06,
-    )
 
     gna = np.array([[0, 1, 2]])
     gnb = np.array([[2, 1, 0]])
     npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), -1.0, rtol=1e-06)
     npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
-    npt.assert_allclose(
-        allel.rogers_huff_r_between(gna, gnb),
-        rogers_huff_r_between(gna[0], gnb[0]),
-        rtol=1e-06,
-    )
 
     gna = np.array([[0, 0, 0]])
     gnb = np.array([[1, 1, 1]])
     assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
     assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
-    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))
 
     gna = np.array([[1, 1, 1]])
     gnb = np.array([[1, 1, 1]])
     assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
     assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
-    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))
 
     # a case which fails if fastmath=True is enabled for rogers_huff_r_between
     gna = np.full((1, 49), 2)
     gnb = np.full((1, 49), 2)
     assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
     assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
-    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))
 
 
 def ldm_df(
@@ -115,7 +99,16 @@ def test_threshold():
 
 @pytest.mark.parametrize(
     "dtype",
-    [dtype for k, v in np.sctypes.items() for dtype in v if k in ["int", "uint"]],  # type: ignore
+    [
+        np.int8,
+        np.int16,
+        np.int32,
+        np.int64,
+        np.uint8,
+        np.uint16,
+        np.uint32,
+        np.uint64,
+    ],
 )
 def test_dtypes(dtype):
     # Input matrices should work regardless of integer type
@@ -148,37 +141,6 @@ def ld_prune_args(draw):
     return x, window, step, threshold, chunks
 
 
-# Phases setting without shrinking for complex, conditional draws in
-# which shrinking wastes time and adds little information
-# (see https://hypothesis.readthedocs.io/en/latest/settings.html#hypothesis.settings.phases)
-PHASES_NO_SHRINK = (Phase.explicit, Phase.reuse, Phase.generate, Phase.target)
-
-
-@given(args=ld_prune_args())  # pylint: disable=no-value-for-parameter
-@settings(max_examples=50, deadline=None, phases=PHASES_NO_SHRINK)
-@example(args=(np.array([[1, 1], [1, 1]], dtype="uint8"), 1, 1, 0.0, -1))
-@pytest.mark.skip(
-    reason="Hypothesis generates failures that need investigation: https://github.com/sgkit-dev/sgkit/issues/864"
-)
-def test_vs_skallel(args):
-    x, size, step, threshold, chunks = args
-
-    ds = simulate_genotype_call_dataset(n_variant=x.shape[0], n_sample=x.shape[1])
-    ds["call_dosage"] = (["variants", "samples"], da.asarray(x).rechunk({0: chunks}))
-    ds = window_by_variant(ds, size=size, step=step)
-
-    ldm = ld_matrix(ds, threshold=threshold)
-    has_duplicates = ldm.compute().duplicated(subset=["i", "j"]).any()
-    assert not has_duplicates
-    idx_drop_ds = maximal_independent_set(ldm)
-
-    idx_drop = np.sort(idx_drop_ds.ld_prune_index_to_drop.data)
-    m = allel.locate_unlinked(x, size=size, step=step, threshold=threshold)
-    idx_drop_ska = np.sort(np.argwhere(~m).squeeze(axis=1))
-
-    npt.assert_equal(idx_drop_ska, idx_drop)
-
-
 def test_scores():
     # Create zero row vectors except for 1st and 11th
     # (make them have non-zero variance)
diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py
index 50fc9bb4c..6bc06acd8 100644
--- a/sgkit/tests/test_popgen.py
+++ b/sgkit/tests/test_popgen.py
@@ -712,6 +712,9 @@ def test_hash_array(n_rows, n_cols):
     _, expected_inverse, expected_counts = np.unique(
         x, axis=0, return_inverse=True, return_counts=True
     )
+    # following is needed due to https://github.com/numpy/numpy/issues/26738
+    # (workaround from https://github.com/lmcinnes/umap/issues/1138)
+    expected_inverse = expected_inverse.reshape(-1)
 
     # hash columns, then find unique column counts using the hash values
     h = hash_array(x)
diff --git a/sgkit/utils.py b/sgkit/utils.py
index 7a770e5c0..ee9bbfd3f 100644
--- a/sgkit/utils.py
+++ b/sgkit/utils.py
@@ -362,6 +362,8 @@ def split_array_chunks(n: int, blocks: int) -> Tuple[int, ...]:
     if blocks <= 0:
         raise ValueError(f"Number of blocks ({blocks}) must be >= 0")
     n_div, n_mod = np.divmod(n, blocks)
+    n_div = int(n_div)
+    n_mod = int(n_mod)
     chunks = n_mod * (n_div + 1,) + (blocks - n_mod) * (n_div,)
     return chunks  # type: ignore[no-any-return]
 
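
For context only (not part of the patch): a minimal sketch of the two NumPy 2 behaviour
changes the test updates above work around, assuming NumPy >= 2.0 is installed. The
variable names below are illustrative.

    import numpy as np

    # np.sctypes was removed in NumPy 2.0, so test_dtypes now parametrizes over an
    # explicit list of integer dtypes rather than building one from np.sctypes.
    INT_DTYPES = [
        np.int8, np.int16, np.int32, np.int64,
        np.uint8, np.uint16, np.uint32, np.uint64,
    ]

    # With axis=0, the inverse indices from np.unique can come back with an extra
    # axis on some NumPy 2.0.x releases (numpy issue 26738); flattening them, as
    # test_hash_array now does, keeps the shape consistent across NumPy 1.x and 2.x.
    x = np.array([[0, 1], [0, 1], [2, 3]])
    _, inverse, counts = np.unique(x, axis=0, return_inverse=True, return_counts=True)
    inverse = inverse.reshape(-1)
    assert inverse.shape == (x.shape[0],)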