From 03daf6b8e6019fcba7c472506483a14f94127c95 Mon Sep 17 00:00:00 2001
From: Tom White
Date: Mon, 2 Sep 2024 14:20:15 +0100
Subject: [PATCH] Numpy 2 testing (#1237)

* Fix type

* Removing the case '0.0005' as it was previously passing by accident.

* Fix test_ld on numpy 2

* Fix test_hash_array

* Add GitHub Actions workflow to run using NumPy 2

* Restrict to numpy<2.1 for numba compatibility

* Don't run NumPy 2 on Python 3.9 due to scikit-allel incompatibility
---
 .github/workflows/build-numpy-2.yml         | 38 ++++++++++++++
 sgkit/tests/io/vcf/test_vcf_writer_utils.py |  1 -
 sgkit/tests/test_ld.py                      | 58 ++++-----------------
 sgkit/tests/test_popgen.py                  |  3 ++
 sgkit/utils.py                              |  2 +
 5 files changed, 53 insertions(+), 49 deletions(-)
 create mode 100644 .github/workflows/build-numpy-2.yml

diff --git a/.github/workflows/build-numpy-2.yml b/.github/workflows/build-numpy-2.yml
new file mode 100644
index 000000000..f7a2e49e8
--- /dev/null
+++ b/.github/workflows/build-numpy-2.yml
@@ -0,0 +1,38 @@
+name: Build NumPy 2
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  build:
+    # Scheduled runs only on the origin org
+    if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule')
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11"]
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt -r requirements-dev.txt
+          pip install -U 'numpy<2.1'
+      - name: Run pre-commit
+        uses: pre-commit/action@v2.0.0
+      - name: Test with pytest (numba jit disabled)
+        env:
+          NUMBA_DISABLE_JIT: 1
+        run: |
+          # avoid guvectorized functions #1194
+          pytest -v sgkit/tests/test_pedigree.py
+          pytest -v sgkit/tests/io/vcf/test_vcf_writer_utils.py
+      - name: Test with pytest and coverage
+        run: |
+          pytest -v --cov=sgkit --cov-report=term-missing
diff --git a/sgkit/tests/io/vcf/test_vcf_writer_utils.py b/sgkit/tests/io/vcf/test_vcf_writer_utils.py
index 0155cbcdf..f9459ebe3 100644
--- a/sgkit/tests/io/vcf/test_vcf_writer_utils.py
+++ b/sgkit/tests/io/vcf/test_vcf_writer_utils.py
@@ -66,7 +66,6 @@ def test_itoa_out_of_range():
     [
         (0.0, "0"),
         (0.0001, "0"),
-        (0.0005, "0.001"),
         (0.3, "0.3"),
         (0.32, "0.32"),
         (0.329, "0.329"),
diff --git a/sgkit/tests/test_ld.py b/sgkit/tests/test_ld.py
index 3fb08b01a..bdf0f3390 100644
--- a/sgkit/tests/test_ld.py
+++ b/sgkit/tests/test_ld.py
@@ -1,12 +1,9 @@
 from typing import Optional
 
-import allel
-import dask.array as da
 import numpy as np
 import numpy.testing as npt
 import pytest
 from dask.dataframe import DataFrame
-from hypothesis import Phase, example, given, settings
 from hypothesis import strategies as st
 from hypothesis.extra.numpy import arrays
 
@@ -27,40 +24,27 @@ def test_rogers_huff_r_between():
     gnb = np.array([[0, 1, 2]])
     npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
     npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
-    npt.assert_allclose(
-        allel.rogers_huff_r_between(gna, gnb),
-        rogers_huff_r_between(gna[0], gnb[0]),
-        rtol=1e-06,
-    )
 
     gna = np.array([[0, 1, 2]])
     gnb = np.array([[2, 1, 0]])
     npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), -1.0, rtol=1e-06)
     npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
-    npt.assert_allclose(
-        allel.rogers_huff_r_between(gna, gnb),
-        rogers_huff_r_between(gna[0], gnb[0]),
-        rtol=1e-06,
-    )
 
     gna = np.array([[0, 0, 0]])
     gnb = np.array([[1, 1, 1]])
     assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
     assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
-    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))
 
     gna = np.array([[1, 1, 1]])
     gnb = np.array([[1, 1, 1]])
     assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
     assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
-    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))
 
     # a case which fails if fastmath=True is enabled for rogers_huff_r_between
     gna = np.full((1, 49), 2)
     gnb = np.full((1, 49), 2)
     assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
     assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
-    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))
 
 
 def ldm_df(
@@ -115,7 +99,16 @@ def test_threshold():
 
 @pytest.mark.parametrize(
     "dtype",
-    [dtype for k, v in np.sctypes.items() for dtype in v if k in ["int", "uint"]],  # type: ignore
+    [
+        np.int8,
+        np.int16,
+        np.int32,
+        np.int64,
+        np.uint8,
+        np.uint16,
+        np.uint32,
+        np.uint64,
+    ],
 )
 def test_dtypes(dtype):
     # Input matrices should work regardless of integer type
@@ -148,37 +141,6 @@ def ld_prune_args(draw):
     return x, window, step, threshold, chunks
 
 
-# Phases setting without shrinking for complex, conditional draws in
-# which shrinking wastes time and adds little information
-# (see https://hypothesis.readthedocs.io/en/latest/settings.html#hypothesis.settings.phases)
-PHASES_NO_SHRINK = (Phase.explicit, Phase.reuse, Phase.generate, Phase.target)
-
-
-@given(args=ld_prune_args())  # pylint: disable=no-value-for-parameter
-@settings(max_examples=50, deadline=None, phases=PHASES_NO_SHRINK)
-@example(args=(np.array([[1, 1], [1, 1]], dtype="uint8"), 1, 1, 0.0, -1))
-@pytest.mark.skip(
-    reason="Hypothesis generates failures that need investigation: https://github.com/sgkit-dev/sgkit/issues/864"
-)
-def test_vs_skallel(args):
-    x, size, step, threshold, chunks = args
-
-    ds = simulate_genotype_call_dataset(n_variant=x.shape[0], n_sample=x.shape[1])
-    ds["call_dosage"] = (["variants", "samples"], da.asarray(x).rechunk({0: chunks}))
-    ds = window_by_variant(ds, size=size, step=step)
-
-    ldm = ld_matrix(ds, threshold=threshold)
-    has_duplicates = ldm.compute().duplicated(subset=["i", "j"]).any()
-    assert not has_duplicates
-    idx_drop_ds = maximal_independent_set(ldm)
-
-    idx_drop = np.sort(idx_drop_ds.ld_prune_index_to_drop.data)
-    m = allel.locate_unlinked(x, size=size, step=step, threshold=threshold)
-    idx_drop_ska = np.sort(np.argwhere(~m).squeeze(axis=1))
-
-    npt.assert_equal(idx_drop_ska, idx_drop)
-
-
 def test_scores():
     # Create zero row vectors except for 1st and 11th
     # (make them have non-zero variance)
diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py
index 50fc9bb4c..6bc06acd8 100644
--- a/sgkit/tests/test_popgen.py
+++ b/sgkit/tests/test_popgen.py
@@ -712,6 +712,9 @@ def test_hash_array(n_rows, n_cols):
     _, expected_inverse, expected_counts = np.unique(
         x, axis=0, return_inverse=True, return_counts=True
     )
+    # following is needed due to https://github.com/numpy/numpy/issues/26738
+    # (workaround from https://github.com/lmcinnes/umap/issues/1138)
+    expected_inverse = expected_inverse.reshape(-1)
 
     # hash columns, then find unique column counts using the hash values
     h = hash_array(x)
diff --git a/sgkit/utils.py b/sgkit/utils.py
index 7a770e5c0..ee9bbfd3f 100644
--- a/sgkit/utils.py
+++ b/sgkit/utils.py
@@ -362,6 +362,8 @@ def split_array_chunks(n: int, blocks: int) -> Tuple[int, ...]:
     if blocks <= 0:
         raise ValueError(f"Number of blocks ({blocks}) must be >= 0")
     n_div, n_mod = np.divmod(n, blocks)
+    n_div = int(n_div)
+    n_mod = int(n_mod)
     chunks = n_mod * (n_div + 1,) + (blocks - n_mod) * (n_div,)
     return chunks  # type: ignore[no-any-return]
 
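
For context only (not part of the patch): a minimal sketch of the two NumPy 2 behaviour
changes the test updates above work around, assuming NumPy >= 2.0 is installed. The
variable names below are illustrative.

    import numpy as np

    # np.sctypes was removed in NumPy 2.0, so test_dtypes now parametrizes over an
    # explicit list of integer dtypes rather than building one from np.sctypes.
    INT_DTYPES = [
        np.int8, np.int16, np.int32, np.int64,
        np.uint8, np.uint16, np.uint32, np.uint64,
    ]

    # With axis=0, the inverse indices from np.unique can come back with an extra
    # axis on some NumPy 2.0.x releases (numpy issue 26738); flattening them, as
    # test_hash_array now does, keeps the shape consistent across NumPy 1.x and 2.x.
    x = np.array([[0, 1], [0, 1], [2, 3]])
    _, inverse, counts = np.unique(x, axis=0, return_inverse=True, return_counts=True)
    inverse = inverse.reshape(-1)
    assert inverse.shape == (x.shape[0],)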