[Test Coverage] Improve test coverage to 95% (py-why#91)
* Add unit tests for BregmanCDTest
* Add unit tests for context builder and remove extraneous LOC
* Add unit tests for the G^2 test and fix errors when the conditioning set is "high-dim"

Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 committed Jan 13, 2023
1 parent 11e378e commit 7f50370
Showing 14 changed files with 219 additions and 75 deletions.
2 changes: 2 additions & 0 deletions dodiscover/cd/bregman.py
@@ -57,6 +57,8 @@ def __init__(
def test(
self, df: pd.DataFrame, x_vars: Set[Column], y_vars: Set[Column], group_col: Column
) -> Tuple[float, float]:
self._check_test_input(df, x_vars, y_vars, group_col)

x_cols = list(x_vars)
y_cols = list(y_vars)
group_ind = df[group_col].to_numpy()
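The new `_check_test_input` call validates the inputs before the discrepancy test runs. Below is a minimal sketch of the kind of checks involved, inferred from the error messages matched in the new unit tests further down; the helper shown here is illustrative and the real implementation may differ.

```python
import pandas as pd


def _check_test_input(df: pd.DataFrame, x_vars, y_vars, group_col) -> None:
    """Hypothetical validation sketch based on the errors asserted in the tests."""
    if group_col not in df.columns:
        raise ValueError(f"The group col {group_col} is not in the dataframe.")
    if not set(x_vars).issubset(df.columns):
        raise ValueError("The x variables are not all in the dataframe.")
    if not set(y_vars).issubset(df.columns):
        raise ValueError("The y variables are not all in the dataframe.")
    if set(df[group_col].unique()) != {0, 1}:
        raise RuntimeError(f"Group indications in {group_col} must be 0 or 1.")
```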
2 changes: 1 addition & 1 deletion dodiscover/ci/clf_test.py
@@ -280,7 +280,7 @@ def test(

# compute final pvalue
metric = np.mean(boot_metrics)
std_metric = np.std(boot_metrics)
std_metric = np.std(boot_metrics) + 1e-8
sigma = std_metric / np.sqrt(self.n_iter)
else:
metric, pvalue = self._compute_test_statistic(df, x_vars, y_vars, z_covariates)
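The `1e-8` added to the bootstrap standard deviation keeps the standard error strictly positive, so the downstream p-value computation cannot divide by zero when every bootstrap iteration returns an identical metric. A toy illustration of the failure mode being guarded (values are made up):

```python
import numpy as np

boot_metrics = [0.5, 0.5, 0.5]  # identical metrics across bootstrap iterations
n_iter = len(boot_metrics)

std_metric = np.std(boot_metrics)              # 0.0 -> zero standard error downstream
std_metric_safe = np.std(boot_metrics) + 1e-8  # strictly positive

sigma = std_metric_safe / np.sqrt(n_iter)      # finite, usable standard error
print(sigma)
```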
50 changes: 36 additions & 14 deletions dodiscover/ci/g_test.py
@@ -54,8 +54,8 @@ def _calculate_contingency_tble(
# define contingency table as a 2 by 2 table relating 'x' and 'y'
# across different separating set variables
contingency_tble = np.zeros((nlevels_x, nlevels_y, dof))
x_idx = data[x] # [:, x]
y_idx = data[y] # [:, y]
x_idx = data[x]
y_idx = data[y]
sep_set = list(sep_set)

# sum all co-occurrences of x and y conditioned on z
@@ -82,7 +82,7 @@ def _calculate_highdim_contingency(
x: Column,
y: Column,
sep_set: Set,
data: NDArray,
data: pd.DataFrame,
nlevel_x: int,
nlevels_y: int,
) -> NDArray:
@@ -101,8 +101,8 @@
'y' must be in the columns of ``data``.
sep_set : set
The set of neighboring nodes of x and y (as a set()).
data : np.ndarray of shape (n_samples, n_variables)
The input data matrix.
data : pandas.DataFrame of shape (n_samples, n_variables)
The input dataframe.
nlevel_x : int
Number of levels of the 'x' variable in the data matrix.
nlevels_y : int
@@ -118,28 +118,31 @@

# keep track of all variables in the separating set
sep_set = list(sep_set) # type: ignore
k = data[:, sep_set]
sep_col_inds = [ind for ind, col in enumerate(data.columns) if col in sep_set]
x_col_inds = [ind for ind, col in enumerate(data.columns) if col == x]
y_col_inds = [ind for ind, col in enumerate(data.columns) if col == y]
k = data.iloc[:, sep_col_inds]

# count number of value combinations for sepset variables
# observed in the data
dof_count = 1
parents_val = np.array([k[0, :]])
parents_val = np.array([k.iloc[0, :]])

# initialize the contingency table
contingency_tble = np.zeros((2, 2, 1))
xdx = data[0, x]
ydx = data[0, y]
xdx = data.iloc[0, x_col_inds]
ydx = data.iloc[0, y_col_inds]
contingency_tble[xdx, ydx, dof_count - 1] = 1

# check how many parents we can create from the rest of the dataset
for idx in range(1, n_samples):
is_new = True
xdx = data[idx, x]
ydx = data[idx, y]
xdx = data.iloc[idx, x_col_inds]
ydx = data.iloc[idx, y_col_inds]

# comparing the current values of the subset variables to all
# already existing combinations of subset variables values
tcomp = parents_val[:dof_count, :] == k[idx, :]
tcomp = parents_val[:dof_count, :] == k.iloc[idx, :].values
for it_parents in range(dof_count):
if np.all(tcomp[it_parents, :]):
contingency_tble[xdx, ydx, it_parents] += 1
@@ -150,7 +153,7 @@ def _calculate_highdim_contingency(
# contingency table
if is_new:
dof_count += 1
parents_val = np.r_[parents_val, [k[idx, :]]]
parents_val = np.r_[parents_val, [k.iloc[idx, :]]]

# create a new contingency table and update cell counts
# using the original table up to the last value
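Switching `data` from a NumPy array to a `pandas.DataFrame` means column labels must first be resolved to positional indices before `.iloc` access. A standalone illustration of the lookup pattern used above, on a made-up dataframe:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.integers(0, 2, size=(5, 4)), columns=["x", "y", "z1", "z2"])
sep_set = ["z1", "z2"]

# resolve label-based columns to positional indices for .iloc access
sep_col_inds = [ind for ind, col in enumerate(df.columns) if col in sep_set]
x_col_inds = [ind for ind, col in enumerate(df.columns) if col == "x"]

k = df.iloc[:, sep_col_inds]  # conditioning-set values for every sample
xdx = df.iloc[0, x_col_inds]  # value of 'x' in the first sample
print(sep_col_inds, x_col_inds, k.shape, xdx.to_numpy())
```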
@@ -195,8 +198,14 @@ def _calculate_g_statistic(contingency_tble):

# compute the final term in the log
tdijk = tx.dot(ty)

# to prevent dividing by 0
tdijk += 1e-8

tlog[:, :, k] = contingency_tble[:, :, k] * nk[k] / tdijk

# to prevent logging by 0
tlog += 1e-8
log_tlog = np.log(tlog)
G2 = np.nansum(2 * contingency_tble * log_tlog)
return G2
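Both `1e-8` terms are numerical guards: the first keeps the expected-count denominator away from zero and the second keeps the argument of `np.log` positive when a contingency cell is empty. A self-contained sketch of a guarded G^2 accumulation on a toy 2x2x1 table, using the standard form G^2 = 2 * sum(O * ln(O / E)); this is an illustration, not the package code:

```python
import numpy as np

# toy 2x2x1 contingency table with one empty cell
tble = np.array([[[30.0], [10.0]],
                 [[0.0], [20.0]]])

G2 = 0.0
for k in range(tble.shape[2]):
    observed = tble[:, :, k]
    nk = observed.sum()
    tx = observed.sum(axis=1, keepdims=True)  # row marginals
    ty = observed.sum(axis=0, keepdims=True)  # column marginals
    expected = tx.dot(ty) / nk + 1e-8         # guard against a zero denominator
    ratio = observed / expected + 1e-8        # guard against log(0) for empty cells
    G2 += np.nansum(2 * observed * np.log(ratio))
print(G2)
```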
@@ -374,7 +383,7 @@ def g_square_discrete(

class GSquareCITest(BaseConditionalIndependenceTest):
def __init__(self, data_type: str = "binary"):
"""G squared CI test for discrete or binary data.
r"""G squared CI test for discrete or binary data.
For details of the test see :footcite:`Neapolitan2003`.
@@ -384,6 +393,19 @@ def __init__(self, data_type: str = "binary"):
The type of data, which can be "binary", or "discrete".
By default "binary".
Notes
-----
The G^2 test statistic requires exponentially more samples as the conditioning
set size grows. The exact requirement in this implementation for binary data
is :math:`10 \cdot 2^{|S|}`, where :math:`|S|` is the cardinality of the conditioning
set :math:`S`. For example, if S comprises three variables, then you need
at least 80 samples.
For discrete data, the requirement is :math:`|X| \cdot |Y| \cdot \prod_i |S_i|`, where
:math:`|X|` and :math:`|Y|` are the number of distinct discrete categories in
the X and Y columns, and :math:`\prod_i |S_i|` is the product of the number
of distinct categories in each of the conditioning set columns.
References
----------
.. footbibliography::
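A small hypothetical helper illustrating the sample-size requirements described in the Notes above (the name `min_samples_required` is made up for this example and is not part of the package):

```python
import numpy as np


def min_samples_required(n_sep_vars: int, levels=None) -> int:
    """Illustrative only: minimum samples suggested by the Notes above.

    Binary data needs roughly 10 * 2**|S| samples; discrete data needs
    |X| * |Y| * prod(|S_i|), where ``levels`` lists the category counts
    as (n_x, n_y, n_s1, ..., n_sk).
    """
    if levels is None:  # binary case
        return 10 * 2**n_sep_vars
    n_x, n_y, *sep_levels = levels
    return int(n_x * n_y * np.prod(sep_levels))


print(min_samples_required(3))                          # 3 binary conditioners -> 80
print(min_samples_required(3, levels=(3, 3, 2, 2, 2)))  # discrete example -> 72
```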
3 changes: 3 additions & 0 deletions dodiscover/ci/simulate.py
@@ -93,12 +93,15 @@ def nonlinear_additive_gaussian(

# compute nonlinear model
if model_type == "ci":
# X <- Z -> Y
X = nonlinear_func(freq * (Z * Azx + std * X_noise + cause_var))
Y = nonlinear_func(freq * (Z * Azy + std * Y_noise + cause_var))
elif model_type == "ind":
# X, Y, Z
X = nonlinear_func(freq * (std * X_noise + cause_var))
Y = nonlinear_func(freq * (std * Y_noise + cause_var))
elif model_type == "dep":
# X -> Y <- Z
X = nonlinear_func(freq * (std * X_noise + cause_var))
Y = nonlinear_func(freq * (2 * Axy * X + Z * Azy + std * Y_noise + cause_var))

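The new comments document the causal structure behind each `model_type`: `"ci"` simulates X <- Z -> Y (so X and Y are independent given Z), `"ind"` simulates three mutually independent variables, and `"dep"` simulates a collider X -> Y <- Z. A short usage sketch mirroring how the tests below call the simulator, assuming an integer seed is accepted for `random_state` as in those tests:

```python
import numpy as np
import pandas as pd

from dodiscover.ci.simulate import nonlinear_additive_gaussian

X, Y, Z = nonlinear_additive_gaussian(model_type="ci", n_samples=1000, random_state=12345)
df = pd.DataFrame(np.hstack((X, Y, Z)), columns=["x", "y", "z"])
# under "ci", a CI test of (x, y | z) should fail to reject independence
```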
47 changes: 0 additions & 47 deletions tests/unit_tests/cd/test_cd.py

This file was deleted.

@@ -2,7 +2,7 @@
import pandas as pd
import pytest

from dodiscover.cd import KernelCDTest
from dodiscover.cd import BregmanCDTest, KernelCDTest

seed = 12345

@@ -15,10 +15,10 @@ def single_env_scm(n_samples=200, offset=0.0):

X = rng.standard_normal((n_samples, 1)) + offset
X1 = rng.standard_normal((n_samples, 1)) + offset
Y = X + X1 + 0.5 * rng.standard_normal((n_samples, 1))
Y = X + X1 + 0.1 * rng.standard_normal((n_samples, 1))
Z = Y + 0.1 * rng.standard_normal((n_samples, 1))

# create input for the CI test
# create input for the CD test
df = pd.DataFrame(np.hstack((X, X1, Y, Z)), columns=["x", "x1", "y", "z"])

# assign groups randomly
@@ -37,17 +37,54 @@ def multi_env_scm():
return df


@pytest.mark.parametrize(
"cd_estimator",
[
KernelCDTest(),
BregmanCDTest(),
],
)
def test_cd_tests_error(cd_estimator):
x = "x"
y = "y"

sample_df = single_env_scm()
with pytest.raises(ValueError, match="The group col"):
cd_estimator.test(sample_df, {x}, {y}, group_col="blah")

with pytest.raises(ValueError, match="The x variables are not all"):
cd_estimator.test(sample_df, {"blah"}, y_vars={y}, group_col="group")

with pytest.raises(ValueError, match="The y variables are not all"):
cd_estimator.test(sample_df, {x}, y_vars={"blah"}, group_col="group")

# all the group indicators now take values different from 0/1
sample_df["group"] = sample_df["group"] + 3
with pytest.raises(RuntimeError, match="Group indications in"):
cd_estimator.test(sample_df, {x}, {y}, group_col="group")


@pytest.mark.parametrize(
["cd_func", "cd_kwargs"],
[
[BregmanCDTest, dict()],
[KernelCDTest, dict()],
[KernelCDTest, {"l2": 1e-3}],
[KernelCDTest, {"l2": (1e-3, 2e-3)}],
],
)
@pytest.mark.parametrize(
["df", "env_type"],
[
[single_env_scm(), "single"],
[multi_env_scm(), "multi"],
],
)
def test_kernel_cd(df, env_type):
"""Test Fisher Z test for Gaussian data."""
def test_cd_simulation(cd_func, df, env_type, cd_kwargs):
"""Test conditional discrepancy tests."""
random_state = 12345
cd_estimator = KernelCDTest(random_state=random_state, null_reps=100, n_jobs=-1)
print(cd_func, cd_kwargs)
cd_estimator = cd_func(random_state=random_state, null_reps=50, n_jobs=-1, **cd_kwargs)

group_col = "group"

@@ -62,12 +99,12 @@ def test_kernel_cd(df, env_type):
print(pvalue)
assert pvalue > 0.05
elif env_type == "multi":
_, pvalue = cd_estimator.test(df, {"x"}, {"x1"}, group_col=group_col)
assert pvalue < 0.05
print(pvalue)
_, pvalue = cd_estimator.test(df, {"x"}, {"z"}, group_col=group_col)
assert pvalue < 0.05
print(pvalue)
_, pvalue = cd_estimator.test(df, {"x"}, {"y"}, group_col=group_col)
assert pvalue < 0.05
print(pvalue)
_, pvalue = cd_estimator.test(df, {"x1"}, {"z"}, group_col=group_col)
assert pvalue < 0.05
print(pvalue)
File renamed without changes.
File renamed without changes.
@@ -53,7 +53,7 @@ def test_clf_with_nonlinear_cos_additive():
X, Y, Z = nonlinear_additive_gaussian(model_type="ind", n_samples=n_samples, random_state=rng)
df = pd.DataFrame(np.hstack((X, Y, Z)), columns=["x", "y", "z"])

clf = RandomForestClassifier(random_state=rng)
clf = RandomForestClassifier(random_state=rng, n_jobs=-1)
ci_estimator = ClassifierCITest(clf, random_state=rng)
_, pvalue = ci_estimator.test(df, {"x"}, {"z"})
assert pvalue > 0.05
@@ -68,11 +68,44 @@ def test_clf_with_nonlinear_cos_additive():
X, Y, Z = nonlinear_additive_gaussian(model_type="dep", n_samples=n_samples, random_state=rng)
df = pd.DataFrame(np.hstack((X, Y, Z)), columns=["x", "y", "z"])

clf = RandomForestClassifier(random_state=rng)
clf = RandomForestClassifier(random_state=rng, n_jobs=-1)
ci_estimator = ClassifierCITest(clf, random_state=rng)
_, pvalue = ci_estimator.test(df, {"x"}, {"z"})
assert pvalue > 0.05
_, pvalue = ci_estimator.test(df, {"z"}, {"y"})
assert pvalue < 0.05
_, pvalue = ci_estimator.test(df, {"x"}, {"z"}, {"y"})
assert pvalue < 0.05


def test_clfcitest_with_bootstrap():
"""Test classifier CI test with bootstrap option."""
# need a decent number of samples to fit the classifiers
n_samples = 1000

# create input for the CI test
X, Y, Z = nonlinear_additive_gaussian(
model_type="ci", n_samples=n_samples, random_state=rng, std=0.001
)
df = pd.DataFrame(np.hstack((X, Y, Z)), columns=["x", "y", "z"])

clf = RandomForestClassifier(random_state=rng, n_jobs=-1)
ci_estimator = ClassifierCITest(clf, random_state=rng, bootstrap=True, n_iter=2)
_, pvalue = ci_estimator.test(df, {"x"}, {"z"})
assert pvalue < 0.05
_, pvalue = ci_estimator.test(df, {"x"}, {"y"})
assert pvalue < 0.05
_, pvalue = ci_estimator.test(df, {"x"}, {"y"}, {"z"})
assert pvalue > 0.05

# create input for the dep test X -> Y <- Z
X, Y, Z = nonlinear_additive_gaussian(
model_type="dep", n_samples=n_samples, random_state=rng, std=0.001
)
clf = RandomForestClassifier(random_state=rng, n_jobs=-1)
ci_estimator = ClassifierCITest(clf, random_state=rng, bootstrap=True, n_iter=2)
df = pd.DataFrame(np.hstack((X, Y, Z)), columns=["x", "y", "z"])
_, pvalue = ci_estimator.test(df, {"z"}, {"y"})
assert pvalue < 0.05
_, pvalue = ci_estimator.test(df, {"x"}, {"z"}, {"y"})
assert pvalue < 0.05
File renamed without changes.
File renamed without changes.
