[Test Coverage] Improve test coverage to 95% (py-why#91)
* Add unit tests for BregmanCDTest
* Add unit tests for context builder and remove extraneous LOC
* Add unit tests for the G^2 test and fix errors when the conditioning set is "high-dim"

Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 committed Jan 13, 2023
1 parent 11e378e commit 7f50370
Showing 14 changed files with 219 additions and 75 deletions.
2 changes: 2 additions & 0 deletions dodiscover/cd/bregman.py
@@ -57,6 +57,8 @@ def __init__(
def test(
self, df: pd.DataFrame, x_vars: Set[Column], y_vars: Set[Column], group_col: Column
) -> Tuple[float, float]:
self._check_test_input(df, x_vars, y_vars, group_col)

x_cols = list(x_vars)
y_cols = list(y_vars)
group_ind = df[group_col].to_numpy()
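The new `_check_test_input` call validates the inputs before the discrepancy test runs. Below is a minimal sketch of the kind of checks involved, inferred from the error messages matched in the new unit tests further down; the helper shown here is illustrative and the real implementation may differ.

```python
import pandas as pd


def _check_test_input(df: pd.DataFrame, x_vars, y_vars, group_col) -> None:
    """Hypothetical validation sketch based on the errors asserted in the tests."""
    if group_col not in df.columns:
        raise ValueError(f"The group col {group_col} is not in the dataframe.")
    if not set(x_vars).issubset(df.columns):
        raise ValueError("The x variables are not all in the dataframe.")
    if not set(y_vars).issubset(df.columns):
        raise ValueError("The y variables are not all in the dataframe.")
    if set(df[group_col].unique()) != {0, 1}:
        raise RuntimeError(f"Group indications in {group_col} must be 0 or 1.")
```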
2 changes: 1 addition & 1 deletion dodiscover/ci/clf_test.py
@@ -280,7 +280,7 @@ def test(

# compute final pvalue
metric = np.mean(boot_metrics)
std_metric = np.std(boot_metrics)
std_metric = np.std(boot_metrics) + 1e-8
sigma = std_metric / np.sqrt(self.n_iter)
else:
metric, pvalue = self._compute_test_statistic(df, x_vars, y_vars, z_covariates)
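The `1e-8` added to the bootstrap standard deviation keeps the standard error strictly positive, so the downstream p-value computation cannot divide by zero when every bootstrap iteration returns an identical metric. A toy illustration of the failure mode being guarded (values are made up):

```python
import numpy as np

boot_metrics = [0.5, 0.5, 0.5]  # identical metrics across bootstrap iterations
n_iter = len(boot_metrics)

std_metric = np.std(boot_metrics)              # 0.0 -> zero standard error downstream
std_metric_safe = np.std(boot_metrics) + 1e-8  # strictly positive

sigma = std_metric_safe / np.sqrt(n_iter)      # finite, usable standard error
print(sigma)
```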
50 changes: 36 additions & 14 deletions dodiscover/ci/g_test.py
@@ -54,8 +54,8 @@ def _calculate_contingency_tble(
# define contingency table as a 2 by 2 table relating 'x' and 'y'
# across different separating set variables
contingency_tble = np.zeros((nlevels_x, nlevels_y, dof))
x_idx = data[x] # [:, x]
y_idx = data[y] # [:, y]
x_idx = data[x]
y_idx = data[y]
sep_set = list(sep_set)

# sum all co-occurrences of x and y conditioned on z
@@ -82,7 +82,7 @@ def _calculate_highdim_contingency(
x: Column,
y: Column,
sep_set: Set,
data: NDArray,
data: pd.DataFrame,
nlevel_x: int,
nlevels_y: int,
) -> NDArray:
@@ -101,8 +101,8 @@
'y' must be in the columns of ``data``.
sep_set : set
The set of neighboring nodes of x and y (as a set()).
data : np.ndarray of shape (n_samples, n_variables)
The input data matrix.
data : pandas.DataFrame of shape (n_samples, n_variables)
The input dataframe.
nlevel_x : int
Number of levels of the 'x' variable in the data matrix.
nlevels_y : int
@@ -118,28 +118,31 @@

# keep track of all variables in the separating set
sep_set = list(sep_set) # type: ignore
k = data[:, sep_set]
sep_col_inds = [ind for ind, col in enumerate(data.columns) if col in sep_set]
x_col_inds = [ind for ind, col in enumerate(data.columns) if col == x]
y_col_inds = [ind for ind, col in enumerate(data.columns) if col == y]
k = data.iloc[:, sep_col_inds]

# count number of value combinations for sepset variables
# observed in the data
dof_count = 1
parents_val = np.array([k[0, :]])
parents_val = np.array([k.iloc[0, :]])

# initialize the contingency table
contingency_tble = np.zeros((2, 2, 1))
xdx = data[0, x]
ydx = data[0, y]
xdx = data.iloc[0, x_col_inds]
ydx = data.iloc[0, y_col_inds]
contingency_tble[xdx, ydx, dof_count - 1] = 1

# check how many parents we can create from the rest of the dataset
for idx in range(1, n_samples):
is_new = True
xdx = data[idx, x]
ydx = data[idx, y]
xdx = data.iloc[idx, x_col_inds]
ydx = data.iloc[idx, y_col_inds]

# comparing the current values of the subset variables to all
# already existing combinations of subset variables values
tcomp = parents_val[:dof_count, :] == k[idx, :]
tcomp = parents_val[:dof_count, :] == k.iloc[idx, :].values
for it_parents in range(dof_count):
if np.all(tcomp[it_parents, :]):
contingency_tble[xdx, ydx, it_parents] += 1
@@ -150,7 +153,7 @@ def _calculate_highdim_contingency(
# contingency table
if is_new:
dof_count += 1
parents_val = np.r_[parents_val, [k[idx, :]]]
parents_val = np.r_[parents_val, [k.iloc[idx, :]]]

# create a new contingency table and update cell counts
# using the original table up to the last value
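Switching `data` from a NumPy array to a `pandas.DataFrame` means column labels must first be resolved to positional indices before `.iloc` access. A standalone illustration of the lookup pattern used above, on a made-up dataframe:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.integers(0, 2, size=(5, 4)), columns=["x", "y", "z1", "z2"])
sep_set = ["z1", "z2"]

# resolve label-based columns to positional indices for .iloc access
sep_col_inds = [ind for ind, col in enumerate(df.columns) if col in sep_set]
x_col_inds = [ind for ind, col in enumerate(df.columns) if col == "x"]

k = df.iloc[:, sep_col_inds]  # conditioning-set values for every sample
xdx = df.iloc[0, x_col_inds]  # value of 'x' in the first sample
print(sep_col_inds, x_col_inds, k.shape, xdx.to_numpy())
```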
@@ -195,8 +198,14 @@ def _calculate_g_statistic(contingency_tble):

# compute the final term in the log
tdijk = tx.dot(ty)

# to prevent dividing by 0
tdijk += 1e-8

tlog[:, :, k] = contingency_tble[:, :, k] * nk[k] / tdijk

# to prevent logging by 0
tlog += 1e-8
log_tlog = np.log(tlog)
G2 = np.nansum(2 * contingency_tble * log_tlog)
return G2
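Both `1e-8` terms are numerical guards: the first keeps the expected-count denominator away from zero and the second keeps the argument of `np.log` positive when a contingency cell is empty. A self-contained sketch of a guarded G^2 accumulation on a toy 2x2x1 table, using the standard form G^2 = 2 * sum(O * ln(O / E)); this is an illustration, not the package code:

```python
import numpy as np

# toy 2x2x1 contingency table with one empty cell
tble = np.array([[[30.0], [10.0]],
                 [[0.0], [20.0]]])

G2 = 0.0
for k in range(tble.shape[2]):
    observed = tble[:, :, k]
    nk = observed.sum()
    tx = observed.sum(axis=1, keepdims=True)  # row marginals
    ty = observed.sum(axis=0, keepdims=True)  # column marginals
    expected = tx.dot(ty) / nk + 1e-8         # guard against a zero denominator
    ratio = observed / expected + 1e-8        # guard against log(0) for empty cells
    G2 += np.nansum(2 * observed * np.log(ratio))
print(G2)
```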
@@ -374,7 +383,7 @@ def g_square_discrete(

class GSquareCITest(BaseConditionalIndependenceTest):
def __init__(self, data_type: str = "binary"):
"""G squared CI test for discrete or binary data.
r"""G squared CI test for discrete or binary data.
For details of the test see :footcite:`Neapolitan2003`.
@@ -384,6 +393,19 @@ def __init__(self, data_type: str = "binary"):
The type of data, which can be "binary", or "discrete".
By default "binary".
Notes
-----
The G^2 test statistic requires exponentially more samples as the conditioning
set size grows. The exact requirement in this implementation for binary data
is :math:`10 \cdot 2^{|S|}`, where :math:`|S|` is the cardinality of the conditioning
set :math:`S`. For example, if S comprises three variables, then you need
at least 80 samples.
For discrete data, the requirement is :math:`|X| \cdot |Y| \cdot \prod_i |S_i|`, where
:math:`|X|` and :math:`|Y|` are the number of distinct discrete categories in
the X and Y columns, and :math:`\prod_i |S_i|` is the product of the number
of distinct categories in each of the conditioning set columns.
References
----------
.. footbibliography::
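A small hypothetical helper illustrating the sample-size requirements described in the Notes above (the name `min_samples_required` is made up for this example and is not part of the package):

```python
import numpy as np


def min_samples_required(n_sep_vars: int, levels=None) -> int:
    """Illustrative only: minimum samples suggested by the Notes above.

    Binary data needs roughly 10 * 2**|S| samples; discrete data needs
    |X| * |Y| * prod(|S_i|), where ``levels`` lists the category counts
    as (n_x, n_y, n_s1, ..., n_sk).
    """
    if levels is None:  # binary case
        return 10 * 2**n_sep_vars
    n_x, n_y, *sep_levels = levels
    return int(n_x * n_y * np.prod(sep_levels))


print(min_samples_required(3))                          # 3 binary conditioners -> 80
print(min_samples_required(3, levels=(3, 3, 2, 2, 2)))  # discrete example -> 72
```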
3 changes: 3 additions & 0 deletions dodiscover/ci/simulate.py
@@ -93,12 +93,15 @@ def nonlinear_additive_gaussian(

# compute nonlinear model
if model_type == "ci":
# X <- Z -> Y
X = nonlinear_func(freq * (Z * Azx + std * X_noise + cause_var))
Y = nonlinear_func(freq * (Z * Azy + std * Y_noise + cause_var))
elif model_type == "ind":
# X, Y, Z
X = nonlinear_func(freq * (std * X_noise + cause_var))
Y = nonlinear_func(freq * (std * Y_noise + cause_var))
elif model_type == "dep":
# X -> Y <- Z
X = nonlinear_func(freq * (std * X_noise + cause_var))
Y = nonlinear_func(freq * (2 * Axy * X + Z * Azy + std * Y_noise + cause_var))

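The new comments document the causal structure behind each `model_type`: `"ci"` simulates X <- Z -> Y (so X and Y are independent given Z), `"ind"` simulates three mutually independent variables, and `"dep"` simulates a collider X -> Y <- Z. A short usage sketch mirroring how the tests below call the simulator, assuming an integer seed is accepted for `random_state` as in those tests:

```python
import numpy as np
import pandas as pd

from dodiscover.ci.simulate import nonlinear_additive_gaussian

X, Y, Z = nonlinear_additive_gaussian(model_type="ci", n_samples=1000, random_state=12345)
df = pd.DataFrame(np.hstack((X, Y, Z)), columns=["x", "y", "z"])
# under "ci", a CI test of (x, y | z) should fail to reject independence
```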
47 changes: 0 additions & 47 deletions tests/unit_tests/cd/test_cd.py

This file was deleted.

@@ -2,7 +2,7 @@
import pandas as pd
import pytest

from dodiscover.cd import KernelCDTest
from dodiscover.cd import BregmanCDTest, KernelCDTest

seed = 12345

@@ -15,10 +15,10 @@ def single_env_scm(n_samples=200, offset=0.0):

X = rng.standard_normal((n_samples, 1)) + offset
X1 = rng.standard_normal((n_samples, 1)) + offset
Y = X + X1 + 0.5 * rng.standard_normal((n_samples, 1))
Y = X + X1 + 0.1 * rng.standard_normal((n_samples, 1))
Z = Y + 0.1 * rng.standard_normal((n_samples, 1))

# create input for the CI test
# create input for the CD test
df = pd.DataFrame(np.hstack((X, X1, Y, Z)), columns=["x", "x1", "y", "z"])

# assign groups randomly
@@ -37,17 +37,54 @@ def multi_env_scm():
return df


@pytest.mark.parametrize(
"cd_estimator",
[
KernelCDTest(),
BregmanCDTest(),
],
)
def test_cd_tests_error(cd_estimator):
x = "x"
y = "y"

sample_df = single_env_scm()
with pytest.raises(ValueError, match="The group col"):
cd_estimator.test(sample_df, {x}, {y}, group_col="blah")

with pytest.raises(ValueError, match="The x variables are not all"):
cd_estimator.test(sample_df, {"blah"}, y_vars={y}, group_col="group")

with pytest.raises(ValueError, match="The y variables are not all"):
cd_estimator.test(sample_df, {x}, y_vars={"blah"}, group_col="group")

# all the group indicators now take values different from 0/1
sample_df["group"] = sample_df["group"] + 3
with pytest.raises(RuntimeError, match="Group indications in"):
cd_estimator.test(sample_df, {x}, {y}, group_col="group")


@pytest.mark.parametrize(
["cd_func", "cd_kwargs"],
[
[BregmanCDTest, dict()],
[KernelCDTest, dict()],
[KernelCDTest, {"l2": 1e-3}],
[KernelCDTest, {"l2": (1e-3, 2e-3)}],
],
)
@pytest.mark.parametrize(
["df", "env_type"],
[
[single_env_scm(), "single"],
[multi_env_scm(), "multi"],
],
)
def test_kernel_cd(df, env_type):
"""Test Fisher Z test for Gaussian data."""
def test_cd_simulation(cd_func, df, env_type, cd_kwargs):
"""Test conditional discrepancy tests."""
random_state = 12345
cd_estimator = KernelCDTest(random_state=random_state, null_reps=100, n_jobs=-1)
print(cd_func, cd_kwargs)
cd_estimator = cd_func(random_state=random_state, null_reps=50, n_jobs=-1, **cd_kwargs)

group_col = "group"

@@ -62,12 +99,12 @@ def test_kernel_cd(df, env_type):
print(pvalue)
assert pvalue > 0.05
elif env_type == "multi":
_, pvalue = cd_estimator.test(df, {"x"}, {"x1"}, group_col=group_col)
assert pvalue < 0.05
print(pvalue)
_, pvalue = cd_estimator.test(df, {"x"}, {"z"}, group_col=group_col)
assert pvalue < 0.05
print(pvalue)
_, pvalue = cd_estimator.test(df, {"x"}, {"y"}, group_col=group_col)
assert pvalue < 0.05
print(pvalue)
_, pvalue = cd_estimator.test(df, {"x1"}, {"z"}, group_col=group_col)
assert pvalue < 0.05
print(pvalue)
File renamed without changes.
File renamed without changes.
@@ -53,7 +53,7 @@ def test_clf_with_nonlinear_cos_additive():
X, Y, Z = nonlinear_additive_gaussian(model_type="ind", n_samples=n_samples, random_state=rng)
df = pd.DataFrame(np.hstack((X, Y, Z)), columns=["x", "y", "z"])

clf = RandomForestClassifier(random_state=rng)
clf = RandomForestClassifier(random_state=rng, n_jobs=-1)
ci_estimator = ClassifierCITest(clf, random_state=rng)
_, pvalue = ci_estimator.test(df, {"x"}, {"z"})
assert pvalue > 0.05
@@ -68,11 +68,44 @@ def test_clf_with_nonlinear_cos_additive():
X, Y, Z = nonlinear_additive_gaussian(model_type="dep", n_samples=n_samples, random_state=rng)
df = pd.DataFrame(np.hstack((X, Y, Z)), columns=["x", "y", "z"])

clf = RandomForestClassifier(random_state=rng)
clf = RandomForestClassifier(random_state=rng, n_jobs=-1)
ci_estimator = ClassifierCITest(clf, random_state=rng)
_, pvalue = ci_estimator.test(df, {"x"}, {"z"})
assert pvalue > 0.05
_, pvalue = ci_estimator.test(df, {"z"}, {"y"})
assert pvalue < 0.05
_, pvalue = ci_estimator.test(df, {"x"}, {"z"}, {"y"})
assert pvalue < 0.05


def test_clfcitest_with_bootstrap():
"""Test classifier CI test with bootstrap option."""
# need a decent number of samples to fit the classifiers
n_samples = 1000

# create input for the CI test
X, Y, Z = nonlinear_additive_gaussian(
model_type="ci", n_samples=n_samples, random_state=rng, std=0.001
)
df = pd.DataFrame(np.hstack((X, Y, Z)), columns=["x", "y", "z"])

clf = RandomForestClassifier(random_state=rng, n_jobs=-1)
ci_estimator = ClassifierCITest(clf, random_state=rng, bootstrap=True, n_iter=2)
_, pvalue = ci_estimator.test(df, {"x"}, {"z"})
assert pvalue < 0.05
_, pvalue = ci_estimator.test(df, {"x"}, {"y"})
assert pvalue < 0.05
_, pvalue = ci_estimator.test(df, {"x"}, {"y"}, {"z"})
assert pvalue > 0.05

# create input for the dep test X -> Y <- Z
X, Y, Z = nonlinear_additive_gaussian(
model_type="dep", n_samples=n_samples, random_state=rng, std=0.001
)
clf = RandomForestClassifier(random_state=rng, n_jobs=-1)
ci_estimator = ClassifierCITest(clf, random_state=rng, bootstrap=True, n_iter=2)
df = pd.DataFrame(np.hstack((X, Y, Z)), columns=["x", "y", "z"])
_, pvalue = ci_estimator.test(df, {"z"}, {"y"})
assert pvalue < 0.05
_, pvalue = ci_estimator.test(df, {"x"}, {"z"}, {"y"})
assert pvalue < 0.05
File renamed without changes.
File renamed without changes.
