Feature/distances (scverse#217)

* Initial commit Adds tools/_distances.py * First prototype framework of _distances.py * Implement FastPermutationTest, Google docstrings * Restructure distances.py Split off PermutationTests into its own file * Update to fit nox requirements * Add tutorial notebooks for distance and perm tests * Init tests for distances and PermutationTest * Bugfix assertions in distance related tests * Fixes for nox * Add distances and test to usage.md * Directly apply some suggestions from code review Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net> * Merge * Add dataloader for distances_example_data. h5ad * Use distance example dataset in usage.md * Resolve most review comments * Docstrings are naturally inherited in python>=3.5 Subclass methods that overwrite a parent class method inherit the parent method's class docstring. * Store precomputed in adata, add linear MMD * Add MMD and WassersteinDistance * Batch-apply some suggestions from code review Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net> * Parallelize distance tests; add info to docstings * Reformat code to comply with nox * Move distances Signed-off-by: zethson <lukas.heumos@posteo.net> * Fix docs Signed-off-by: zethson <lukas.heumos@posteo.net> * submodule update Signed-off-by: zethson <lukas.heumos@posteo.net> --------- Signed-off-by: zethson <lukas.heumos@posteo.net> Co-authored-by: peidli <stefanpeidli@gmail.com>
eroell · Mar 23, 2023 · c437dc8 · c437dc8
1 parent e6f1b85
commit c437dc8
Show file tree

Hide file tree

Showing 16 changed files with 1,117 additions and 4 deletions.
diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
@@ -22,7 +22,7 @@ jobs:
               run: pip install -r docs/requirements.txt
 
             - name: Install pertpy
-              run: pip install .[ete3]
+              run: pip install . ete3 pyqt5
 
             - name: Build docs
               run: |

diff --git a/.gitignore b/.gitignore
@@ -143,3 +143,6 @@ dmypy.json
 *.h5ad
 *.h5md
 *.h5mu
+
+# Apple stuff
+.DS_Store
diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks
diff --git a/docs/usage/usage.md b/docs/usage/usage.md
@@ -234,6 +234,40 @@ all_results = dl.multilevel_modeling(ct_subs=ct_subs,
                                      )
 ```
 
+### Distances and Permutation Tests
+
+General purpose functions for distances and permutation tests. Reimplements
+functions from [scperturb](http://projects.sanderlab.org/scperturb/) package.
+
+```{eval-rst}
+.. currentmodule:: pertpy
+```
+
+```{eval-rst}
+.. autosummary::
+    :toctree: tools
+
+    tools.Distance
+    tools.DistanceTest
+```
+
+See [Distance tutorial](https://pertpy.readthedocs.io/en/latest/tutorials/notebooks/distances.html)
+and [Permutation test tutorial](https://pertpy.readthedocs.io/en/latest/tutorials/notebooks/distance_tests.html) for a more elaborate tutorial.
+
+```python
+import pertpy as pt
+
+adata = pt.dt.distance_example_data()
+
+# Pairwise distances
+distance = pt.tl.Distance(metric='edistance', obsm_key='X_pca')
+pairwise_edistance = distance.pairwise(adata, groupby='perturbation')
+
+# E-test (Permutation test using E-distance)
+etest = pt.tl.PermutationTest(metric='edistance', obsm_key='X_pca', correction='holm-sidak')
+tab = etest(adata, groupby='perturbation', contrast='control')
+```
+
 ### Representation
 
 ```{eval-rst}

diff --git a/pertpy/data/__init__.py b/pertpy/data/__init__.py
@@ -9,6 +9,7 @@
     datlinger_2017,
     datlinger_2021,
     dialogue_example,
+    distance_example_data,
     dixit_2016,
     dixit_2016_raw,
     dixit_2016_scperturb,

diff --git a/pertpy/data/_datasets.py b/pertpy/data/_datasets.py
@@ -373,6 +373,30 @@ def dialogue_example() -> AnnData:  # pragma: no cover
     return adata
 
 
+def distance_example_data() -> AnnData:  # pragma: no cover
+    """Example dataset used to feature distances and distance_tests.
+
+    Processed and subsetted version of original Perturb-seq dataset by Dixit et al.
+
+    Returns:
+        :class:`~anndata.AnnData` object
+    """
+    output_file_name = "distances_example_data.h5ad"
+    output_file_path = settings.datasetdir.__str__() + "/" + output_file_name
+    if not Path(output_file_path).exists():
+        _download(
+            url="https://figshare.com/ndownloader/files/39561379",
+            output_file_name=output_file_name,
+            output_path=settings.datasetdir,
+            is_zip=False,
+        )
+        adata = sc.read_h5ad(filename=settings.datasetdir.__str__() + "/" + output_file_name)
+    else:
+        adata = sc.read_h5ad(output_file_path)
+
+    return adata
+
+
 def kang_2018() -> AnnData:  # pragma: no cover
     """Processed multiplexing droplet-based single cell RNA-sequencing using genetic barcodes.
 

diff --git a/pertpy/tools/__init__.py b/pertpy/tools/__init__.py
@@ -3,6 +3,8 @@
 from pertpy.tools._augur import Augur
 from pertpy.tools._dialogue import Dialogue
 from pertpy.tools._differential_gene_expression import DifferentialGeneExpression
+from pertpy.tools._distances._distance_tests import DistanceTest
+from pertpy.tools._distances._distances import Distance
 from pertpy.tools._kernel_pca import kernel_pca
 from pertpy.tools._milo import Milo
 from pertpy.tools._mixscape import Mixscape

diff --git a/pertpy/tools/_distances/__init__.py b/pertpy/tools/_distances/__init__.py
diff --git a/pertpy/tools/_distances/_distance_tests.py b/pertpy/tools/_distances/_distance_tests.py
@@ -0,0 +1,276 @@
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+from anndata import AnnData
+from rich.progress import track
+from sklearn.metrics import pairwise_distances
+from statsmodels.stats.multitest import multipletests
+
+from ._distances import Distance
+
+
+class DistanceTest:
+    """Run permutation tests using a distance of choice between groups of cells.
+
+    Performs Monte-Carlo permutation testing using a distance metric of choice
+    as test statistic. Tests all groups of cells against a specified contrast
+    group (which normally would be your "control" cells).
+
+    Args:
+        metric: Distance metric to use.
+        n_perms: Number of permutations to run. (default: 1000)
+        obsm_key: Name of embedding to use for distance computation. (default: "X_pca")
+        alpha: Significance level. (default: 0.05)
+        correction: Multiple testing correction method. (default: "holm-sidak")
+
+    Example:
+        .. code-block:: python
+
+            import pertpy as pt
+
+            adata = pt.dt.distance_example_data()
+            etest = pt.tl.DistanceTest('edistance', n_perms=1000)
+            tab = etest(adata, groupby='perturbation', contrast='control')
+    """
+
+    def __init__(
+        self,
+        metric: str,
+        n_perms: int = 1000,
+        obsm_key: str = "X_pca",
+        alpha: float = 0.05,
+        correction: str = "holm-sidak",
+    ):
+        self.metric = metric
+        self.n_perms = n_perms
+        self.obsm_key = obsm_key
+        self.alpha = alpha
+        self.correction = correction
+
+    def __call__(
+        self, adata: AnnData, groupby: str, contrast: str, cell_wise_metric: str = "euclidean", verbose: bool = True
+    ) -> pd.DataFrame:
+        """Run a permutation test using the specified distance metric, testing
+        all groups of cells against a specified contrast group ("control").
+
+        Args:
+            adata: Annotated data matrix.
+            groupby: Key in adata.obs for grouping cells.
+            contrast: Name of the contrast group.
+            verbose: Whether to print progress. Defaults to True.
+
+        Returns:
+            pandas.DataFrame: Results of the permutation test, with columns:
+                - distance: distance between the contrast group and the group
+                - pvalue: p-value of the permutation test
+                - significant: whether the group is significantly different from the contrast group
+                - pvalue_adj: p-value after multiple testing correction
+                - significant_adj: whether the group is significantly different from the contrast group after multiple testing correction
+
+        Example:
+            .. code-block:: python
+
+                import pertpy as pt
+
+                adata = pt.dt.distance_example_data()
+                etest = pt.tl.DistanceTest('edistance', n_perms=1000)
+                tab = etest(adata, groupby='perturbation', contrast='control')
+        """
+        if Distance(self.metric, self.obsm_key).metric_fct.accepts_precomputed:
+            # Much faster if the metric can be called on the precomputed
+            # distance matrix, but not all metrics can do that.
+            return self.test_precomputed(adata, groupby, contrast, cell_wise_metric, verbose)
+        else:
+            return self.test_xy(adata, groupby, contrast, verbose)
+
+    def test_xy(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = True) -> pd.DataFrame:
+        """Run permutation test for metric not supporting precomputed distances.
+
+        Runs a permutation test for a metric that can not be computed using
+        precomputed pairwise distances, but need the actual data points. This is
+        generally slower than test_precomputed.
+
+        Args:
+            adata: Annotated data matrix.
+            groupby: Key in adata.obs for grouping cells.
+            contrast: Name of the contrast group.
+            cell_wise_metric: Metric to use for pairwise distances. Defaults to "euclidean".
+            verbose: Whether to print progress. Defaults to True.
+
+        Returns:
+            pandas.DataFrame: Results of the permutation test, with columns:
+                - distance: distance between the contrast group and the group
+                - pvalue: p-value of the permutation test
+                - significant: whether the group is significantly different from the contrast group
+                - pvalue_adj: p-value after multiple testing correction
+                - significant_adj: whether the group is significantly different from the contrast group after multiple testing correction
+        """
+        distance = Distance(self.metric, self.obsm_key)
+        groups = adata.obs[groupby].unique()
+        if contrast not in groups:
+            raise ValueError(f"Contrast group {contrast} not found in {groupby} of adata.obs.")
+        fct = track if verbose else lambda iterable: iterable
+        embedding = adata.obsm[self.obsm_key]
+
+        # Generate the null distribution
+        results = []
+        for _permutation in fct(range(self.n_perms)):
+            # per perturbation, shuffle with control and compute e-distance
+            df = pd.DataFrame(index=groups, columns=["distance"], dtype=float)
+            for group in groups:
+                if group == contrast:
+                    continue
+                # Shuffle the labels of the groups
+                mask = adata.obs[groupby].isin([group, contrast])
+                labels = adata.obs[groupby].values[mask]
+                shuffled_labels = np.random.permutation(labels)
+                idx = shuffled_labels == group
+
+                X = embedding[mask][idx]  # shuffled group
+                Y = embedding[mask][~idx]  # shuffled contrast
+                dist = distance(X, Y)
+
+                df.loc[group, "distance"] = dist
+            results.append(df.sort_index())
+
+        # Generate the empirical distribution
+        for group in groups:
+            if group == contrast:
+                continue
+            X = embedding[adata.obs[groupby] == group]
+            Y = embedding[adata.obs[groupby] == contrast]
+            df.loc[group, "distance"] = distance(X, Y)
+
+        # Evaluate the test
+        # count times shuffling resulted in larger distance
+        comparison_results = np.array(
+            pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
+        )
+        n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
+        pvalues = n_failures / self.n_perms
+
+        # Apply multiple testing correction
+        significant_adj, pvalue_adj, _, _ = multipletests(pvalues.values, alpha=self.alpha, method=self.correction)
+
+        # Aggregate results
+        tab = pd.DataFrame(
+            {
+                "distance": df["distance"],
+                "pvalue": pvalues,
+                "significant": pvalues < self.alpha,
+                "pvalue_adj": pvalue_adj,
+                "significant_adj": significant_adj,
+            },
+            index=df.index,
+        )
+
+        # Set the contrast group
+        tab.loc[contrast, "distance"] = 0
+        tab.loc[contrast, "pvalue"] = 1
+        tab.loc[contrast, "significant"] = False
+        tab.loc[contrast, "pvalue_adj"] = 1
+        tab.loc[contrast, "significant_adj"] = False
+
+        return tab
+
+    def test_precomputed(
+        self, adata: AnnData, groupby: str, contrast: str, cell_wise_metric: str = "euclidean", verbose: bool = True
+    ) -> pd.DataFrame:
+        """Run permutation test for metrics that take precomputed distances.
+
+        Args:
+            adata: Annotated data matrix.
+            groupby: Key in adata.obs for grouping cells.
+            contrast: Name of the contrast group.
+            cell_wise_metric: Metric to use for pairwise distances.
+            verbose: Whether to print progress. (default: True)
+
+        Returns:
+            pandas.DataFrame: Results of the permutation test, with columns:
+                - distance: distance between the contrast group and the group
+                - pvalue: p-value of the permutation test
+                - significant: whether the group is significantly different from the contrast group
+                - pvalue_adj: p-value after multiple testing correction
+                - significant_adj: whether the group is significantly different from the contrast group after multiple testing correction
+        """
+
+        distance = Distance(self.metric, self.obsm_key)
+        if not distance.metric_fct.accepts_precomputed:
+            raise ValueError(f"Metric {self.metric} does not accept precomputed distances.")
+
+        groups = adata.obs[groupby].unique()
+        if contrast not in groups:
+            raise ValueError(f"Contrast group {contrast} not found in {groupby} of adata.obs.")
+        fct = track if verbose else lambda iterable: iterable
+
+        # Precompute the pairwise distances
+        precomputed_distances = {}
+        for group in groups:
+            cells = adata[adata.obs[groupby].isin([group, contrast])].obsm[self.obsm_key].copy()
+            pwd = pairwise_distances(cells, cells, metric=cell_wise_metric)
+            precomputed_distances[group] = pwd
+
+        # Generate the null distribution
+        results = []
+        for _permutation in fct(range(self.n_perms)):
+            # per perturbation, shuffle with control and compute e-distance
+            df = pd.DataFrame(index=groups, columns=["distance"], dtype=float)
+            for group in groups:
+                if group == contrast:
+                    continue
+                # Shuffle the labels of the groups
+                mask = adata.obs[groupby].isin([group, contrast])
+                labels = adata.obs[groupby].values[mask]
+                shuffled_labels = np.random.permutation(labels)
+                idx = shuffled_labels == group
+
+                precomputed_distance = precomputed_distances[group]
+                distance_result = distance.metric_fct.from_precomputed(precomputed_distance, idx)
+
+                df.loc[group, "distance"] = distance_result
+            results.append(df.sort_index())
+
+        # Generate the empirical distribution
+        for group in groups:
+            if group == contrast:
+                continue
+            mask = adata.obs[groupby].isin([group, contrast])
+            labels = adata.obs[groupby].values[mask]
+            idx = labels == group
+
+            precomputed_distance = precomputed_distances[group]
+            distance_result = distance.metric_fct.from_precomputed(precomputed_distance, idx)
+
+            df.loc[group, "distance"] = distance_result
+
+        # Evaluate the test
+        # count times shuffling resulted in larger distance
+        comparison_results = np.array(
+            pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
+        )
+        n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
+        pvalues = n_failures / self.n_perms
+
+        # Apply multiple testing correction
+        significant_adj, pvalue_adj, _, _ = multipletests(pvalues.values, alpha=self.alpha, method=self.correction)
+
+        # Aggregate results
+        tab = pd.DataFrame(
+            {
+                "distance": df["distance"],
+                "pvalue": pvalues,
+                "significant": pvalues < self.alpha,
+                "pvalue_adj": pvalue_adj,
+                "significant_adj": significant_adj,
+            },
+            index=df.index,
+        )
+
+        # Set the contrast group
+        tab.loc[contrast, "distance"] = 0
+        tab.loc[contrast, "pvalue"] = 1
+        tab.loc[contrast, "significant"] = False
+        tab.loc[contrast, "pvalue_adj"] = 1
+        tab.loc[contrast, "significant_adj"] = False
+        return tab