Skip to content

Commit

Permalink
Feature/distances (scverse#217)
Browse files Browse the repository at this point in the history
* Initial commit
Adds tools/_distances.py

* First prototype framework of _distances.py

* Implement FastPermutationTest, Google docstrings

* Restructure distances.py
Split off PermutationTests into its own file

* Update to fit nox requirements

* Add tutorial notebooks for distance and perm tests

* Init tests for distances and PermutationTest

* Bugfix assertions in distance related tests

* Fixes for nox

* Add distances and test to usage.md

* Directly apply some suggestions from code review

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>

* Merge

* Add dataloader for distances_example_data.
h5ad

* Use distance example dataset in usage.md

* Resolve most review comments

* Docstrings are naturally inherited in python>=3.5
Subclass methods that overwrite a parent class method inherit the parent
method's class docstring.

* Store precomputed in adata, add linear MMD

* Add MMD and WassersteinDistance

* Batch-apply some suggestions from code review

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>

* Parallelize distance tests; add info to docstings

* Reformat code to comply with nox

* Move distances

Signed-off-by: zethson <lukas.heumos@posteo.net>

* Fix docs

Signed-off-by: zethson <lukas.heumos@posteo.net>

* submodule update

Signed-off-by: zethson <lukas.heumos@posteo.net>

---------

Signed-off-by: zethson <lukas.heumos@posteo.net>
Co-authored-by: peidli <stefanpeidli@gmail.com>
  • Loading branch information
Zethson and stefanpeidli authored Mar 23, 2023
1 parent e6f1b85 commit c437dc8
Show file tree
Hide file tree
Showing 16 changed files with 1,117 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/publish_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
run: pip install -r docs/requirements.txt

- name: Install pertpy
run: pip install .[ete3]
run: pip install . ete3 pyqt5

- name: Build docs
run: |
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,6 @@ dmypy.json
*.h5ad
*.h5md
*.h5mu

# Apple stuff
.DS_Store
2 changes: 1 addition & 1 deletion docs/tutorials/notebooks
34 changes: 34 additions & 0 deletions docs/usage/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,40 @@ all_results = dl.multilevel_modeling(ct_subs=ct_subs,
)
```

### Distances and Permutation Tests

General purpose functions for distances and permutation tests. Reimplements
functions from [scperturb](http://projects.sanderlab.org/scperturb/) package.

```{eval-rst}
.. currentmodule:: pertpy
```

```{eval-rst}
.. autosummary::
:toctree: tools
tools.Distance
tools.DistanceTest
```

See [Distance tutorial](https://pertpy.readthedocs.io/en/latest/tutorials/notebooks/distances.html)
and [Permutation test tutorial](https://pertpy.readthedocs.io/en/latest/tutorials/notebooks/distance_tests.html) for a more elaborate tutorial.

```python
import pertpy as pt

adata = pt.dt.distance_example_data()

# Pairwise distances
distance = pt.tl.Distance(metric='edistance', obsm_key='X_pca')
pairwise_edistance = distance.pairwise(adata, groupby='perturbation')

# E-test (Permutation test using E-distance)
etest = pt.tl.PermutationTest(metric='edistance', obsm_key='X_pca', correction='holm-sidak')
tab = etest(adata, groupby='perturbation', contrast='control')
```

### Representation

```{eval-rst}
Expand Down
1 change: 1 addition & 0 deletions pertpy/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
datlinger_2017,
datlinger_2021,
dialogue_example,
distance_example_data,
dixit_2016,
dixit_2016_raw,
dixit_2016_scperturb,
Expand Down
24 changes: 24 additions & 0 deletions pertpy/data/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,30 @@ def dialogue_example() -> AnnData: # pragma: no cover
return adata


def distance_example_data() -> AnnData: # pragma: no cover
"""Example dataset used to feature distances and distance_tests.
Processed and subsetted version of original Perturb-seq dataset by Dixit et al.
Returns:
:class:`~anndata.AnnData` object
"""
output_file_name = "distances_example_data.h5ad"
output_file_path = settings.datasetdir.__str__() + "/" + output_file_name
if not Path(output_file_path).exists():
_download(
url="https://figshare.com/ndownloader/files/39561379",
output_file_name=output_file_name,
output_path=settings.datasetdir,
is_zip=False,
)
adata = sc.read_h5ad(filename=settings.datasetdir.__str__() + "/" + output_file_name)
else:
adata = sc.read_h5ad(output_file_path)

return adata


def kang_2018() -> AnnData: # pragma: no cover
"""Processed multiplexing droplet-based single cell RNA-sequencing using genetic barcodes.
Expand Down
2 changes: 2 additions & 0 deletions pertpy/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from pertpy.tools._augur import Augur
from pertpy.tools._dialogue import Dialogue
from pertpy.tools._differential_gene_expression import DifferentialGeneExpression
from pertpy.tools._distances._distance_tests import DistanceTest
from pertpy.tools._distances._distances import Distance
from pertpy.tools._kernel_pca import kernel_pca
from pertpy.tools._milo import Milo
from pertpy.tools._mixscape import Mixscape
Expand Down
Empty file.
276 changes: 276 additions & 0 deletions pertpy/tools/_distances/_distance_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
from __future__ import annotations

import numpy as np
import pandas as pd
from anndata import AnnData
from rich.progress import track
from sklearn.metrics import pairwise_distances
from statsmodels.stats.multitest import multipletests

from ._distances import Distance


class DistanceTest:
"""Run permutation tests using a distance of choice between groups of cells.
Performs Monte-Carlo permutation testing using a distance metric of choice
as test statistic. Tests all groups of cells against a specified contrast
group (which normally would be your "control" cells).
Args:
metric: Distance metric to use.
n_perms: Number of permutations to run. (default: 1000)
obsm_key: Name of embedding to use for distance computation. (default: "X_pca")
alpha: Significance level. (default: 0.05)
correction: Multiple testing correction method. (default: "holm-sidak")
Example:
.. code-block:: python
import pertpy as pt
adata = pt.dt.distance_example_data()
etest = pt.tl.DistanceTest('edistance', n_perms=1000)
tab = etest(adata, groupby='perturbation', contrast='control')
"""

def __init__(
self,
metric: str,
n_perms: int = 1000,
obsm_key: str = "X_pca",
alpha: float = 0.05,
correction: str = "holm-sidak",
):
self.metric = metric
self.n_perms = n_perms
self.obsm_key = obsm_key
self.alpha = alpha
self.correction = correction

def __call__(
self, adata: AnnData, groupby: str, contrast: str, cell_wise_metric: str = "euclidean", verbose: bool = True
) -> pd.DataFrame:
"""Run a permutation test using the specified distance metric, testing
all groups of cells against a specified contrast group ("control").
Args:
adata: Annotated data matrix.
groupby: Key in adata.obs for grouping cells.
contrast: Name of the contrast group.
verbose: Whether to print progress. Defaults to True.
Returns:
pandas.DataFrame: Results of the permutation test, with columns:
- distance: distance between the contrast group and the group
- pvalue: p-value of the permutation test
- significant: whether the group is significantly different from the contrast group
- pvalue_adj: p-value after multiple testing correction
- significant_adj: whether the group is significantly different from the contrast group after multiple testing correction
Example:
.. code-block:: python
import pertpy as pt
adata = pt.dt.distance_example_data()
etest = pt.tl.DistanceTest('edistance', n_perms=1000)
tab = etest(adata, groupby='perturbation', contrast='control')
"""
if Distance(self.metric, self.obsm_key).metric_fct.accepts_precomputed:
# Much faster if the metric can be called on the precomputed
# distance matrix, but not all metrics can do that.
return self.test_precomputed(adata, groupby, contrast, cell_wise_metric, verbose)
else:
return self.test_xy(adata, groupby, contrast, verbose)

def test_xy(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = True) -> pd.DataFrame:
"""Run permutation test for metric not supporting precomputed distances.
Runs a permutation test for a metric that can not be computed using
precomputed pairwise distances, but need the actual data points. This is
generally slower than test_precomputed.
Args:
adata: Annotated data matrix.
groupby: Key in adata.obs for grouping cells.
contrast: Name of the contrast group.
cell_wise_metric: Metric to use for pairwise distances. Defaults to "euclidean".
verbose: Whether to print progress. Defaults to True.
Returns:
pandas.DataFrame: Results of the permutation test, with columns:
- distance: distance between the contrast group and the group
- pvalue: p-value of the permutation test
- significant: whether the group is significantly different from the contrast group
- pvalue_adj: p-value after multiple testing correction
- significant_adj: whether the group is significantly different from the contrast group after multiple testing correction
"""
distance = Distance(self.metric, self.obsm_key)
groups = adata.obs[groupby].unique()
if contrast not in groups:
raise ValueError(f"Contrast group {contrast} not found in {groupby} of adata.obs.")
fct = track if verbose else lambda iterable: iterable
embedding = adata.obsm[self.obsm_key]

# Generate the null distribution
results = []
for _permutation in fct(range(self.n_perms)):
# per perturbation, shuffle with control and compute e-distance
df = pd.DataFrame(index=groups, columns=["distance"], dtype=float)
for group in groups:
if group == contrast:
continue
# Shuffle the labels of the groups
mask = adata.obs[groupby].isin([group, contrast])
labels = adata.obs[groupby].values[mask]
shuffled_labels = np.random.permutation(labels)
idx = shuffled_labels == group

X = embedding[mask][idx] # shuffled group
Y = embedding[mask][~idx] # shuffled contrast
dist = distance(X, Y)

df.loc[group, "distance"] = dist
results.append(df.sort_index())

# Generate the empirical distribution
for group in groups:
if group == contrast:
continue
X = embedding[adata.obs[groupby] == group]
Y = embedding[adata.obs[groupby] == contrast]
df.loc[group, "distance"] = distance(X, Y)

# Evaluate the test
# count times shuffling resulted in larger distance
comparison_results = np.array(
pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
)
n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
pvalues = n_failures / self.n_perms

# Apply multiple testing correction
significant_adj, pvalue_adj, _, _ = multipletests(pvalues.values, alpha=self.alpha, method=self.correction)

# Aggregate results
tab = pd.DataFrame(
{
"distance": df["distance"],
"pvalue": pvalues,
"significant": pvalues < self.alpha,
"pvalue_adj": pvalue_adj,
"significant_adj": significant_adj,
},
index=df.index,
)

# Set the contrast group
tab.loc[contrast, "distance"] = 0
tab.loc[contrast, "pvalue"] = 1
tab.loc[contrast, "significant"] = False
tab.loc[contrast, "pvalue_adj"] = 1
tab.loc[contrast, "significant_adj"] = False

return tab

def test_precomputed(
self, adata: AnnData, groupby: str, contrast: str, cell_wise_metric: str = "euclidean", verbose: bool = True
) -> pd.DataFrame:
"""Run permutation test for metrics that take precomputed distances.
Args:
adata: Annotated data matrix.
groupby: Key in adata.obs for grouping cells.
contrast: Name of the contrast group.
cell_wise_metric: Metric to use for pairwise distances.
verbose: Whether to print progress. (default: True)
Returns:
pandas.DataFrame: Results of the permutation test, with columns:
- distance: distance between the contrast group and the group
- pvalue: p-value of the permutation test
- significant: whether the group is significantly different from the contrast group
- pvalue_adj: p-value after multiple testing correction
- significant_adj: whether the group is significantly different from the contrast group after multiple testing correction
"""

distance = Distance(self.metric, self.obsm_key)
if not distance.metric_fct.accepts_precomputed:
raise ValueError(f"Metric {self.metric} does not accept precomputed distances.")

groups = adata.obs[groupby].unique()
if contrast not in groups:
raise ValueError(f"Contrast group {contrast} not found in {groupby} of adata.obs.")
fct = track if verbose else lambda iterable: iterable

# Precompute the pairwise distances
precomputed_distances = {}
for group in groups:
cells = adata[adata.obs[groupby].isin([group, contrast])].obsm[self.obsm_key].copy()
pwd = pairwise_distances(cells, cells, metric=cell_wise_metric)
precomputed_distances[group] = pwd

# Generate the null distribution
results = []
for _permutation in fct(range(self.n_perms)):
# per perturbation, shuffle with control and compute e-distance
df = pd.DataFrame(index=groups, columns=["distance"], dtype=float)
for group in groups:
if group == contrast:
continue
# Shuffle the labels of the groups
mask = adata.obs[groupby].isin([group, contrast])
labels = adata.obs[groupby].values[mask]
shuffled_labels = np.random.permutation(labels)
idx = shuffled_labels == group

precomputed_distance = precomputed_distances[group]
distance_result = distance.metric_fct.from_precomputed(precomputed_distance, idx)

df.loc[group, "distance"] = distance_result
results.append(df.sort_index())

# Generate the empirical distribution
for group in groups:
if group == contrast:
continue
mask = adata.obs[groupby].isin([group, contrast])
labels = adata.obs[groupby].values[mask]
idx = labels == group

precomputed_distance = precomputed_distances[group]
distance_result = distance.metric_fct.from_precomputed(precomputed_distance, idx)

df.loc[group, "distance"] = distance_result

# Evaluate the test
# count times shuffling resulted in larger distance
comparison_results = np.array(
pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
)
n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
pvalues = n_failures / self.n_perms

# Apply multiple testing correction
significant_adj, pvalue_adj, _, _ = multipletests(pvalues.values, alpha=self.alpha, method=self.correction)

# Aggregate results
tab = pd.DataFrame(
{
"distance": df["distance"],
"pvalue": pvalues,
"significant": pvalues < self.alpha,
"pvalue_adj": pvalue_adj,
"significant_adj": significant_adj,
},
index=df.index,
)

# Set the contrast group
tab.loc[contrast, "distance"] = 0
tab.loc[contrast, "pvalue"] = 1
tab.loc[contrast, "significant"] = False
tab.loc[contrast, "pvalue_adj"] = 1
tab.loc[contrast, "significant_adj"] = False
return tab
Loading

0 comments on commit c437dc8

Please sign in to comment.