Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

75 too high freqs #76

Merged
merged 6 commits into from
Jan 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion covigator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION = "v1.1.1"
VERSION = "v1.1.2"
ANALYSIS_PIPELINE_VERSION = "v0.15.0"

MISSENSE_VARIANT = "missense_variant"
Expand Down
27 changes: 17 additions & 10 deletions covigator/database/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pandas as pd
from logzero import logger
import sqlalchemy
from sqlalchemy import and_, desc, asc, func, String, DateTime
from sqlalchemy import and_, desc, asc, func, String, DateTime, text
from sqlalchemy.engine.default import DefaultDialect
from sqlalchemy.orm import Session, aliased
from sqlalchemy.sql.sqltypes import NullType
Expand Down Expand Up @@ -467,13 +467,16 @@ def get_last_update(self, data_source: DataSource) -> date:
def get_variant_counts_by_month(self, variant_id, source: str) -> pd.DataFrame:

klass = self.get_variant_observation_klass(source=source)
sample_klass = self.get_sample_klass(source=source)
sql_query_ds_ena = """
select count(*) as count, variant_id, date_trunc('month', date::timestamp) as month
from {variant_observation_table}
where variant_id='{variant_id}'
where variant_id='{variant_id}'
and sample in (select run_accession from {sample_table} where status='FINISHED')
group by variant_id, date_trunc('month', date::timestamp);
""".format(
variant_observation_table=klass.__tablename__,
sample_table=sample_klass.__tablename__,
variant_id=variant_id
)
data = pd.read_sql_query(sql_query_ds_ena, self.session.bind)
Expand All @@ -482,12 +485,18 @@ def get_variant_counts_by_month(self, variant_id, source: str) -> pd.DataFrame:

def get_sample_counts_by_month(self, source: str) -> pd.DataFrame:
klass = self.get_sample_klass(source=source)
query = self.session.query(
func.date_trunc('month', klass.collection_date).label("month"),
func.count().label("sample_count"))\
.filter(klass.status == JobStatus.FINISHED.name) \
.group_by(func.date_trunc('month', klass.collection_date))
counts = pd.read_sql(query.statement, self.session.bind)
# NOTE: this query was originally written with the SQLAlchemy query API, but func.date_trunc returned different
# results from the raw SQL below, which casts collection_date to timestamp before truncating. Do not change it back!
query = """
select date_trunc('month', collection_date::timestamp) as month,
count(*) as sample_count
from {table}
where status='FINISHED'
group by date_trunc('month', collection_date::timestamp);
""".format(
table=klass.__tablename__
)
counts = pd.read_sql(text(query), self.session.bind)
counts['month'] = pd.to_datetime(counts['month'], utc=True)
return counts

Expand Down Expand Up @@ -550,8 +559,6 @@ def get_top_occurring_variants_precomputed(

# formats the DNA mutation
top_occurring_variants.rename(columns={'variant_id': 'dna_mutation'}, inplace=True)
top_occurring_variants["frequency_by_month"] = top_occurring_variants.frequency

# pivots the table over months
top_occurring_variants = pd.pivot_table(
top_occurring_variants, index=['gene_name', 'dna_mutation', 'hgvs_p', 'annotation', "frequency", "total"],
Expand Down
9 changes: 6 additions & 3 deletions covigator/precomputations/load_top_occurrences.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pandas as pd
from logzero import logger
from sqlalchemy import func, desc
from sqlalchemy import func, desc, and_
from sqlalchemy.orm import Session
from covigator import SYNONYMOUS_VARIANT
from covigator.database.model import DataSource, PrecomputedOccurrence
from covigator.database.model import DataSource, PrecomputedOccurrence, JobStatus
from covigator.database.queries import Queries


Expand Down Expand Up @@ -72,11 +72,14 @@ def _row_to_top_occurrence(self, row, source):
)

def get_top_occurring_variants(self, top, source: str):
sample_klass = self.queries.get_sample_klass(source=source)
subquery = self.session.query(sample_klass.run_accession).filter(sample_klass.status == JobStatus.FINISHED)

klass = self.queries.get_variant_observation_klass(source)
query = self.session.query(
klass.variant_id, klass.hgvs_p, klass.gene_name, klass.pfam_name,
klass.annotation_highest_impact, func.count().label('total')) \
.filter(klass.annotation_highest_impact != SYNONYMOUS_VARIANT)
.filter(and_(klass.annotation_highest_impact != SYNONYMOUS_VARIANT, klass.sample.in_(subquery)))
query = query.group_by(klass.variant_id, klass.hgvs_p, klass.gene_name,
klass.pfam_name, klass.annotation_highest_impact) \
.order_by(desc('total')).limit(top)
Expand Down
30 changes: 18 additions & 12 deletions covigator/tests/unit_tests/mocked.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from covigator import MISSENSE_VARIANT, SYNONYMOUS_VARIANT, INFRAME_INSERTION, INFRAME_DELETION
from covigator.database.model import SampleEna, DataSource, JobStatus, Log, CovigatorModule, Variant, \
VariantCooccurrence, VariantType
VariantCooccurrence, VariantType, SampleCovid19Portal, VariantCovid19Portal
from Bio.Alphabet.IUPAC import IUPACData

from covigator.database.queries import Queries
Expand Down Expand Up @@ -59,10 +59,10 @@ def get_mocked_variant(faker: Faker, chromosome=None, gene_name=None, source=Dat


def get_mocked_variant_observation(
sample: Union[SampleEna], variant: Union[Variant], faker=Faker()):
sample: Union[SampleEna, SampleCovid19Portal], variant: Union[Variant, VariantCovid19Portal], faker=Faker()):

klass = Queries.get_variant_observation_klass(
DataSource.ENA.name if isinstance(sample, SampleEna) else DataSource.GISAID.name)
DataSource.ENA.name if isinstance(sample, SampleEna) else DataSource.COVID19_PORTAL.name)
return klass(
sample=sample.run_accession if sample else faker.unique.uuid4(),
variant_id=variant.variant_id,
Expand Down Expand Up @@ -130,19 +130,25 @@ def get_mocked_variant_cooccurrence(faker: Faker, variant_one: Variant, variant_
return cooccurrence


def mock_samples_and_variants(faker, session: Session, num_samples=10):
existing_variants = {DataSource.ENA.name: set(), DataSource.GISAID.name: set()}
samples = mock_samples(faker=faker, session=session, num_samples=num_samples, source=DataSource.ENA)
for sample in samples:
source = DataSource.ENA
variants = [get_mocked_variant(faker=faker, source=source.name, session=session) for _ in range(10)]
# this aims at removing potentially repeated variants
def mock_samples_and_variants(faker, session: Session, num_samples=10, source = DataSource.ENA):

existing_variants = set()
samples = mock_samples(faker=faker, session=session, num_samples=num_samples, source=source)
# introduce some samples that did not finish processing but still have variant observations
failed_samples = mock_samples(faker=faker, session=session, num_samples=num_samples, source=source,
job_status=JobStatus.FAILED_PROCESSING)
# introduce a variant that is shared by all samples (e.g. 23403:A>G)
shared_variant = get_mocked_variant(faker=faker, source=source.name, session=session)
for sample in samples + failed_samples:
variants = [get_mocked_variant(faker=faker, source=source.name, session=session) for _ in range(9)] + \
[shared_variant]
# NOTE: de-duplicate the mocked variants by variant_id, as the faker may generate the same variant twice
variants_dict = {v.variant_id: v for v in variants}
variants = variants_dict.values()
new_variants = list(filter(lambda x: x.variant_id not in existing_variants.get(source.name), variants))
new_variants = list(filter(lambda x: x.variant_id not in existing_variants, variants))
session.add_all(new_variants)
session.commit()
existing_variants.get(source.name).update([v.variant_id for v in variants])
existing_variants.update([v.variant_id for v in variants])

variants_observations = [get_mocked_variant_observation(faker=faker, variant=v, sample=sample)
for v in variants]
Expand Down
16 changes: 7 additions & 9 deletions covigator/tests/unit_tests/test_precomputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from covigator.database.model import PrecomputedSynonymousNonSynonymousCounts, RegionType, DataSource, \
PrecomputedOccurrence, PrecomputedVariantsPerSample, PrecomputedSubstitutionsCounts, PrecomputedIndelLength, \
PrecomputedAnnotation, PrecomputedVariantAbundanceHistogram, PrecomputedVariantsPerLineage, VariantCooccurrence
PrecomputedAnnotation, PrecomputedVariantAbundanceHistogram, PrecomputedVariantsPerLineage
from covigator.precomputations.load_cooccurrences import CooccurrenceMatrixLoader
from covigator.precomputations.load_ns_s_counts import NsSCountsLoader
from covigator.precomputations.load_top_occurrences import TopOccurrencesLoader
Expand All @@ -16,14 +16,11 @@
class TestPrecomputer(AbstractTest):

def setUp(self) -> None:
mock_samples_and_variants(session=self.session, faker=self.faker, num_samples=100)
self.ns_counts_loader = NsSCountsLoader(session=self.session)
self.top_occurrences_loader = TopOccurrencesLoader(session=self.session)
mock_samples_and_variants(session=self.session, faker=self.faker, num_samples=100, source=DataSource.ENA)
self.precomputations_loader = PrecomputationsLoader(session=self.session)
self.precomputations_lineage = VariantsPerLineageLoader(session=self.session)

def test_load_dn_ds(self):
self.ns_counts_loader.load()
NsSCountsLoader(session=self.session).load()
for g in MOCKED_GENES:
self.assertGreater(
self.session.query(PrecomputedSynonymousNonSynonymousCounts).filter(
Expand Down Expand Up @@ -79,17 +76,18 @@ def test_load_dn_ds(self):
PrecomputedSynonymousNonSynonymousCounts.region_name == d)).count(),
0)

def test_load_table_counts(self):
def test_load_top_occurrent_mutations(self):
self.assertEqual(self.session.query(PrecomputedOccurrence).count(), 0)
self.precomputations_loader.load_table_counts() # table counts precomputations are needed
self.top_occurrences_loader.load()
TopOccurrencesLoader(session=self.session).load()
self.assertGreater(self.session.query(PrecomputedOccurrence).count(), 0)
for g in MOCKED_GENES:
occurrences = self.session.query(PrecomputedOccurrence).filter(PrecomputedOccurrence.gene_name == g).all()
self.assertGreater(len(occurrences), 0)
for o in occurrences:
self.assertGreater(o.total, 0)
self.assertGreater(o.frequency, 0.0)
self.assertLessEqual(o.frequency, 1.0)
self.assertIsNotNone(o.variant_id)
self.assertIsNotNone(o.gene_name)
self.assertIsNotNone(o.domain)
Expand Down Expand Up @@ -154,7 +152,7 @@ def test_load_variant_abundance_histogram(self):

def test_load_variants_per_lineage(self):
self.assertEqual(self.session.query(PrecomputedVariantsPerLineage).count(), 0)
self.precomputations_lineage.load()
VariantsPerLineageLoader(session=self.session).load()
self.assertGreater(self.session.query(PrecomputedVariantsPerLineage).count(), 0)
for p in self.session.query(PrecomputedVariantsPerLineage).all():
self.assertGreater(p.count_observations, 0)
Expand Down