Skip to content

Commit

Permalink
Various fixes and improvements. Added SpecIA and implemented missing …
Browse files Browse the repository at this point in the history
…features
  • Loading branch information
mmtechslv committed Apr 23, 2022
1 parent bf765d2 commit b3a2f04
Show file tree
Hide file tree
Showing 14 changed files with 180 additions and 23 deletions.
5 changes: 4 additions & 1 deletion pmaf/biome/essentials/_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from pmaf.biome.essentials._metakit import (
EssentialFeatureMetabase,
EssentialSampleMetabase,
EssentialBackboneMetabase,
EssentialBackboneMetabase
)
from pmaf.biome.essentials._tree import RepPhylogeny
import numpy as np
from typing import List, Any, Optional
from pmaf.internal._typing import AnyGenericIdentifier
Expand Down Expand Up @@ -140,6 +141,8 @@ def __append_essential(self, essential: EssentialBackboneMetabase) -> None:
else:
raise ValueError("Essentials must have same ids at both axes.")
if essential_pass:
if isinstance(essential, RepPhylogeny):
essential._patch_xrid_dtype(self.__feature_ids.dtype)
essential._mount_controller(self)
essential._buckle()
self.__essentials.append(essential)
Expand Down
5 changes: 4 additions & 1 deletion pmaf/biome/essentials/_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def copy(self) -> "RepPhylogeny":
"""Copy of the instance."""
return type(self)(
tree=self.__internal_tree,
feature_ids=self.__feature_ids,
feature_ids=self.xrid,
annotation=self.__annotations,
copy=True,
metadata=self.metadata,
Expand Down Expand Up @@ -334,6 +334,9 @@ def get_subset(
name=self.name,
)

def _patch_xrid_dtype(self, dtype):
    """Override the dtype used for this phylogeny's feature ids.

    Called by the essentials controller when the phylogeny is appended,
    so that the tree's feature-id dtype matches the controller's
    feature ids.

    Parameters
    ----------
    dtype
        Target numpy dtype of the controller's feature ids.
    """
    # NOTE(review): only the stored dtype is updated here; presumably the
    # `xrid` property casts the underlying feature ids using this dtype —
    # confirm against the rest of RepPhylogeny.
    self.__feature_ids_dtype = dtype

def write(self, output_fp: str, mode: str = "w", **kwargs: Any) -> None:
"""Writes the Newick tree into specified file.
Expand Down
4 changes: 2 additions & 2 deletions pmaf/database/_core/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
warnings.simplefilter("ignore", category=FutureWarning)
from pmaf.database._metakit import DatabaseBackboneMetabase
from pmaf.database._manager import DatabaseStorageManager
from pmaf.internal._shared import get_rank_upto
from pmaf.internal._shared import get_rank_upto, sort_ranks
from pmaf.database._shared._common import to_mode
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -431,7 +431,7 @@ def find_tid_by_rid(
else:
target_ranks = np.asarray(levels)

target_unique_ranks = np.unique(target_ranks)
target_unique_ranks = np.asarray(sort_ranks(np.unique(target_ranks)))
target_unique_ids = np.unique(target_ids)
if self.xrid.isin(target_unique_ids).sum() == len(target_unique_ids):
map2tid = self.storage_manager.retrieve_data_by_element("map-rep2tid")
Expand Down
30 changes: 22 additions & 8 deletions pmaf/database/_manifest/_otl.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import typing
from os import path
from pmaf.database._core._base import DatabaseBase
from pmaf.database._core._tax_base import DatabaseTaxonomyMixin
Expand All @@ -10,7 +11,22 @@
import numpy as np
from typing import Any, Tuple
from pmaf.internal._typing import AnyGenericIdentifier

from pmaf.database._parsers._phylo import read_newick_tree
from ete3 import Tree
import re

regex_mrca_tags = re.compile("mrcaott[0-9]+ott[0-9]+")
regex_ott_tags = re.compile("ott([0-9]+)")


def get_tree_leafs(tree_newick_fp):
    """Return the named leaf labels of the Newick tree at *tree_newick_fp*.

    OTL-specific ``mrcaott<N>ott<M>`` tags are removed and ``ott<N>``
    labels are reduced to their bare numeric part before parsing;
    unnamed leaves are excluded from the result.
    """
    raw_newick = read_newick_tree(tree_newick_fp)
    # Drop MRCA tags first, then strip the "ott" prefix from leaf labels.
    without_mrca = regex_mrca_tags.sub("", raw_newick)
    cleaned_newick = regex_ott_tags.sub("\\1", without_mrca)
    parsed_tree = Tree(cleaned_newick, format=8)
    return [leaf.name for leaf in parsed_tree.iter_leaves() if leaf.name != ""]

class DatabaseOTL(
DatabaseTaxonomyMixin, DatabasePhylogenyMixin, DatabaseAccessionMixin, DatabaseBase
Expand Down Expand Up @@ -80,9 +96,10 @@ def build_database_storage(
storage_name=cls.DATABASE_NAME,
force_new=force,
)
tax_ids_in_tree = get_tree_leafs(tree_newick_fp)

removed_rids, novel_tids, index_mapper, tmp_recap = cls.__process_tax_acs_map(
tmp_storage_manager, taxonomy_map_csv_fp, delimiter
tmp_storage_manager, taxonomy_map_csv_fp, delimiter, tax_ids_in_tree
)
cls.__process_tree(tmp_storage_manager, tree_newick_fp, index_mapper)

Expand All @@ -102,6 +119,7 @@ def __process_tax_acs_map(
storage_manager: DatabaseStorageManager,
taxonomy_map_csv_fp: str,
delimiter: str,
tax_ids_in_tree: typing.List[str]
) -> Tuple[AnyGenericIdentifier, np.ndarray, pd.Series, pd.Series]:
from pmaf.internal._extensions import cython_functions
from pmaf.internal._constants import VALID_RANKS
Expand Down Expand Up @@ -223,6 +241,8 @@ def produce_map_rep2tid(transformation_details):
tmp_taxonomy_sheet_master.index = tmp_taxonomy_sheet_master.index.astype(
full_taxonomy_map.index.dtype
)
valid_tax_ids = tmp_taxonomy_sheet_master.index[tmp_taxonomy_sheet_master.index.isin(tax_ids_in_tree)]
tmp_taxonomy_sheet_master = tmp_taxonomy_sheet_master.loc[valid_tax_ids]
index_mapper = transformer.make_rid_index_mapper(
tmp_taxonomy_sheet_master.index
)
Expand Down Expand Up @@ -287,12 +307,6 @@ def __process_tree(
index_mapper
Index renamer/mapper
"""
from pmaf.database._parsers._phylo import read_newick_tree
from ete3 import Tree
import re

regex_mrca_tags = re.compile("mrcaott[0-9]+ott[0-9]+")
regex_ott_tags = re.compile("ott([0-9]+)")

def produce_tree_prior(tree_newick_fp):
yield None, None
Expand Down
35 changes: 32 additions & 3 deletions pmaf/pipe/agents/mediators/_local/_components/_acs_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from pmaf.pipe.agents.dockers._metakit import DockerIdentifierMetabase
from pmaf.pipe.factors._base import FactorBase
import numpy as np
import pandas as pd
import itertools
from collections import defaultdict
from typing import Optional, Any

Expand Down Expand Up @@ -168,7 +170,7 @@ def __filter_rids_from_tids_accessions(self, accs_dict):
ret = {k: tuple(v) for k, v in tmp_accs_dict.items()}
return ret, tmp_metadata_dict

def get_identifier_by_accession(self, docker, factor, **kwargs): # TODO: This method is not yet implemented so fix it.
def get_identifier_by_accession(self, docker, factor, **kwargs):
"""Get local database identifiers that match target accession numbers
in `docker` within local database client.
Expand All @@ -187,6 +189,33 @@ def get_identifier_by_accession(self, docker, factor, **kwargs): # TODO: This me
-------
An instance of :class:`.DockerIdentifierMedium` with matching accessions.
THIS METHOD IS CURRENTLY NOT IMPLEMENTED
"""
raise NotImplementedError
db_acs_df = pd.DataFrame.from_dict(self.client.get_accession_by_rid(iterator=False), orient="index")
acs_no_empty = docker.get_subset(exclude_missing=True)
compatible_mask = db_acs_df.columns.isin(acs_no_empty.sources)
if compatible_mask.sum() >= 1:
selected_acs_src = db_acs_df.columns[compatible_mask][0] # First one is client's
else:
raise ValueError("No compatible accession sources were found")
db_acs_df_full = db_acs_df[selected_acs_src]
db_acs_df_major = db_acs_df_full.str.extract("^(.*?)\..*$").iloc[:, 0]
flat_target_acs = pd.Index([ac for ac in set([ac for acm in acs_no_empty.data.values() for ac in (acm.get(selected_acs_src,[]) if isinstance(acm, dict) else [])]) if ac is not None])
db_full_matches = db_acs_df_full[db_acs_df_full.isin(flat_target_acs)]
db_major_matches = db_acs_df_major[db_acs_df_major.isin(flat_target_acs)]
db_major_matches = db_major_matches[~db_major_matches.index.isin(db_full_matches.index)] # Exclude matches from full_matches
ref_matches = db_full_matches.append(db_major_matches, verify_integrity=True)
acs_as_id = acs_no_empty.to_identifier_by_src(selected_acs_src)
db_id_dict = {}
for ix, acid in acs_as_id.get_iterator():
acid_acs_flat = acid.to_array(exclude_missing=True, unique=True)
acid_acs_rids = ref_matches.index[ref_matches.isin(acid_acs_flat)].tolist()
acid_acs_tids_df = self.client.find_tid_by_rid(acid_acs_rids, method='legal')
acid_acs_tids_last_valids = acid_acs_tids_df.map(lambda x: x[-1]).astype(int).unique()
db_id_dict[ix] = DockerIdentifierMedium(acid_acs_tids_last_valids)
return DockerIdentifierMedium(db_id_dict)






17 changes: 17 additions & 0 deletions pmaf/pipe/agents/miners/_metakit.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,23 @@ def yield_accession_by_identifier(self, docker, **kwargs):
**kwargs :
Returns
-------
"""
pass

@abstractmethod
def yield_identifier_by_accession(self, docker, **kwargs):
"""
Parameters
----------
docker :
**kwargs :
Returns
-------
Expand Down
26 changes: 26 additions & 0 deletions pmaf/pipe/agents/miners/_miner.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,32 @@ def yield_sequence_by_identifier(
else:
raise TypeError("`docker` must be instance of DockerIdentifierMetabase.")

def yield_identifier_by_accession(
    self, docker: DockerAccessionMedium, **kwargs: Any
) -> DockerIdentifierMedium:
    """Yield identifier :term:`dockers<docker>` for the accessions in
    the given `docker`.

    Parameters
    ----------
    docker
        The input accession :term:`docker`
    kwargs
        Compatibility

    Yields
    ------
    Identifier :term:`docker`

    Raises
    ------
    RuntimeError
        If the attached mediator cannot serve accession requests.
    TypeError
        If `docker` is not an accession docker.
    """
    if isinstance(docker, DockerAccessionMetabase):
        if isinstance(self.mediator, MediatorAccessionMetabase):
            yield from self.__yield_identifier_by_accession(docker, **kwargs)
        else:
            raise RuntimeError("`mediator` does not support such request.")
    else:
        # Fixed: the message previously named DockerIdentifierMetabase,
        # but the isinstance check above requires DockerAccessionMetabase.
        raise TypeError("`docker` must be instance of DockerAccessionMetabase.")

def yield_accession_by_identifier(
self, docker: DockerIdentifierMedium, **kwargs: Any
) -> DockerAccessionMedium:
Expand Down
1 change: 1 addition & 0 deletions pmaf/pipe/specs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,5 @@
"SpecTI",
"SpecIT",
"SpecIA",
"SpecAI"
]
8 changes: 6 additions & 2 deletions pmaf/pipe/specs/_inventory/_composite/_forge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from pmaf.pipe.specs._metakit import SpecificationBackboneMetabase


def ForgeSpec(name: str, *inters: SpecificationBackboneMetabase) -> SpecificationBackboneMetabase:
def ForgeSpec(name: str, *inters: SpecificationBackboneMetabase) -> type:
"""Function to forge a new :term:`spec` from intermediate
:term:`specs<spec>`
Expand Down Expand Up @@ -49,6 +49,10 @@ def __init__(self, *args, **kwargs):
raise RuntimeError("Forged specification is incorrect.")
else:
tmp_steps.append((inter.__name__, inter, last_outlet, str(inter)))
self._inlet = tmp_specs[0].inlet
self._outlet = tmp_specs[-1].outlet
SpecificationCompositeBase.__init__(self, _specs=tmp_specs, _steps=tmp_steps)

return type(name, (SpecificationCompositeBase,), {"__init__": __init__})
return type(name, (SpecificationCompositeBase,), {"__init__": __init__,
"inlet": property(lambda self: self._inlet),
"outlet": property(lambda self: self._outlet)})
3 changes: 2 additions & 1 deletion pmaf/pipe/specs/_inventory/_primitive/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
from ._ti import SpecTI
from ._it import SpecIT
from ._ia import SpecIA
from ._ai import SpecAI

__all__ = ['SpecIP', 'SpecIS', 'SpecTI', 'SpecIT', 'SpecIA']
__all__ = ['SpecIP', 'SpecIS', 'SpecTI', 'SpecIT', 'SpecIA', 'SpecAI']
50 changes: 50 additions & 0 deletions pmaf/pipe/specs/_inventory/_primitive/_ai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from ._base import SpecificationPrimitiveBase
from pmaf.pipe.agents.mediators._metakit import MediatorAccessionMetabase
from pmaf.pipe.factors._metakit import FactorBackboneMetabase
from pmaf.pipe.agents.miners._miner import Miner
from pmaf.pipe.agents.dockers._mediums._id_medium import DockerIdentifierMedium
from pmaf.pipe.agents.dockers._mediums._acs_medium import DockerAccessionMedium
from pmaf.pipe.agents.dockers._metakit import DockerBackboneMetabase

class SpecAI(SpecificationPrimitiveBase):
    """Primitive :term:`spec`: Accessions -> Identifiers."""

    def __init__(self, mediator, factor, **kwargs):
        """Create the spec and validate its mediator/factor pair.

        Parameters
        ----------
        mediator
            Mediator able to serve accession requests
            (:class:`MediatorAccessionMetabase`).
        factor
            Factor supported by the mediator's database.
        kwargs
            Passed through to the underlying :class:`.Miner`.
        """
        if not isinstance(mediator, MediatorAccessionMetabase):
            raise TypeError('`mediator` must be instance of MediatorAccessionMetabase.')
        if isinstance(factor, FactorBackboneMetabase):
            if not mediator.verify_factor(factor):
                raise ValueError('`factor` is not supported by database.')
        else:
            raise TypeError('`factor` has invalid type.')
        tmp_miner = Miner(mediator=mediator, factor=factor, **kwargs)
        tmp_steps = self.__define_lazy_steps()
        super().__init__(_steps=tmp_steps, _miner=tmp_miner)

    def __define_lazy_steps(self):
        # Ordered pipeline: verify the input docker, then resolve
        # accessions into database identifiers.
        steps_dict = [('verify-input', self.__checkpoint_verify_input, DockerAccessionMedium, 'Verify Input.'),
                      ('accession-to-identifier', self.__checkpoint_accession_to_identifier, DockerIdentifierMedium, 'Retrieve identifiers by accessions .')]
        return steps_dict

    def __checkpoint_verify_input(self, input, *args, **kwargs):
        # Wrap raw input into an accession docker if necessary.
        if not isinstance(input, DockerBackboneMetabase):
            tmp_docker = DockerAccessionMedium(input, **kwargs)
        else:
            tmp_docker = input
        if self.miner.verify_docker(tmp_docker):
            return tmp_docker, args, kwargs
        else:
            raise ValueError('`docker` is not supported by current specification.')

    def __checkpoint_accession_to_identifier(self, docker, *args, **kwargs):
        # Delegate to the miner; only the first yielded docker is used.
        identifiers = next(self.miner.yield_identifier_by_accession(docker, **kwargs))
        return identifiers, args, kwargs

    @property
    def inlet(self):
        # Fixed: docstrings for inlet/outlet were swapped relative to the
        # returned types (inlet consumes accessions, outlet emits identifiers).
        """:class:`.DockerAccessionMedium`"""
        return DockerAccessionMedium

    @property
    def outlet(self):
        """:class:`.DockerIdentifierMedium`"""
        return DockerIdentifierMedium
9 changes: 4 additions & 5 deletions pmaf/pipe/specs/_inventory/_primitive/_ia.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pmaf.pipe.agents.miners._miner import Miner
from pmaf.pipe.agents.dockers._mediums._id_medium import DockerIdentifierMedium
from pmaf.pipe.agents.dockers._mediums._acs_medium import DockerAccessionMedium
from pmaf.pipe.agents.dockers._metakit import DockerIdentifierMetabase,DockerBackboneMetabase
from pmaf.pipe.agents.dockers._metakit import DockerBackboneMetabase

class SpecIA(SpecificationPrimitiveBase):
"""Identifiers -> Accessions."""
Expand All @@ -25,21 +25,20 @@ def __define_lazy_steps(self):
('identifier-to-accession', self.__checkpoint_identifier_to_accession, DockerAccessionMedium, 'Retrieve accessions by identifiers.')]
return steps_dict

def __checkpoint_verify_input(self,input,*args,**kwargs):
def __checkpoint_verify_input(self, input, *args,**kwargs):
if not isinstance(input, DockerBackboneMetabase):
tmp_docker = DockerIdentifierMedium(input,**kwargs)
tmp_docker = DockerIdentifierMedium(input, **kwargs)
else:
tmp_docker = input
if self.miner.verify_docker(tmp_docker):
return tmp_docker, args, kwargs
else:
raise ValueError('`docker` is not supported by current specification.')

def __checkpoint_identifier_to_accession(self,docker,*args,**kwargs):
def __checkpoint_identifier_to_accession(self, docker, *args, **kwargs):
    """Resolve the identifier `docker` into accessions via the miner.

    Only the first docker yielded by the miner is used; `args`/`kwargs`
    are passed through unchanged for the next pipeline step.
    """
    accessions = next(self.miner.yield_accession_by_identifier(docker, **kwargs))
    return accessions, args, kwargs


@property
def inlet(self):
""":class:`.DockerIdentifierMedium`"""
Expand Down
5 changes: 5 additions & 0 deletions pmaf/sequence/_multiple/_multiple.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,11 @@ def name(self):
"""Name of the instance."""
return self.__name

@name.setter
def name(self, value):
    """Set the name of the instance.

    No validation is performed; the value is stored as-is.
    """
    self.__name = value

@property
def is_alignment(self):
"""Is mutli-sequence is aligned or not."""
Expand Down
5 changes: 5 additions & 0 deletions pmaf/sequence/_sequence/_nucleotide.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,11 @@ def name(self) -> str:
"""Name of the sequence instance."""
return self.__name

@name.setter
def name(self, value):
    """Set the name of the sequence instance.

    No validation is performed; the value is stored as-is.
    """
    self.__name = value

@property
def is_buckled(self) -> bool:
"""Is sequence instance is buckled or not."""
Expand Down

0 comments on commit b3a2f04

Please sign in to comment.