Skip to content

Commit

Permalink
Merge pull request #55 from robertopreste/enh/save-output
Browse files Browse the repository at this point in the history
ENH: Add option to save output
  • Loading branch information
robertopreste committed Mar 22, 2020
2 parents 4d9ec60 + 1c92cea commit 54c0544
Show file tree
Hide file tree
Showing 35 changed files with 279 additions and 38 deletions.
2 changes: 1 addition & 1 deletion .pyup.yml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
branch: dev
branch: master
update: insecure
40 changes: 27 additions & 13 deletions apybiomart/apybiomart.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,65 +7,79 @@
FiltersServer, Query


def find_marts() -> pd.DataFrame:
"""Retrieve and list available marts."""
server = MartServer()
def find_marts(save: bool = False) -> pd.DataFrame:
"""Retrieve and list available marts.
Args:
save: save results to a CSV file [default: False]
"""
server = MartServer(save=save)
return server.find_marts()


def find_datasets(mart: str = "ENSEMBL_MART_ENSEMBL") -> pd.DataFrame:
def find_datasets(mart: str = "ENSEMBL_MART_ENSEMBL",
save: bool = False) -> pd.DataFrame:
"""Retrieve and list available datasets for a given mart.
Args:
mart: BioMart mart name (default: "ENSEMBL_MART_ENSEMBL")
save: save results to a CSV file [default: False]
"""
server = DatasetServer(mart)
server = DatasetServer(mart, save=save)
return server.find_datasets()


def find_attributes(dataset: str = "hsapiens_gene_ensembl") -> pd.DataFrame:
def find_attributes(dataset: str = "hsapiens_gene_ensembl",
save: bool = False) -> pd.DataFrame:
"""Retrieve and list available attributes for a given dataset.
Args:
dataset: BioMart dataset name (default: "hsapiens_gene_ensembl")
save: save results to a CSV file [default: False]
"""
server = AttributesServer(dataset)
server = AttributesServer(dataset, save=save)
return server.find_attributes()


def find_filters(dataset: str = "hsapiens_gene_ensembl") -> pd.DataFrame:
def find_filters(dataset: str = "hsapiens_gene_ensembl",
save: bool = False) -> pd.DataFrame:
"""Retrieve and list available filters for a given dataset.
Args:
dataset: BioMart dataset name (default: "hsapiens_gene_ensembl")
save: save results to a CSV file [default: False]
"""
server = FiltersServer(dataset)
server = FiltersServer(dataset, save=save)
return server.find_filters()


def query(attributes: List[str],
filters: Dict[str, Union[str, int, list, tuple, bool]],
dataset: str = "hsapiens_gene_ensembl") -> pd.DataFrame:
dataset: str = "hsapiens_gene_ensembl",
save: bool = False) -> pd.DataFrame:
"""Launch synchronous query using the given attributes, filters and dataset.
Args:
attributes: list of attributes to include
filters: dict of filter name : value to filter results
dataset: BioMart dataset name (default: "hsapiens_gene_ensembl")
save: save results to a CSV file [default: False]
"""
server = Query(attributes, filters, dataset)
server = Query(attributes, filters, dataset, save=save)
return server.query()


async def aquery(attributes: List[str],
filters: Dict[str, Union[str, int, list, tuple, bool]],
dataset: str = "hsapiens_gene_ensembl") -> pd.DataFrame:
dataset: str = "hsapiens_gene_ensembl",
save: bool = False) -> pd.DataFrame:
"""Launch asynchronous query using the given attributes, filters and dataset.
Args:
attributes: list of attributes to include
filters: dict of filter name : value to filter results
dataset: BioMart dataset name (default: "hsapiens_gene_ensembl")
save: save results to a CSV file [default: False]
"""
server = Query(attributes, filters, dataset)
server = Query(attributes, filters, dataset, save=save)
return await server.aquery()
53 changes: 39 additions & 14 deletions apybiomart/classes.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Created by Roberto Preste
import io
from typing import Optional, Dict, Any, Tuple, Generator, List, Union
from xml.etree import ElementTree as ET

import asyncio
import aiohttp
import io
import requests
import pandas as pd
from xml.etree import ElementTree as ET
from typing import Optional, Dict, Any, Tuple, Generator, List, Union


class _BiomartException(Exception):
Expand All @@ -20,11 +21,14 @@ class _Server:
Attributes:
host: URL to connect to
save: save results to a CSV file [default: False]
"""

def __init__(self,
host: str = "http://www.ensembl.org/biomart/martservice"):
host: str = "http://www.ensembl.org/biomart/martservice",
save: bool = False):
self.host = host
self.save = save
if not self._check_connection():
raise _BiomartException("No internet connection available!")

Expand Down Expand Up @@ -76,8 +80,8 @@ async def get_async(self,
class MartServer(_Server):
"""Class used to retrieve and list available marts."""

def __init__(self):
super().__init__()
def __init__(self, save: bool = False):
super().__init__(save=save)

def find_marts(self) -> pd.DataFrame:
"""Return the list of available marts as a dataframe.
Expand All @@ -88,6 +92,9 @@ def find_marts(self) -> pd.DataFrame:
df = pd.DataFrame.from_records(self._fetch_marts(),
columns=["name", "display_name"])
df.columns = ["Mart_ID", "Mart_name"]
df.replace(pd.np.nan, "", inplace=True)
if self.save:
df.to_csv("apybiomart_marts.csv", index=False)

return df

Expand Down Expand Up @@ -131,8 +138,8 @@ class DatasetServer(_Server):
mart: BioMart mart name
"""

def __init__(self, mart: str):
super().__init__()
def __init__(self, mart: str, save: bool = False):
super().__init__(save=save)
self.mart = mart

def find_datasets(self) -> pd.DataFrame:
Expand All @@ -147,6 +154,9 @@ def find_datasets(self) -> pd.DataFrame:
usecols=["name", "display_name"])
df["mart"] = self.mart
df.columns = ["Dataset_ID", "Dataset_name", "Mart_ID"]
df.replace(pd.np.nan, "", inplace=True)
if self.save:
df.to_csv("apybiomart_datasets.csv", index=False)

return df

Expand All @@ -172,8 +182,8 @@ class AttributesServer(_Server):
dataset: BioMart dataset name
"""

def __init__(self, dataset: str):
super().__init__()
def __init__(self, dataset: str, save: bool = False):
super().__init__(save=save)
self.dataset = dataset

def find_attributes(self) -> pd.DataFrame:
Expand All @@ -186,6 +196,9 @@ def find_attributes(self) -> pd.DataFrame:
df["dataset"] = self.dataset
df.columns = ["Attribute_ID", "Attribute_name",
"Attribute_description", "Dataset_ID"]
df.replace(pd.np.nan, "", inplace=True)
if self.save:
df.to_csv("apybiomart_attributes.csv", index=False)

return df

Expand Down Expand Up @@ -236,8 +249,8 @@ class FiltersServer(_Server):
dataset: BioMart dataset name
"""

def __init__(self, dataset: str):
super().__init__()
def __init__(self, dataset: str, save: bool = False):
super().__init__(save=save)
self.dataset = dataset

def find_filters(self) -> pd.DataFrame:
Expand All @@ -250,6 +263,9 @@ def find_filters(self) -> pd.DataFrame:
df["dataset"] = self.dataset
df.columns = ["Filter_ID", "Filter_type",
"Filter_description", "Dataset_ID"]
df.replace(pd.np.nan, "", inplace=True)
if self.save:
df.to_csv("apybiomart_filters.csv", index=False)

return df

Expand Down Expand Up @@ -304,8 +320,9 @@ class Query(_Server):
def __init__(self,
attributes: List[str],
filters: Dict[str, Union[str, int, list, tuple, bool]],
dataset: str):
super().__init__()
dataset: str,
save: bool = False):
super().__init__(save=save)
self.attributes = attributes
self.filters = filters
self.dataset = dataset
Expand Down Expand Up @@ -357,6 +374,10 @@ def query(self) -> pd.DataFrame:
# TypeError is raised if a data type is not understood by pandas
except TypeError as err:
raise ValueError("Non valid data type is used in dtypes")
result.replace(pd.np.nan, "", inplace=True)

if self.save:
result.to_csv("apybiomart_query.csv", index=False)

return result

Expand Down Expand Up @@ -407,6 +428,10 @@ async def aquery(self) -> pd.DataFrame:
# TypeError is raised if a data type is not understood by pandas
except TypeError as err:
raise ValueError("Non valid data type is used in dtypes")
result.replace(pd.np.nan, "", inplace=True)

if self.save:
result.to_csv("apybiomart_aquery.csv", index=False)

return result

Expand Down
6 changes: 4 additions & 2 deletions apybiomart/commands/attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
@click.command("attributes")
@click.option("--dataset", default="hsapiens_gene_ensembl", type=str,
help="BioMart dataset name (default: 'hsapiens_gene_ensembl')")
def cli_attributes(dataset):
@click.option("--save", "-s", default=False, is_flag=True,
help="Save results to a CSV file [default: False]")
def cli_attributes(dataset, save):
"""Retrieve and list available attributes for a given mart."""
pd.set_option("max_rows", 999)
attributes = find_attributes(dataset)
attributes = find_attributes(dataset, save=save)
attributes.columns = [col.replace("_", " ") for col in attributes.columns]
click.echo(attributes)
return 0
6 changes: 4 additions & 2 deletions apybiomart/commands/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
@click.command("datasets")
@click.option("--mart", default="ENSEMBL_MART_ENSEMBL", type=str,
help="BioMart mart name (default: 'ENSEMBL_MART_ENSEMBL')")
def cli_datasets(mart):
@click.option("--save", "-s", default=False, is_flag=True,
help="Save results to a CSV file [default: False]")
def cli_datasets(mart, save):
"""Retrieve and list available datasets for a given mart."""
pd.set_option("max_rows", 999)
datasets = find_datasets(mart)
datasets = find_datasets(mart, save=save)
datasets.columns = [col.replace("_", " ") for col in datasets.columns]
click.echo(datasets)
return 0
6 changes: 4 additions & 2 deletions apybiomart/commands/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
@click.command("filters")
@click.option("--dataset", default="hsapiens_gene_ensembl", type=str,
help="BioMart dataset name (default: 'hsapiens_gene_ensembl')")
def cli_filters(dataset):
@click.option("--save", "-s", default=False, is_flag=True,
help="Save results to a CSV file [default: False]")
def cli_filters(dataset, save):
"""Retrieve and list available filters for a given mart."""
pd.set_option("max_rows", 999)
filters = find_filters(dataset)
filters = find_filters(dataset, save=save)
filters.columns = [col.replace("_", " ") for col in filters.columns]
click.echo(filters)
return 0
6 changes: 4 additions & 2 deletions apybiomart/commands/marts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@


@click.command("marts")
def cli_marts():
@click.option("--save", "-s", default=False, is_flag=True,
help="Save results to a CSV file [default: False]")
def cli_marts(save):
"""Retrieve and list available marts."""
pd.set_option("max_rows", 999)
marts = find_marts()
marts = find_marts(save=save)
marts.columns = [col.replace("_", " ") for col in marts.columns]
click.echo(marts)
return 0
4 changes: 2 additions & 2 deletions apybiomart/tests/create_suite.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Created by Roberto Preste
import asyncio
import os
import pandas as pd

import asyncio
import apybiomart as apy

DATADIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
Expand Down
Binary file modified apybiomart/tests/data/attributes_chircus_snp.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/attributes_closure_ECO.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/attributes_hsapiens_encode.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/attributes_hsapiens_gene_ensembl.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/attributes_hsapiens_peak.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/datasets_ensembl.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/datasets_funcgen.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/datasets_genomic.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/datasets_mouse.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/datasets_ontology.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/datasets_sequence.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/datasets_snp.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/filters_chircus_snp.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/filters_closure_ECO.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/filters_hsapiens_encode.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/filters_hsapiens_gene_ensembl.pkl
Binary file not shown.
Binary file modified apybiomart/tests/data/filters_hsapiens_peak.pkl
Binary file not shown.
26 changes: 26 additions & 0 deletions apybiomart/tests/test_aquery.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Created by Roberto Preste
import os
import pytest

import asyncio
import pandas as pd
from pandas.testing import assert_frame_equal

from apybiomart import aquery


Expand All @@ -22,6 +26,28 @@ def test_aquery_default(df_query_ensembl_hsapiens_gene_chrom_2):
assert_frame_equal(result, expect)


def test_aquery_save(df_query_ensembl_hsapiens_gene_chrom_2):
    """Test the CSV file written by an async query on the default dataset
    (hsapiens_gene_ensembl).

    Runs ``aquery`` with ``save=True`` and checks that the content of the
    resulting "apybiomart_aquery.csv" file matches the expected dataframe.
    The file is removed afterwards, whether or not the comparison succeeds.

    Args:
        df_query_ensembl_hsapiens_gene_chrom_2: fixture providing the
            expected query results
    """
    expect = (df_query_ensembl_hsapiens_gene_chrom_2
              .reset_index(drop=True))

    # Only the CSV side effect is under test here, so the dataframe returned
    # by the query is not kept.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        aquery(attributes=["ensembl_gene_id", "external_gene_name"],
               filters={"chromosome_name": "2"},
               save=True)
    )

    try:
        # Empty cells are read back from the CSV as NaN; turn them into
        # empty strings before comparing. fillna() is used instead of the
        # pandas.np alias, which is deprecated and absent from recent pandas.
        saved = (pd.read_csv("apybiomart_aquery.csv")
                 .fillna("")
                 .reset_index(drop=True))
        assert_frame_equal(saved, expect)
    finally:
        # Clean up even when reading or comparing fails; guard the removal
        # so a missing file does not mask the original error.
        if os.path.exists("apybiomart_aquery.csv"):
            os.remove("apybiomart_aquery.csv")


def test_aquery_default_int(df_query_ensembl_hsapiens_gene_chrom_2):
"""Test the async query results for the default dataset
(hsapiens_gene_ensembl) with int filters parameter."""
Expand Down
Loading

0 comments on commit 54c0544

Please sign in to comment.