From fb46a0bdc578507ebcffa2668bef59739d5553bc Mon Sep 17 00:00:00 2001 From: Jover Date: Thu, 7 Apr 2022 14:05:46 -0700 Subject: [PATCH 1/2] Add autogenerated API documentation for augur.io Resolves #784. --- docs/api/augur.io.rst | 7 +++++++ docs/api/augur.rst | 1 + 2 files changed, 8 insertions(+) create mode 100644 docs/api/augur.io.rst diff --git a/docs/api/augur.io.rst b/docs/api/augur.io.rst new file mode 100644 index 000000000..84446ef6a --- /dev/null +++ b/docs/api/augur.io.rst @@ -0,0 +1,7 @@ +augur.io module +=============== + +.. automodule:: augur.io + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/augur.rst b/docs/api/augur.rst index fc6805476..6b34aae8e 100644 --- a/docs/api/augur.rst +++ b/docs/api/augur.rst @@ -32,6 +32,7 @@ Submodules augur.import augur.import_beast augur.index + augur.io augur.lbi augur.mask augur.parse From a20a6a2a5b9d7a1111b9cca81ed3b00ea363752a Mon Sep 17 00:00:00 2001 From: Jover Date: Thu, 7 Apr 2022 15:17:44 -0700 Subject: [PATCH 2/2] API docs: fix various formatting issues 1. Add an extra line between sections and doctests to correctly render the doctest blocks 2. Add literal blocks using `::` to preserve formatting of example files 3. Correct `Returns` subtitles --- augur/align.py | 1 + augur/clades.py | 36 ++++++++++++++++++------------------ augur/distance.py | 2 ++ augur/export_v2.py | 1 + augur/filter.py | 16 ++++++++++++++++ augur/io.py | 1 + augur/titer_model.py | 3 +++ augur/tree.py | 1 + augur/utils.py | 10 +++++----- 9 files changed, 48 insertions(+), 23 deletions(-) diff --git a/augur/align.py b/augur/align.py index 30cd8ea2f..d8d24cb3a 100644 --- a/augur/align.py +++ b/augur/align.py @@ -266,6 +266,7 @@ def strip_non_reference(aln, reference, insertion_csv=None): list list of trimmed sequences, effectively a multiple alignment + Tests ----- >>> [s.name for s in strip_non_reference(read_alignment("tests/data/align/test_aligned_sequences.fasta"), "with_gaps")] diff --git a/augur/clades.py b/augur/clades.py index 677911ef2..c040ab18e 100644 --- a/augur/clades.py +++ b/augur/clades.py @@ -18,21 +18,21 @@ def read_in_clade_definitions(clade_file): Inheritance is allowed, but needs to be acyclic. Alleles can be overwritten by inheriting clades. Sites are 1 indexed in the file, and are converted to 0 indexed in the output - + Empty lines are ignored, comments after # are ignored - Format - ------ - clade gene site alt - Clade_1 ctpE 81 D - Clade_2 nuc 30642 T - Clade_3 nuc 444296 A - Clade_3 S 1 P - \\# Clade_4 inherits from Clade_3 - Clade_4 clade Clade_3 - Clade_4 pks8 634 T - \\# Inherited allele can be overwritten - Clade_4 S 1 L + Format:: + + clade gene site alt + Clade_1 ctpE 81 D + Clade_2 nuc 30642 T + Clade_3 nuc 444296 A + Clade_3 S 1 P + # Clade_4 inherits from Clade_3 + Clade_4 clade Clade_3 + Clade_4 pks8 634 T + # Inherited allele can be overwritten + Clade_4 S 1 L Parameters ---------- @@ -74,14 +74,14 @@ def read_in_clade_definitions(clade_file): # This way all clades can be reached by traversal for clade in df.clade.unique(): G.add_edge(root, clade) - + # Build inheritance graph # For clades that inherit, disconnect from root # Add edge from parent for _, row in clade_inheritance_rows.iterrows(): G.remove_edge(root, row.clade) G.add_edge(row.site, row.clade) - + if not nx.is_directed_acyclic_graph(G): raise ValueError(f"Clade definitions contain cycles {list(nx.simple_cycles(G))}") @@ -89,7 +89,7 @@ def read_in_clade_definitions(clade_file): # Topological sort ensures parents are visited before children # islice is used to skip the root node (which has no parent) for clade in islice(nx.topological_sort(G),1,None): - # Get name of parent clade + # Get name of parent clade # G.predecessors(clade) returns iterator, thus next() necessary # despite the fact that there should only be one parent parent_clade = next(G.predecessors(clade)) @@ -99,7 +99,7 @@ def read_in_clade_definitions(clade_file): for _, row in df[(df.clade == clade) & (df.gene != 'clade')].iterrows(): # Overwrite of parent alleles is possible and happens here clades[clade][(row.gene, int(row.site)-1)] = row.alt - + # Convert items from dict[str, dict[(str,int),str]] to dict[str, list[(str,int,str)]] clades = { clade: [ @@ -110,7 +110,7 @@ def read_in_clade_definitions(clade_file): # If clause avoids root (helper) from being emmitted if clade != root } - + return clades diff --git a/augur/distance.py b/augur/distance.py index 19dd2a11e..e7ab043c0 100644 --- a/augur/distance.py +++ b/augur/distance.py @@ -177,6 +177,7 @@ def read_distance_map(map_file): dict : Python representation of the distance map JSON + >>> sorted(read_distance_map("tests/data/distance_map_weight_per_site.json").items()) [('default', 0), ('map', {'HA1': {144: 1}})] >>> sorted(read_distance_map("tests/data/distance_map_weight_per_site_and_sequence.json").items()) @@ -236,6 +237,7 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map, float : distance between node sequences based on the given map + >>> node_a_sequences = {"gene": "ACTG"} >>> node_b_sequences = {"gene": "ACGG"} >>> distance_map = {"default": 0, "map": {}} diff --git a/augur/export_v2.py b/augur/export_v2.py index cca9d3581..cb5da134e 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -565,6 +565,7 @@ def set_data_provenance(data_json, config): config : dict config JSON with an expected ``data_provenance`` key + >>> config = {"data_provenance": [{"name": "GISAID"}, {"name": "INSDC"}]} >>> data_json = {"meta": {}} >>> set_data_provenance(data_json, config) diff --git a/augur/filter.py b/augur/filter.py index 772b50ae4..3b067d197 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -92,6 +92,7 @@ def filter_by_exclude_all(metadata): set[str]: Empty set of strains + >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> filter_by_exclude_all(metadata) set() @@ -114,6 +115,7 @@ def filter_by_exclude(metadata, exclude_file): set[str]: Strains that pass the filter + >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> with NamedTemporaryFile(delete=False) as exclude_file: ... characters_written = exclude_file.write(b'strain1') @@ -143,6 +145,7 @@ def parse_filter_query(query): str : Value of column to query + >>> parse_filter_query("property=value") ('property', , 'value') >>> parse_filter_query("property!=value") @@ -177,6 +180,7 @@ def filter_by_exclude_where(metadata, exclude_where): set[str]: Strains that pass the filter + >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> filter_by_exclude_where(metadata, "region!=Europe") {'strain2'} @@ -228,6 +232,7 @@ def filter_by_query(metadata, query): set[str]: Strains that pass the filter + >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> filter_by_query(metadata, "region == 'Africa'") {'strain1'} @@ -256,6 +261,7 @@ def filter_by_ambiguous_date(metadata, date_column="date", ambiguity="any"): set[str]: Strains that pass the filter + >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-XX"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> filter_by_ambiguous_date(metadata) {'strain2'} @@ -298,6 +304,7 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None): set[str]: Strains that pass the filter + >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> filter_by_date(metadata, min_date=numeric_date("2020-01-02")) {'strain2'} @@ -352,6 +359,7 @@ def filter_by_sequence_index(metadata, sequence_index): set[str]: Strains that pass the filter + >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}]).set_index("strain") >>> filter_by_sequence_index(metadata, sequence_index) @@ -381,6 +389,7 @@ def filter_by_sequence_length(metadata, sequence_index, min_length=0): set[str]: Strains that pass the filter + >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain") >>> filter_by_sequence_length(metadata, sequence_index, min_length=27000) @@ -417,6 +426,7 @@ def filter_by_non_nucleotide(metadata, sequence_index): set[str]: Strains that pass the filter + >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> sequence_index = pd.DataFrame([{"strain": "strain1", "invalid_nucleotides": 0}, {"strain": "strain2", "invalid_nucleotides": 1}]).set_index("strain") >>> filter_by_non_nucleotide(metadata, sequence_index) @@ -447,6 +457,7 @@ def include(metadata, include_file): set[str]: Strains that pass the filter + >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> with NamedTemporaryFile(delete=False) as include_file: ... characters_written = include_file.write(b'strain1') @@ -479,6 +490,7 @@ def include_by_include_where(metadata, include_where): set[str]: Strains that pass the filter + >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> include_by_include_where(metadata, "region!=Europe") {'strain1'} @@ -664,6 +676,7 @@ def filter_kwargs_to_str(kwargs): str : String representation of the kwargs for reporting. + >>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}, {"strain": "strain2", "ACGT": 26000}, {"strain": "strain3", "ACGT": 5000}]).set_index("strain") >>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})] >>> filter_kwargs_to_str(exclude_by[0][1]) @@ -718,9 +731,11 @@ def apply_filters(metadata, exclude_by, include_by): list[dict] : Strains to force-include along with the function that filtered them and the arguments used to run the function. + For example, filter data by minimum date, but force the include of strains from Africa. + >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-10-02"}, {"region": "North America", "date": "2020-01-01"}], index=["strain1", "strain2", "strain3"]) >>> exclude_by = [(filter_by_date, {"min_date": numeric_date("2020-04-01")})] >>> include_by = [(include_by_include_where, {"include_where": "region=Africa"})] @@ -825,6 +840,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): list : A list of dictionaries with strains that were skipped from grouping and the reason why (see also: `apply_filters` output). + >>> strains = ["strain1", "strain2"] >>> metadata = pd.DataFrame([{"strain": "strain1", "date": "2020-01-01", "region": "Africa"}, {"strain": "strain2", "date": "2020-02-01", "region": "Europe"}]).set_index("strain") >>> group_by = ["region"] diff --git a/augur/io.py b/augur/io.py index 92514109d..0103c7664 100644 --- a/augur/io.py +++ b/augur/io.py @@ -58,6 +58,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None) KeyError : When the metadata file does not have any valid index columns. + For standard use, request a metadata file and get a pandas DataFrame. >>> read_metadata("tests/functional/filter/metadata.tsv").index.values[0] diff --git a/augur/titer_model.py b/augur/titer_model.py index 5552aeaf4..6dc5b45e1 100644 --- a/augur/titer_model.py +++ b/augur/titer_model.py @@ -35,6 +35,7 @@ def load_from_file(filenames, excluded_sources=None): tuple (dict, list, list) tuple of a dict of titer measurements, list of strains, list of sources + >>> measurements, strains, sources = TiterCollection.load_from_file("tests/data/titer_model/h3n2_titers_subset.tsv") >>> type(measurements) @@ -107,6 +108,7 @@ def count_strains(titers): dict number of measurements per strain + >>> measurements, strains, sources = TiterCollection.load_from_file("tests/data/titer_model/h3n2_titers_subset.tsv") >>> titer_counts = TiterCollection.count_strains(measurements) >>> titer_counts["A/Acores/11/2013"] @@ -142,6 +144,7 @@ def filter_strains(titers, strains): reduced dictionary of titer measurements containing only those were test and reference virus are part of the strain list + >>> measurements, strains, sources = TiterCollection.load_from_file("tests/data/titer_model/h3n2_titers_subset.tsv") >>> len(measurements) 11 diff --git a/augur/tree.py b/augur/tree.py index 8ebda6844..8b66f0eee 100644 --- a/augur/tree.py +++ b/augur/tree.py @@ -59,6 +59,7 @@ def check_conflicting_args(tree_builder_args, defaults): ConflictingArgumentsException When any user-provided arguments match those in the defaults. + >>> defaults = ("-nt", "-m", "-s") >>> check_conflicting_args("-czb -n 2", defaults) >>> check_conflicting_args("-czb -nt 2", defaults) diff --git a/augur/utils.py b/augur/utils.py index b2207befe..e5d32a001 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -646,8 +646,8 @@ def read_bed_file(bed_file): bed_file : str Path to the BED file - Returns: - -------- + Returns + ------- list[int]: Sorted list of unique zero-indexed sites """ @@ -677,8 +677,8 @@ def read_mask_file(mask_file): mask_file : str Path to the masking file - Returns: - -------- + Returns + ------- list[int]: Sorted list of unique zero-indexed sites """ @@ -726,7 +726,7 @@ def read_strains(*files, comment_char="#"): set of distinct strains. Strain names can be commented with full-line or inline comments. For - example, the following is a valid strain names file: + example, the following is a valid strain names file:: # this is a comment at the top of the file strain1 # exclude strain1 because it isn't sequenced properly