Misc. API doc fixes #896

Merged (2 commits, May 3, 2022)
1 change: 1 addition & 0 deletions augur/align.py
@@ -266,6 +266,7 @@ def strip_non_reference(aln, reference, insertion_csv=None):
list
list of trimmed sequences, effectively a multiple alignment


Tests
-----
>>> [s.name for s in strip_non_reference(read_alignment("tests/data/align/test_aligned_sequences.fasta"), "with_gaps")]
36 changes: 18 additions & 18 deletions augur/clades.py
@@ -18,21 +18,21 @@ def read_in_clade_definitions(clade_file):
Inheritance is allowed, but needs to be acyclic. Alleles can be overwritten by inheriting clades.

Sites are 1 indexed in the file, and are converted to 0 indexed in the output

Empty lines are ignored, comments after # are ignored

Format
------
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 nuc 444296 A
Clade_3 S 1 P
\\# Clade_4 inherits from Clade_3
Clade_4 clade Clade_3
Clade_4 pks8 634 T
\\# Inherited allele can be overwritten
Clade_4 S 1 L
Format::

clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 nuc 444296 A
Clade_3 S 1 P
# Clade_4 inherits from Clade_3
Clade_4 clade Clade_3
Clade_4 pks8 634 T
# Inherited allele can be overwritten
Clade_4 S 1 L

Parameters
----------
@@ -74,22 +74,22 @@ def read_in_clade_definitions(clade_file):
# This way all clades can be reached by traversal
for clade in df.clade.unique():
G.add_edge(root, clade)

# Build inheritance graph
# For clades that inherit, disconnect from root
# Add edge from parent
for _, row in clade_inheritance_rows.iterrows():
G.remove_edge(root, row.clade)
G.add_edge(row.site, row.clade)

if not nx.is_directed_acyclic_graph(G):
raise ValueError(f"Clade definitions contain cycles {list(nx.simple_cycles(G))}")

# Traverse graph top down, so that children can inherit from parents and grandparents
# Topological sort ensures parents are visited before children
# islice is used to skip the root node (which has no parent)
for clade in islice(nx.topological_sort(G),1,None):
# Get name of parent clade
# Get name of parent clade
# G.predecessors(clade) returns iterator, thus next() necessary
# despite the fact that there should only be one parent
parent_clade = next(G.predecessors(clade))
@@ -99,7 +99,7 @@ def read_in_clade_definitions(clade_file):
for _, row in df[(df.clade == clade) & (df.gene != 'clade')].iterrows():
# Overwrite of parent alleles is possible and happens here
clades[clade][(row.gene, int(row.site)-1)] = row.alt

# Convert items from dict[str, dict[(str,int),str]] to dict[str, list[(str,int,str)]]
clades = {
clade: [
@@ -110,7 +110,7 @@ def read_in_clade_definitions(clade_file):
# If clause prevents the root (helper) from being emitted
if clade != root
}

return clades


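The inheritance handling documented above relies on a topological sort of the clade graph so parents are resolved before their children. A minimal, self-contained sketch of that idea follows; the clade names, alleles, and the `__root__` helper are illustrative only, and the real parsing and traversal live in `augur.clades.read_in_clade_definitions`:

```python
# Hedged sketch: resolve clade inheritance with a topological sort.
# Input structure and names are assumptions for illustration.
import networkx as nx

# clade -> (parent clade or None, {(gene, zero-indexed site): allele})
definitions = {
    "Clade_3": (None, {("nuc", 444295): "A", ("S", 0): "P"}),
    "Clade_4": ("Clade_3", {("pks8", 633): "T", ("S", 0): "L"}),  # overwrites S site 1
}

G = nx.DiGraph()
for clade, (parent, _) in definitions.items():
    # Clades without a parent hang off a synthetic root so every clade is reachable.
    G.add_edge(parent if parent else "__root__", clade)

if not nx.is_directed_acyclic_graph(G):
    raise ValueError(f"Clade definitions contain cycles {list(nx.simple_cycles(G))}")

resolved = {}
for clade in nx.topological_sort(G):
    if clade == "__root__":
        continue
    parent, alleles = definitions[clade]
    # Start from the parent's resolved alleles, then apply (and possibly
    # overwrite) this clade's own alleles.
    merged = dict(resolved.get(parent, {}))
    merged.update(alleles)
    resolved[clade] = merged

print(resolved["Clade_4"])  # inherits nuc allele from Clade_3, overwrites the S allele
```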
2 changes: 2 additions & 0 deletions augur/distance.py
@@ -177,6 +177,7 @@ def read_distance_map(map_file):
dict :
Python representation of the distance map JSON


>>> sorted(read_distance_map("tests/data/distance_map_weight_per_site.json").items())
[('default', 0), ('map', {'HA1': {144: 1}})]
>>> sorted(read_distance_map("tests/data/distance_map_weight_per_site_and_sequence.json").items())
@@ -236,6 +237,7 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map,
float :
distance between node sequences based on the given map


>>> node_a_sequences = {"gene": "ACTG"}
>>> node_b_sequences = {"gene": "ACGG"}
>>> distance_map = {"default": 0, "map": {}}
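The doctests above show the shape of a distance map (`default` weight plus a per-gene, per-site `map`) and the expected distances. A simplified sketch of how such a map could be applied to two aligned sequences; the function name and the omission of per-sequence weights are our simplifications, not the actual `get_distance_between_nodes` logic:

```python
# Hedged sketch: sum per-site weights at mismatched positions, falling back
# to the default weight for unlisted sites.
def simple_distance(node_a_sequences, node_b_sequences, distance_map):
    distance = 0.0
    for gene, seq_a in node_a_sequences.items():
        seq_b = node_b_sequences[gene]
        gene_map = distance_map.get("map", {}).get(gene, {})
        for site, (a, b) in enumerate(zip(seq_a, seq_b)):
            if a != b:
                distance += gene_map.get(site, distance_map.get("default", 0))
    return distance

node_a_sequences = {"gene": "ACTG"}
node_b_sequences = {"gene": "ACGG"}
print(simple_distance(node_a_sequences, node_b_sequences, {"default": 0, "map": {}}))  # 0.0
print(simple_distance(node_a_sequences, node_b_sequences, {"default": 1, "map": {}}))  # 1.0
```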
1 change: 1 addition & 0 deletions augur/export_v2.py
@@ -565,6 +565,7 @@ def set_data_provenance(data_json, config):
config : dict
config JSON with an expected ``data_provenance`` key


>>> config = {"data_provenance": [{"name": "GISAID"}, {"name": "INSDC"}]}
>>> data_json = {"meta": {}}
>>> set_data_provenance(data_json, config)
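For reference, the behaviour the new doctest exercises is simply copying a `data_provenance` block from the build config into the exported JSON's `meta` section. A minimal sketch of that behaviour (illustrative only; the real implementation is `augur.export_v2.set_data_provenance`):

```python
# Hedged sketch of the documented behaviour.
def set_data_provenance_sketch(data_json, config):
    if "data_provenance" in config:
        data_json["meta"]["data_provenance"] = config["data_provenance"]

config = {"data_provenance": [{"name": "GISAID"}, {"name": "INSDC"}]}
data_json = {"meta": {}}
set_data_provenance_sketch(data_json, config)
print(data_json["meta"]["data_provenance"])  # [{'name': 'GISAID'}, {'name': 'INSDC'}]
```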
16 changes: 16 additions & 0 deletions augur/filter.py
@@ -92,6 +92,7 @@ def filter_by_exclude_all(metadata):
set[str]:
Empty set of strains


>>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"])
>>> filter_by_exclude_all(metadata)
set()
@@ -114,6 +115,7 @@ def filter_by_exclude(metadata, exclude_file):
set[str]:
Strains that pass the filter


>>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"])
>>> with NamedTemporaryFile(delete=False) as exclude_file:
... characters_written = exclude_file.write(b'strain1')
@@ -143,6 +145,7 @@ def parse_filter_query(query):
str :
Value of column to query


>>> parse_filter_query("property=value")
('property', <built-in function eq>, 'value')
>>> parse_filter_query("property!=value")
@@ -177,6 +180,7 @@ def filter_by_exclude_where(metadata, exclude_where):
set[str]:
Strains that pass the filter


>>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"])
>>> filter_by_exclude_where(metadata, "region!=Europe")
{'strain2'}
@@ -228,6 +232,7 @@ def filter_by_query(metadata, query):
set[str]:
Strains that pass the filter


>>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"])
>>> filter_by_query(metadata, "region == 'Africa'")
{'strain1'}
@@ -256,6 +261,7 @@ def filter_by_ambiguous_date(metadata, date_column="date", ambiguity="any"):
set[str]:
Strains that pass the filter


>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-XX"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> filter_by_ambiguous_date(metadata)
{'strain2'}
@@ -298,6 +304,7 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None):
set[str]:
Strains that pass the filter


>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> filter_by_date(metadata, min_date=numeric_date("2020-01-02"))
{'strain2'}
@@ -352,6 +359,7 @@ def filter_by_sequence_index(metadata, sequence_index):
set[str]:
Strains that pass the filter


>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}]).set_index("strain")
>>> filter_by_sequence_index(metadata, sequence_index)
@@ -381,6 +389,7 @@ def filter_by_sequence_length(metadata, sequence_index, min_length=0):
set[str]:
Strains that pass the filter


>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
>>> filter_by_sequence_length(metadata, sequence_index, min_length=27000)
@@ -417,6 +426,7 @@ def filter_by_non_nucleotide(metadata, sequence_index):
set[str]:
Strains that pass the filter


>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "invalid_nucleotides": 0}, {"strain": "strain2", "invalid_nucleotides": 1}]).set_index("strain")
>>> filter_by_non_nucleotide(metadata, sequence_index)
@@ -447,6 +457,7 @@ def include(metadata, include_file):
set[str]:
Strains that pass the filter


>>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"])
>>> with NamedTemporaryFile(delete=False) as include_file:
... characters_written = include_file.write(b'strain1')
@@ -479,6 +490,7 @@ def include_by_include_where(metadata, include_where):
set[str]:
Strains that pass the filter


>>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"])
>>> include_by_include_where(metadata, "region!=Europe")
{'strain1'}
@@ -664,6 +676,7 @@ def filter_kwargs_to_str(kwargs):
str :
String representation of the kwargs for reporting.


>>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}, {"strain": "strain2", "ACGT": 26000}, {"strain": "strain3", "ACGT": 5000}]).set_index("strain")
>>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> filter_kwargs_to_str(exclude_by[0][1])
@@ -718,9 +731,11 @@ def apply_filters(metadata, exclude_by, include_by):
list[dict] :
Strains to force-include along with the function that filtered them and the arguments used to run the function.


For example, filter data by minimum date, but force the inclusion of strains
from Africa.


>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-10-02"}, {"region": "North America", "date": "2020-01-01"}], index=["strain1", "strain2", "strain3"])
>>> exclude_by = [(filter_by_date, {"min_date": numeric_date("2020-04-01")})]
>>> include_by = [(include_by_include_where, {"include_where": "region=Africa"})]
@@ -825,6 +840,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
list :
A list of dictionaries with strains that were skipped from grouping and the reason why (see also: `apply_filters` output).


>>> strains = ["strain1", "strain2"]
>>> metadata = pd.DataFrame([{"strain": "strain1", "date": "2020-01-01", "region": "Africa"}, {"strain": "strain2", "date": "2020-02-01", "region": "Europe"}]).set_index("strain")
>>> group_by = ["region"]
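The `apply_filters` doctest above composes exclusion and force-inclusion rules as lists of (function, kwargs) pairs. A minimal sketch of that composition pattern with stand-in filters (the filter functions below are hypothetical; the real ones are the `filter_by_*` and `include_by_*` functions in `augur.filter`):

```python
import pandas as pd

# Stand-in filters in the same shape as augur.filter's: each takes the metadata
# plus keyword arguments and returns the set of strains that pass.
def filter_by_min_date(metadata, min_date):
    # ISO date strings compare lexicographically, so this keeps strains sampled
    # on or after min_date.
    return set(metadata[metadata.date >= min_date].index)

def include_by_region(metadata, region):
    return set(metadata[metadata.region == region].index)

metadata = pd.DataFrame(
    [{"region": "Africa", "date": "2020-01-01"},
     {"region": "Europe", "date": "2020-10-02"},
     {"region": "North America", "date": "2020-01-01"}],
    index=["strain1", "strain2", "strain3"],
)

exclude_by = [(filter_by_min_date, {"min_date": "2020-04-01"})]
include_by = [(include_by_region, {"region": "Africa"})]

# Strains must pass every exclusion filter...
kept = set(metadata.index)
for func, kwargs in exclude_by:
    kept &= func(metadata, **kwargs)

# ...but force-included strains are added back regardless.
for func, kwargs in include_by:
    kept |= func(metadata, **kwargs)

print(sorted(kept))  # ['strain1', 'strain2'] under these toy rules
```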
1 change: 1 addition & 0 deletions augur/io.py
@@ -58,6 +58,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None)
KeyError :
When the metadata file does not have any valid index columns.


For standard use, request a metadata file and get a pandas DataFrame.

>>> read_metadata("tests/functional/filter/metadata.tsv").index.values[0]
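The docstring above documents that `read_metadata` indexes the frame by the first valid id column and raises `KeyError` when none is found. A minimal sketch of that index-selection behaviour (illustrative only, not `augur.io.read_metadata` itself):

```python
from io import StringIO
import pandas as pd

# Hedged sketch: pick the first candidate id column present in the file,
# or raise KeyError when none of them exist.
def read_metadata_sketch(path_or_buffer, id_columns=("strain", "name")):
    metadata = pd.read_csv(path_or_buffer, sep="\t")
    index_col = next((col for col in id_columns if col in metadata.columns), None)
    if index_col is None:
        raise KeyError(f"None of the possible id columns ({id_columns!r}) were found.")
    return metadata.set_index(index_col)

tsv = StringIO("strain\tregion\nstrain1\tAfrica\nstrain2\tEurope\n")
print(read_metadata_sketch(tsv).index.values[0])  # 'strain1'
```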
3 changes: 3 additions & 0 deletions augur/titer_model.py
@@ -35,6 +35,7 @@ def load_from_file(filenames, excluded_sources=None):
tuple (dict, list, list)
tuple of a dict of titer measurements, list of strains, list of sources


>>> measurements, strains, sources = TiterCollection.load_from_file("tests/data/titer_model/h3n2_titers_subset.tsv")
>>> type(measurements)
<class 'dict'>
@@ -107,6 +108,7 @@ def count_strains(titers):
dict
number of measurements per strain


>>> measurements, strains, sources = TiterCollection.load_from_file("tests/data/titer_model/h3n2_titers_subset.tsv")
>>> titer_counts = TiterCollection.count_strains(measurements)
>>> titer_counts["A/Acores/11/2013"]
@@ -142,6 +144,7 @@ def filter_strains(titers, strains):
reduced dictionary of titer measurements containing only those where
test and reference virus are part of the strain list


>>> measurements, strains, sources = TiterCollection.load_from_file("tests/data/titer_model/h3n2_titers_subset.tsv")
>>> len(measurements)
11
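As a rough illustration of `count_strains` (number of measurements per strain), here is a sketch that assumes, purely for the example, a titers dict keyed by (test strain, reference) with a list of measurements per key; the real structure is whatever `TiterCollection.load_from_file` returns:

```python
from collections import Counter

# Hedged sketch under an assumed titers structure; not TiterCollection.count_strains.
def count_strains_sketch(titers):
    counts = Counter()
    for (test_strain, _reference), values in titers.items():
        counts[test_strain] += len(values)
    return dict(counts)

titers = {
    ("A/Acores/11/2013", ("A/Texas/50/2012", "serum1")): [80.0, 160.0],
    ("A/Acores/11/2013", ("A/Victoria/361/2011", "serum2")): [40.0],
}
print(count_strains_sketch(titers)["A/Acores/11/2013"])  # 3
```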
1 change: 1 addition & 0 deletions augur/tree.py
@@ -59,6 +59,7 @@ def check_conflicting_args(tree_builder_args, defaults):
ConflictingArgumentsException
When any user-provided arguments match those in the defaults.


>>> defaults = ("-nt", "-m", "-s")
>>> check_conflicting_args("-czb -n 2", defaults)
>>> check_conflicting_args("-czb -nt 2", defaults)
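The doctest above shows the intended behaviour: user-provided tree-builder arguments that collide with augur's defaults should raise `ConflictingArgumentsException`. A minimal sketch of such a check (the exception name matches the documented one; the implementation details are ours, not `augur.tree`'s):

```python
# Hedged sketch of the conflict check exercised by the doctest.
class ConflictingArgumentsException(Exception):
    pass

def check_conflicting_args_sketch(tree_builder_args, defaults):
    provided = set(tree_builder_args.split())
    conflicts = provided & set(defaults)
    if conflicts:
        raise ConflictingArgumentsException(
            f"These arguments are set by augur and cannot be overridden: {sorted(conflicts)}"
        )

defaults = ("-nt", "-m", "-s")
check_conflicting_args_sketch("-czb -n 2", defaults)     # no conflicts, returns None
# check_conflicting_args_sketch("-czb -nt 2", defaults)  # would raise on "-nt"
```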
10 changes: 5 additions & 5 deletions augur/utils.py
@@ -646,8 +646,8 @@ def read_bed_file(bed_file):
bed_file : str
Path to the BED file

Returns:
--------
Returns
-------
list[int]:
Sorted list of unique zero-indexed sites
"""
@@ -677,8 +677,8 @@ def read_mask_file(mask_file):
mask_file : str
Path to the masking file

Returns:
--------
Returns
-------
list[int]:
Sorted list of unique zero-indexed sites
"""
@@ -726,7 +726,7 @@ def read_strains(*files, comment_char="#"):
set of distinct strains.

Strain names can be commented with full-line or inline comments. For
example, the following is a valid strain names file:
example, the following is a valid strain names file::

# this is a comment at the top of the file
strain1 # exclude strain1 because it isn't sequenced properly
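The `read_strains` docstring above describes strain name files with full-line and inline comments. A minimal sketch of that comment handling (the helper name is ours; the real function is `augur.utils.read_strains`):

```python
# Hedged sketch: collect distinct strain names across files, honouring
# full-line and inline comments introduced by comment_char.
def read_strains_sketch(*files, comment_char="#"):
    strains = set()
    for path in files:
        with open(path, encoding="utf-8") as handle:
            for line in handle:
                # Drop anything after the comment character, then surrounding whitespace.
                name = line.split(comment_char, 1)[0].strip()
                if name:
                    strains.add(name)
    return strains
```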
7 changes: 7 additions & 0 deletions docs/api/augur.io.rst
@@ -0,0 +1,7 @@
augur.io module
===============

.. automodule:: augur.io
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/api/augur.rst
@@ -32,6 +32,7 @@ Submodules
augur.import
augur.import_beast
augur.index
augur.io
augur.lbi
augur.mask
augur.parse