Merge pull request #997: filter: Split filter.py into smaller files

nextstrain · Jan 12, 2023 · d7d4380 · d7d4380
2 parents c8900c5 + c00db66
commit d7d4380
Show file tree

Hide file tree

Showing 9 changed files with 2,284 additions and 2,240 deletions.
diff --git a/augur/filter.py b/augur/filter.py
diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py
@@ -0,0 +1,91 @@
+"""
+Filter and subsample a sequence set.
+"""
+from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
+
+
+# Group-by column names listed in the --group-by help text; this is a set (unordered), so wrap it in sorted() wherever it is displayed for reproducible output.
+GROUP_BY_GENERATED_COLUMNS = {'year', 'month', 'week'}
+
+def register_arguments(parser):
+    """
+    Add arguments to parser.
+    Kept as a separate function than `register_parser` to continue to support
+    unit tests that use this function to create argparser.
+    """
+    input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered")
+    input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV")
+    input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
+    input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
+    input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.")
+    input_group.add_argument('--metadata-id-columns', default=["strain", "name"], nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'")
+
+    metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
+    metadata_filter_group.add_argument(
+        '--query',
+        help="""Filter samples by attribute.
+        Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
+        (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
+    )
+    metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
+    metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
+    metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
+                                help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
+    metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
+    metadata_filter_group.add_argument('--exclude-where', nargs='+',
+                                help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
+    metadata_filter_group.add_argument('--exclude-all', action="store_true", help="exclude all strains by default. Use this with the include arguments to select a specific subset of strains.")
+    metadata_filter_group.add_argument('--include', type=str, nargs="+", help="file(s) with list of strains to include regardless of priorities or subsampling")
+    metadata_filter_group.add_argument('--include-where', nargs='+',
+                                help="Include samples with these values. ex: host=rat. Multiple values are processed as OR (having any of those specified will be included), not AND. This rule is applied last and ensures any sequences matching these rules will be included.")
+
+    sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
+    sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences")
+    sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
+
+    subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
+    subsample_group.add_argument('--group-by', nargs='+', help=f"""
+        categories with respect to subsample.
+        Notes:
+        (1) Grouping by {sorted(GROUP_BY_GENERATED_COLUMNS)} is only supported when there is a 'date' column in the metadata.
+        (2) 'week' uses the ISO week numbering system, where a week starts on a Monday and ends on a Sunday.
+        (3) 'month' and 'week' grouping cannot be used together.
+        (4) Custom columns {sorted(GROUP_BY_GENERATED_COLUMNS)} in the metadata are ignored for grouping. Please rename them if you want to use their values for grouping.""")
+    subsample_limits_group = subsample_group.add_mutually_exclusive_group()
+    subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
+    subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")
+    probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
+    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Allow probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
+    probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
+    subsample_group.add_argument('--priority', type=str, help="""tab-delimited file with list of priority scores for strains (e.g., "<strain>\\t<priority>") and no header.
+    When scores are provided, Augur converts scores to floating point values, sorts strains within each subsampling group from highest to lowest priority, and selects the top N strains per group where N is the calculated or requested number of strains per group.
+    Higher numbers indicate higher priority.
+    Since priorities represent relative values between strains, these values can be arbitrary.""")
+    subsample_group.add_argument('--subsample-seed', type=int, help="random number generator seed to allow reproducible subsampling (with same input data).")
+
+    output_group = parser.add_argument_group("outputs", "possible representations of filtered data (at least one required)")
+    output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
+    output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
+    output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
+    output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
+
+    parser.set_defaults(probabilistic_sampling=True)
+
+
+def register_parser(parent_subparsers):
+    parser = parent_subparsers.add_parser("filter", help=__doc__)
+    register_arguments(parser)
+    return parser
+
+
+def run(args):
+    '''
+    filter and subsample a set of sequences into an analysis set
+    '''
+    from .validate_arguments import validate_arguments
+    # Validate arguments before attempting any I/O.
+    if not validate_arguments(args):
+        return 1
+
+    from ._run import run as _run
+    return _run(args)