From c357955db727f03c51396747d8d01c885c111325 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 8 Mar 2024 14:28:59 -0800 Subject: [PATCH 1/2] wip: add 'gene' feature capture when reading genbank files --- augur/utils.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/augur/utils.py b/augur/utils.py index bd1a9ccd7..d6f3ab466 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -385,17 +385,18 @@ def _read_nuc_annotation_from_genbank(record, reference): def _read_genbank(reference, feature_names): """ - Read a GenBank file. We only read GenBank feature keys 'CDS' or 'source'. + Read a GenBank file. We only read GenBank feature keys 'source', 'CDS', or 'gene'. We create a "feature name" via: - for 'source' features use 'nuc' - for 'CDS' features use the locus_tag or the gene. If neither, then silently ignore. + - for 'gene' feature use the locus_tag or the gene. If neither, then silently ignore. Parameters ---------- reference : string File path to GenBank reference feature_names : None or set or list - Restrict the CDSs we read to those in the set/list + Restrict the CDSs or genes we read to those in the set/list Returns ------- @@ -408,6 +409,7 @@ def _read_genbank(reference, feature_names): AugurError If 'nuc' annotation not parsed If a CDS feature is given the name 'nuc' + If a 'gene' feature is given the name 'nuc' """ from Bio import SeqIO gb = SeqIO.read(reference, 'genbank') @@ -432,6 +434,21 @@ def _read_genbank(reference, feature_names): if fname and (feature_names is None or fname in feature_names): features[fname] = feat + if feat.type=='gene': + fname = None + if "locus_tag" in feat.qualifiers: + fname = feat.qualifiers["locus_tag"][0] + elif "gene" in feat.qualifiers: + fname = feat.qualifiers["gene"][0] + else: + features_skipped+=1 + + if fname == 'nuc': + raise AugurError(f"Reference {reference!r} contains a 'gene' feature with the name 'nuc'. This is not allowed.") + + if fname and (feature_names is None or fname in feature_names): + features[fname] = feat + if features_skipped: print(f"WARNING: {features_skipped} CDS features skipped as they didn't have a locus_tag or gene qualifier.") From 121e304bb781965ddb72f88e670813871d263b94 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 22 Mar 2024 15:30:12 -0700 Subject: [PATCH 2/2] wip: prefer CDS over gene --- augur/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/augur/utils.py b/augur/utils.py index d6f3ab466..bcf307766 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -447,7 +447,10 @@ def _read_genbank(reference, feature_names): raise AugurError(f"Reference {reference!r} contains a 'gene' feature with the name 'nuc'. This is not allowed.") if fname and (feature_names is None or fname in feature_names): - features[fname] = feat + if fname not in features: + features[fname] = feat + else: + print(f"WARNING: gene: '{fname}' already exists as a CDS: '{fname}'. Skipping this feature.") if features_skipped: print(f"WARNING: {features_skipped} CDS features skipped as they didn't have a locus_tag or gene qualifier.")