Refine and document regex for amplicon grouping (galaxyproject#3891)

The ivar trim and ivar remove_reads wrappers can infer amplicons from primer names and this change makes the inference more robust and documents the regex used in the tool wrapper help sections.
davebx · Aug 20, 2021 · f09d0be · f09d0be
1 parent 00ddca1
commit f09d0be
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 32 deletions.
diff --git a/tools/ivar/ivar_removereads.xml b/tools/ivar/ivar_removereads.xml
@@ -1,4 +1,4 @@
-<tool id="ivar_removereads" name="ivar removereads" version="@VERSION@+galaxy1">
+<tool id="ivar_removereads" name="ivar removereads" version="@VERSION@+galaxy2">
     <description>Remove reads from trimmed BAM file</description>
     <macros>
         <import>macros.xml</import>
@@ -38,7 +38,7 @@
         <param name="input_bed" argument="-b" type="data" format="bed" label="Primer binding sites information"
         help="The same six-column BED dataset that served as input to ivar trim"/>
         <conditional name="amplicons">
-            <param name="computed" type="select" label="Compute amplicon info from BED file" help="Compute the amplicon info file from the primer BED file">
+            <param name="computed" type="select" label="Compute amplicon info from BED file" help="For suitable primer binding site datasets amplicon info can be computed directly (see tool help below). For others you will need to provide an extra amplicon info dataset.">
                 <option value="yes" selected="true">Yes</option>
                 <option value="no">No</option>
             </param>
@@ -86,6 +86,23 @@ trimmed with ``ivar trim``.
 From this input it will remove reads that come from amplicons that have been
 generated with one or more primers that may have been affected in their binding
 by variants listed in the variants input file.
+To do its job, the needs to know which primers work together to form an
+amplicon. The tool can try to deduce this info from the names of the primers
+found in the primer info dataset. This will require a primer naming scheme
+following the regex pattern::
+
+  .*_(?P<amplicon_number>\d+).*_(?P<primer_orientation>L(?:EFT)?|R(?:IGHT)?)
+
+*i.e.*, the following schemes will work (and get parsed as):
+
+- ``nCoV-2019_1_LEFT`` (forward primer of amplicon 1)
+- ``400_2_out_R`` (reverse primer of amplicon 2)
+- ``QIAseq_163-2_LEFT`` (forward primer of amplicon 163)
+
+Alternatively, you can specify the amplicon information explicitly through a
+dataset that lists the names of primers that together form any given amplicon.
+In it, primer names (exactly matching those in the primer info dataset) need to
+be TAB-separated with one line per amplicon.
 
 .. class:: Warning mark
 

diff --git a/tools/ivar/ivar_trim.xml b/tools/ivar/ivar_trim.xml
@@ -1,4 +1,4 @@
-<tool id="ivar_trim" name="ivar trim" version="@VERSION@+galaxy1">
+<tool id="ivar_trim" name="ivar trim" version="@VERSION@+galaxy2">
     <description>Trim reads in aligned BAM</description>
     <macros>
         <import>macros.xml</import>
@@ -61,7 +61,7 @@
         <conditional name="amplicons">
             <param name="filter_by" type="select"
             label="Filter reads based on amplicon info"
-            help="When you select Yes reads that are not fully contained in any amplicon will be dropped before primer trimming. Info on amplicons can be computed from the primer BED file or provided by the user. This option is currently marked as [Experimental] in ivar, but nevertheless recommended here.">
+            help="When you select Yes, reads that are not fully contained in any amplicon will be dropped before primer trimming. This option is currently marked as [Experimental] in ivar, but nevertheless recommended here. Info on amplicons can be computed from suitable primer BED files (see tool help below) or provided by the user. ">
                 <option value="">No, allow reads to extend beyond amplicon boundaries</option>
                 <option value="yes_compute">Yes, drop reads that extend beyond amplicon boundaries</option>
                 <option value="yes">Yes, drop reads that extend beyond amplicon boundaries and use my amplicon info file</option>
@@ -150,10 +150,25 @@ soft-clipped.
 Optionally, the tool can also discard reads that do not fully map to within any
 amplicon. Such reads are likely to be wet-lab or mapping artefacts and removing
 them can increase variant calling precision. To calculate the extent of
-expected amplicons the tool requires an additional amplicon info dataset that
-lists the names of primers that together form any given amplicon. Primer names
-(exactly matching those in the primer info dataset) need to be TAB-separated
-with one line per amplicon.
+expected amplicons the tool needs to know which primers work together to form
+an amplicon. The tool can try to deduce this info from the names of the primers
+found in the primer info dataset. This will require a primer naming scheme
+following the regex pattern::
+
+  .*_(?P<amplicon_number>\d+).*_(?P<primer_orientation>L(?:EFT)?|R(?:IGHT)?)
+
+*i.e.*, the following schemes will work (and get parsed as):
+
+- ``nCoV-2019_1_LEFT`` (forward primer of amplicon 1)
+
+- ``400_2_out_R`` (reverse primer of amplicon 2)
+
+- ``QIAseq_163-2_LEFT`` (forward primer of amplicon 163)
+
+Alternatively, you can specify the amplicon information explicitly through a
+dataset that lists the names of primers that together form any given amplicon.
+In it, primer names (exactly matching those in the primer info dataset) need to
+be TAB-separated with one line per amplicon.
 If the primer scheme has more than two primers contributing to a given amplicon
 (in schemes using alternate primers), you can (in this Galaxy tool only)
 specify all of them on one line and the tool will calculate the maximum extent

diff --git a/tools/ivar/write_amplicon_info_file.py b/tools/ivar/write_amplicon_info_file.py
@@ -3,48 +3,49 @@
 import argparse
 import re
 
-AMPLICON_NAME_RE = r'.*_(?P<num>\d+)_[^0-9]*(?P<name>L(?:EFT)?|R(?:IGHT)?)'
 
-
-def primer_info_to_position(name):
-    position = 0
-    re_match = re.match(AMPLICON_NAME_RE, name)
-    if re_match is None:
-        raise ValueError("{} does not match expected amplicon name format".format(name))
-    side = re_match.group('name')
-    num = re_match.group('num')
-    if side == 'RIGHT' or side == 'R':
-        position += 1000
-    if num is not None:
-        position += int(num)
-    return position
+AMPLICON_PAT = re.compile(r'.*_(?P<num>\d+).*_(?P<name>L(?:EFT)?|R(?:IGHT)?)')
 
 
 def write_amplicon_info_file(bed_file, amplicon_info_file):
     amplicon_sets = {}
-    amplicon_ids = set()
     for line in bed_file:
         fields = line.strip().split('\t')
+        start = int(fields[1])
         name = fields[3]
-        re_match = re.match(AMPLICON_NAME_RE, name)
+        re_match = AMPLICON_PAT.match(name)
         if re_match is None:
-            raise ValueError("{} does not match expected amplicon name format".format(name))
+            raise ValueError(
+                '{} does not match expected amplicon name format'.format(name)
+            )
         amplicon_id = int(re_match.group('num'))
         amplicon_set = amplicon_sets.get(amplicon_id, [])
-        amplicon_set.append(name)
+        amplicon_set.append((name, start))
         amplicon_sets[amplicon_id] = amplicon_set
-        amplicon_ids.add(amplicon_id)
 
-    for id in sorted(list(amplicon_ids)):
-        amplicon_info = '\t'.join([name for name in sorted(amplicon_sets[id], key=primer_info_to_position)]) + '\n'
+    # write amplicons sorted by number with primers sorted by start position
+    for id in sorted(amplicon_sets):
+        amplicon_info = '\t'.join(
+            [name for name, start in sorted(
+                amplicon_sets[id], key=lambda x: x[1]
+            )]
+        ) + '\n'
         amplicon_info_file.write(amplicon_info)
     amplicon_info_file.close()
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Write an amplicon info file for iVar from a BED file describing primer positions')
-    parser.add_argument('bed_file', type=argparse.FileType(), help='Primer BED file')
-    parser.add_argument('amplicon_info_file', type=argparse.FileType('w'), help='Output file: amplicon info file in TSV format')
+    parser = argparse.ArgumentParser(
+        description='Write an amplicon info file for iVar '
+                    'from a BED file describing primer positions'
+    )
+    parser.add_argument(
+        'bed_file', type=argparse.FileType(), help='Primer BED file'
+    )
+    parser.add_argument(
+        'amplicon_info_file', type=argparse.FileType('w'),
+        help='Output file: amplicon info file in TSV format'
+    )
     args = parser.parse_args()
 
     write_amplicon_info_file(args.bed_file, args.amplicon_info_file)