Write working data to disk in aln2counts for issue #393.
Add a utility for sorting SAM files.
Add third-party license information to README.
donkirkby committed Jul 11, 2017
1 parent 1dde5ce commit 5e5c20d
Showing 8 changed files with 271 additions and 48 deletions.
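The core of the change is that aln2counts no longer keeps all of its working read counts in memory: when an output file path is available, the insertion read counts spill to files on disk. Below is a minimal sketch of that pattern. The names `BigCounter`, `file_prefix`, `clear()` and `most_common()` come from the diff that follows; everything else (the scratch path, the increments, and any other behaviour of `micall.utils.big_counter.BigCounter`) is assumed for illustration and may differ from the real class.

import os
from collections import Counter

from micall.utils.big_counter import BigCounter


def create_counter(insert_file_name, base_name):
    """ Count in memory when there is no output path, otherwise count on disk.

    Mirrors the pattern in InsertionWriter.create_counter() below; the exact
    BigCounter interface is an assumption based on the calls visible in the diff.
    """
    if insert_file_name is None:
        return Counter()
    dirname = os.path.dirname(insert_file_name)
    file_prefix = os.path.join(os.path.abspath(dirname), base_name)
    return BigCounter(file_prefix=file_prefix)


nuc_seqs = create_counter('scratch/insertions.csv', 'nuc_read_counts')
nuc_seqs['ACTGGG'] += 9  # accumulate read counts like a normal Counter (assumed)
nuc_seqs['ACTAAA'] += 8
for seq, count in nuc_seqs.most_common():
    print(seq, count)
nuc_seqs.clear()  # reset between groups of reads without holding them all in memory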
27 changes: 13 additions & 14 deletions CONTRIBUTING.md
@@ -272,20 +272,19 @@ and the MiseqQCReport so you can download QC data and upload results.

### Looking at SAM files ###
When you don't understand the pipeline's output, it can be helpful to look at
the raw reads in a sequence viewer like [Tablet][tablet]. Change the settings
file on your workstation to not delete the temp folders, then run the pipeline
on a run with a single sample. Look through the temp folders to find the one
for the step you're interested in. For the remap step, the final remap results
are stored in a SAM file named for the seed region. You also have to edit the
`cfe.fasta` file and rename that seed region to "0", because the SAM file uses
the region name "0" instead of the proper region name. Once you've done that,
you should be able to open an assembly in Tablet using the SAM file and the
edited FASTA file. If the SAM file contains multiple regions, you'll probably
have to sort it:

samtools view -Sb example.sam > example.bam
samtools sort example.bam example.sorted
samtools view -h -o example.sorted.sam example.sorted.bam
the raw reads in a sequence viewer like [Tablet][tablet]. Run the micall_basespace
script on a run with a single sample, like this:

python micall_basespace.py --debug_remap --all_projects --link_run /path/to/run /working/path

The options tell it to write the debug files, use all projects, link to the run
with the sample you're interested in, and put all the working files in the
given folder. Look through the scratch folders under the working path to find
the one for the sample you're interested in. The remap step writes the mapping
results as `debug_remapX_debug.sam` and `debug_remapX_debug_ref.fasta`, where
`X` is the remapping iteration number. You should be able to open an assembly
in Tablet using those two files. If the SAM file contains multiple regions,
you'll probably have to sort it with the `micall/utils/sort_sam.py` script.
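If that script isn't available in your checkout, the coordinate sort Tablet needs is simple enough to sketch in a few lines of Python: read the header lines, then sort the alignment lines by reference name and position. This is only an illustration of the idea, not the actual `micall/utils/sort_sam.py`, whose interface may differ:

import sys


def sort_sam(in_path, out_path):
    """ Sort a SAM file by reference name and position so Tablet can load it. """
    headers = []
    alignments = []
    with open(in_path) as sam_file:
        for line in sam_file:
            if line.startswith('@'):
                headers.append(line)  # keep header lines in their original order
            else:
                fields = line.split('\t')
                # SAM columns: RNAME is field 3, POS is field 4 (1-based).
                alignments.append((fields[2], int(fields[3]), line))
    alignments.sort()
    with open(out_path, 'w') as out_file:
        out_file.writelines(headers)
        out_file.writelines(line for _rname, _pos, line in alignments)


if __name__ == '__main__':
    sort_sam(sys.argv[1], sys.argv[2])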

[tablet]: http://ics.hutton.ac.uk/tablet/

20 changes: 19 additions & 1 deletion README.md
@@ -34,7 +34,25 @@ example, if you want to run a changed version of the program on a network server
without publishing the changed source code, [contact us][contact] about
purchasing a license.

## Third Party Components ##
MiCall makes use of several open-source tools. Here is a list of tools with
their licenses.

Requests is distributed under the Apache 2.0 license.

Python 3.4 is distributed under the [Python 3.4 license][python].

Bowtie2 and Python-Levenshtein are distributed under the GNU General Public License (GPL).

Matplotlib is distributed under the [Matplotlib license][matplotlib].

Reportlab is distributed under the BSD license.

Pyyaml and Cutadapt are distributed under the MIT license.


[gnu]: http://www.gnu.org/licenses/
[github]: https://github.com/cfe-lab/MiCall
[contact]: http://www.google.com/recaptcha/mailhide/d?k=01yIEKHPqcNIG1ecF-Khum4g==&c=SnES_swjOvb0d22WN4Q30vx9gKyzHNDkTxbY6Y1os4w=

[python]: https://docs.python.org/3.4/license.html
[matplotlib]: http://matplotlib.org/users/license.html
59 changes: 40 additions & 19 deletions micall/core/aln2counts.py
@@ -22,6 +22,7 @@

from micall.core import miseq_logging
from micall.core import project_config
from micall.utils.big_counter import BigCounter
from micall.utils.translation import translate, ambig_dict

AMINO_ALPHABET = 'ACDEFGHIKLMNPQRSTVWY*'
@@ -128,7 +129,7 @@ def enable_callback(self, callback, file_size):

self.callback = callback
self.callback_max = file_size
self.callback_chunk_size = file_size / 100
self.callback_chunk_size = file_size // 100
self.callback_next = self.callback_chunk_size
self.callback(message='... extracting statistics from alignments',
progress=0,
@@ -138,17 +139,14 @@ def _count_reads(self, aligned_reads):
"""
Parses contents of aligned CSV.
:param aligned_reads: a List of Dicts from csv.DictReader
:param aligned_reads: a sequence of Dicts from csv.DictReader
"""

# skip everything if aligned_reads is empty
if len(aligned_reads) > 0:
# these will be the same for all rows, so just assign from the first
first_row = aligned_reads[0]
self.seed = first_row['refname']
self.qcut = first_row['qcut']

for row in aligned_reads:
for i, row in enumerate(aligned_reads):
if i == 0:
# these will be the same for all rows, so just assign from the first
self.seed = row['refname']
self.qcut = row['qcut']
nuc_seq = row['seq']
offset = int(row['offset'])
count = int(row['count'])
@@ -249,8 +247,8 @@ def _map_to_coordinate_ref(self, coordinate_name, coordinate_ref):
self.reading_frames[coordinate_name] = reading_frame
self.consensus[coordinate_name] = consensus
coordinate_inserts = {i*3 - reading_frame for i in range(len(consensus))}
self.inserts[coordinate_name] = coordinate_inserts
prev_conseq_index = None
prev_consensus_nuc_index = None
for coord_index in range(len(coordinate_ref)):
conseq_index = coord2conseq.get(coord_index)
if conseq_index is None:
@@ -269,6 +267,10 @@ def _map_to_coordinate_ref(self, coordinate_name, coordinate_ref):
nuc.count_nucleotides('-', min_count)
else:
seed_amino = frame_seed_aminos[conseq_index]
if prev_conseq_index is None:
coordinate_inserts = {i
for i in coordinate_inserts
if i >= seed_amino.consensus_nuc_index}
prev_conseq_index = conseq_index
if (coordinate_name == self.g2p_overlap_region_name and
self.g2p_overlap_aminos):
@@ -278,6 +280,14 @@ def _map_to_coordinate_ref(self, coordinate_name, coordinate_ref):
report_aminos.append(ReportAmino(seed_amino, coord_index + 1))
if seed_amino.consensus_nuc_index is not None:
coordinate_inserts.remove(seed_amino.consensus_nuc_index)
prev_consensus_nuc_index = seed_amino.consensus_nuc_index
if prev_consensus_nuc_index is None:
coordinate_inserts.clear()
else:
coordinate_inserts = {i
for i in coordinate_inserts
if i <= prev_consensus_nuc_index}
self.inserts[coordinate_name] = coordinate_inserts

self.reports[coordinate_name] = report_aminos

@@ -637,14 +647,13 @@ def align_deletions(self, aligned_reads):
:param aligned_reads: group of rows from the CSV reader of the aligned
reads.
:return: a list of rows from the CSV reader with codon deletions
:return: a generator of rows from the CSV reader with codon deletions
aligned to the codon boundaries.
"""
result = list(aligned_reads)
if self.remap_conseqs is None:
return result
yield from aligned_reads
reading_frames = None
for row in result:
for row in aligned_reads:
if reading_frames is None:
reading_frames = self.load_reading_frames(row['refname'])
seq = row['seq']
Expand Down Expand Up @@ -672,7 +681,7 @@ def align_deletions(self, aligned_reads):
chars.insert(pos, '-')
seq = ''.join(chars)
row['seq'] = seq
return result
yield row

def load_reading_frames(self, seed_name):
""" Calculate reading frames along a consensus sequence.
Expand Down Expand Up @@ -889,7 +898,19 @@ def __init__(self, insert_file):

# {(seed, region): {pos: insert_count}}
self.insert_pos_counts = defaultdict(Counter)
self.seed = self.qcut = self.nuc_seqs = None
self.seed = self.qcut = None
self.insert_file_name = getattr(insert_file, 'name', None)
self.nuc_seqs = self.create_counter('nuc_read_counts') # {nuc_seq: count}

def create_counter(self, base_name):
if self.insert_file_name is None:
counter = Counter()
else:
dirname = os.path.dirname(self.insert_file_name)
file_prefix = os.path.join(os.path.abspath(dirname),
base_name)
counter = BigCounter(file_prefix=file_prefix)
return counter

def start_group(self, seed, qcut):
""" Start a new group of reads.
@@ -899,7 +920,7 @@ def start_group(self, seed, qcut):
"""
self.seed = seed
self.qcut = qcut
self.nuc_seqs = Counter() # {nuc_seq: count}
self.nuc_seqs.clear()

def add_nuc_read(self, offset_sequence, count):
""" Add a read to the group.
@@ -963,7 +984,7 @@ def write(self, inserts, region, report_aminos=None):

# record insertions to CSV
for left, counts in insert_counts.items():
for insert_seq, count in counts.items():
for insert_seq, count in counts.most_common():
insert_before = insert_targets.get(left)
# Only care about insertions in the middle of the sequence,
# so ignore any that come before or after the reference.
61 changes: 48 additions & 13 deletions micall/tests/aln2counts_test.py
@@ -1405,6 +1405,41 @@ def testInsertionBetweenSeedAndCoordinateNucleotideReport(self):

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())

def testInsertionsSortedByCount(self):
# refname,qcut,rank,count,offset,seq
aligned_reads = self.prepareReads("""\
R3-seed,15,0,9,0,CATGAGCGAAAATTTCAGACTGGGCCCCGAGAGCATCAGTTTAAA
R3-seed,15,0,8,0,CATGAGCGAAAATTTCAGACTAAACCCCGAGAGCATCAGTTTAAA
""")
expected_insertions = """\
seed,region,qcut,left,insert,count,before
R3-seed,R3,15,22,G,9,5
R3-seed,R3,15,22,K,8,5
"""

self.report.read(aligned_reads)
self.report.write_insertions()

self.assertMultiLineEqual(expected_insertions,
self.insertion_file.getvalue())

def testInsertionsSortedByLeft(self):
# refname,qcut,rank,count,offset,seq
aligned_reads = self.prepareReads("""\
R3-seed,15,0,9,0,CATGAGCGAAAATTTCAGACTGGGCCCCGAAAAGAGCATCAGTTTAAA
""")
expected_insertions = """\
seed,region,qcut,left,insert,count,before
R3-seed,R3,15,22,G,9,5
R3-seed,R3,15,31,K,9,7
"""

self.report.read(aligned_reads)
self.report.write_insertions()

self.assertMultiLineEqual(expected_insertions,
self.insertion_file.getvalue())

def testInsertionInDifferentReadingFrame(self):
""" Delete part of the first codon to throw off the reading frame.
"""
@@ -1852,7 +1887,7 @@ def testAlignDeletionsWithoutDeletion(self):
offset='0',
seq='AAATTTAGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1867,7 +1902,7 @@ def testAlignDeletionsNoChange(self):
offset='0',
seq='AAA---AGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1882,7 +1917,7 @@ def testAlignDeletionsShiftedRight(self):
offset='0',
seq='AAA---AGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1897,7 +1932,7 @@ def testAlignDeletionsShiftedLeft(self):
offset='0',
seq='AAA---AGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1912,7 +1947,7 @@ def testAlignDeletionsTwoCodons(self):
offset='0',
seq='AAA------CCGAGA')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1927,7 +1962,7 @@ def testAlignDeletionsUsingOffset(self):
offset='1',
seq='AA---AGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1967,7 +2002,7 @@ def testAlignDeletionsUsingReadingFrame1(self):
offset='2',
seq='AAA---AGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2016,7 +2051,7 @@ def testAlignDeletionsMultipleReadingFrames(self):
offset='1',
seq='AAA---CAGTTTTTTTT---CAGCAT')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2031,7 +2066,7 @@ def testCombineDeletions(self):
offset='0',
seq='AATCGC---CCGAGA')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2046,7 +2081,7 @@ def testCombineDeletionsTwoCodons(self):
offset='0',
seq='AAT------CCGAGA')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2061,7 +2096,7 @@ def testCombineDeletionsMaxSpread(self):
offset='0',
seq='AAT---TCAGACCCCCGAGAGCAT')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2076,7 +2111,7 @@ def testCombineDeletionsBeyondMaxSpread(self):
offset='0',
seq='AA--TTCAGACCCCA-GAGAGCAT')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2095,7 +2130,7 @@ def testCombineDeletionsTooCrowded(self):
offset='0',
seq='AA--TTCAGACCCC-CGA---CAT')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

