Write working data to disk in aln2counts for issue #393.
Add a utility for sorting SAM files.
Add third-party license information to README.
donkirkby committed Jul 11, 2017
1 parent 1dde5ce commit 5e5c20d
Showing 8 changed files with 271 additions and 48 deletions.
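The core of the change is that aln2counts no longer keeps all of its working read counts in memory: when an output file path is available, the insertion read counts spill to files on disk. Below is a minimal sketch of that pattern. The names `BigCounter`, `file_prefix`, `clear()` and `most_common()` come from the diff that follows; everything else (the scratch path, the increments, and any other behaviour of `micall.utils.big_counter.BigCounter`) is assumed for illustration and may differ from the real class.

import os
from collections import Counter

from micall.utils.big_counter import BigCounter


def create_counter(insert_file_name, base_name):
    """ Count in memory when there is no output path, otherwise count on disk.

    Mirrors the pattern in InsertionWriter.create_counter() below; the exact
    BigCounter interface is an assumption based on the calls visible in the diff.
    """
    if insert_file_name is None:
        return Counter()
    dirname = os.path.dirname(insert_file_name)
    file_prefix = os.path.join(os.path.abspath(dirname), base_name)
    return BigCounter(file_prefix=file_prefix)


nuc_seqs = create_counter('scratch/insertions.csv', 'nuc_read_counts')
nuc_seqs['ACTGGG'] += 9  # accumulate read counts like a normal Counter (assumed)
nuc_seqs['ACTAAA'] += 8
for seq, count in nuc_seqs.most_common():
    print(seq, count)
nuc_seqs.clear()  # reset between groups of reads without holding them all in memory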
27 changes: 13 additions & 14 deletions CONTRIBUTING.md
@@ -272,20 +272,19 @@ and the MiseqQCReport so you can download QC data and upload results.

### Looking at SAM files ###
When you don't understand the pipeline's output, it can be helpful to look at
the raw reads in a sequence viewer like [Tablet][tablet]. Change the settings
file on your workstation to not delete the temp folders, then run the pipeline
on a run with a single sample. Look through the temp folders to find the one
for the step you're interested in. For the remap step, the final remap results
are stored in a SAM file named for the seed region. You also have to edit the
`cfe.fasta` file and rename that seed region to "0", because the SAM file uses
the region name "0" instead of the proper region name. Once you've done that,
you should be able to open an assembly in Tablet using the SAM file and the
edited FASTA file. If the SAM file contains multiple regions, you'll probably
have to sort it:

samtools view -Sb example.sam > example.bam
samtools sort example.bam example.sorted
samtools view -h -o example.sorted.sam example.sorted.bam
the raw reads in a sequence viewer like [Tablet][tablet]. Run the micall_basespace
script on a run with a single sample, like this:

python micall_basespace.py --debug_remap --all_projects --link_run /path/to/run /working/path

The options tell it to write the debug files, use all projects, link to the run
with the sample you're interested in, and put all the working files in the
given folder. Look through the scratch folders under the working path to find
the one for the sample you're interested in. The remap step writes the mapping
results as `debug_remapX_debug.sam` and `debug_remapX_debug_ref.fasta`, where
`X` is the remapping iteration number. You should be able to open an assembly
in Tablet using those two files. If the SAM file contains multiple regions,
you'll probably have to sort it with the `micall/utils/sort_sam.py` script.
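If that script isn't available in your checkout, the coordinate sort Tablet needs is simple enough to sketch in a few lines of Python: read the header lines, then sort the alignment lines by reference name and position. This is only an illustration of the idea, not the actual `micall/utils/sort_sam.py`, whose interface may differ:

import sys


def sort_sam(in_path, out_path):
    """ Sort a SAM file by reference name and position so Tablet can load it. """
    headers = []
    alignments = []
    with open(in_path) as sam_file:
        for line in sam_file:
            if line.startswith('@'):
                headers.append(line)  # keep header lines in their original order
            else:
                fields = line.split('\t')
                # SAM columns: RNAME is field 3, POS is field 4 (1-based).
                alignments.append((fields[2], int(fields[3]), line))
    alignments.sort()
    with open(out_path, 'w') as out_file:
        out_file.writelines(headers)
        out_file.writelines(line for _rname, _pos, line in alignments)


if __name__ == '__main__':
    sort_sam(sys.argv[1], sys.argv[2])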

[tablet]: http://ics.hutton.ac.uk/tablet/

20 changes: 19 additions & 1 deletion README.md
@@ -34,7 +34,25 @@ example, if you want to run a changed version of the program on a network server
without publishing the changed source code, [contact us][contact] about
purchasing a license.

## Third Party Components ##
MiCall makes use of several open-source tools. Here is a list of tools with
their licenses.

Requests is distributed under the Apache 2.0 license.

Python 3.4 is distributed under the [Python 3.4 license][python].

Bowtie2 and Python-Levenshtein are distributed under the GNU General Public License (GPL).

Matplotlib is distributed under the [Matplotlib license][matplotlib].

Reportlab is distributed under the BSD license.

Pyyaml and Cutadapt are distributed under the MIT license.


[gnu]: http://www.gnu.org/licenses/
[github]: https://github.com/cfe-lab/MiCall
[contact]: http://www.google.com/recaptcha/mailhide/d?k=01yIEKHPqcNIG1ecF-Khum4g==&c=SnES_swjOvb0d22WN4Q30vx9gKyzHNDkTxbY6Y1os4w=

[python]: https://docs.python.org/3.4/license.html
[matplotlib]: http://matplotlib.org/users/license.html
59 changes: 40 additions & 19 deletions micall/core/aln2counts.py
@@ -22,6 +22,7 @@

from micall.core import miseq_logging
from micall.core import project_config
from micall.utils.big_counter import BigCounter
from micall.utils.translation import translate, ambig_dict

AMINO_ALPHABET = 'ACDEFGHIKLMNPQRSTVWY*'
@@ -128,7 +129,7 @@ def enable_callback(self, callback, file_size):

self.callback = callback
self.callback_max = file_size
self.callback_chunk_size = file_size / 100
self.callback_chunk_size = file_size // 100
self.callback_next = self.callback_chunk_size
self.callback(message='... extracting statistics from alignments',
progress=0,
@@ -138,17 +139,14 @@ def _count_reads(self, aligned_reads):
"""
Parses contents of aligned CSV.
:param aligned_reads: a List of Dicts from csv.DictReader
:param aligned_reads: a sequence of Dicts from csv.DictReader
"""

# skip everything if aligned_reads is empty
if len(aligned_reads) > 0:
# these will be the same for all rows, so just assign from the first
first_row = aligned_reads[0]
self.seed = first_row['refname']
self.qcut = first_row['qcut']

for row in aligned_reads:
for i, row in enumerate(aligned_reads):
if i == 0:
# these will be the same for all rows, so just assign from the first
self.seed = row['refname']
self.qcut = row['qcut']
nuc_seq = row['seq']
offset = int(row['offset'])
count = int(row['count'])
@@ -249,8 +247,8 @@ def _map_to_coordinate_ref(self, coordinate_name, coordinate_ref):
self.reading_frames[coordinate_name] = reading_frame
self.consensus[coordinate_name] = consensus
coordinate_inserts = {i*3 - reading_frame for i in range(len(consensus))}
self.inserts[coordinate_name] = coordinate_inserts
prev_conseq_index = None
prev_consensus_nuc_index = None
for coord_index in range(len(coordinate_ref)):
conseq_index = coord2conseq.get(coord_index)
if conseq_index is None:
@@ -269,6 +267,10 @@ def _map_to_coordinate_ref(self, coordinate_name, coordinate_ref):
nuc.count_nucleotides('-', min_count)
else:
seed_amino = frame_seed_aminos[conseq_index]
if prev_conseq_index is None:
coordinate_inserts = {i
for i in coordinate_inserts
if i >= seed_amino.consensus_nuc_index}
prev_conseq_index = conseq_index
if (coordinate_name == self.g2p_overlap_region_name and
self.g2p_overlap_aminos):
@@ -278,6 +280,14 @@ def _map_to_coordinate_ref(self, coordinate_name, coordinate_ref):
report_aminos.append(ReportAmino(seed_amino, coord_index + 1))
if seed_amino.consensus_nuc_index is not None:
coordinate_inserts.remove(seed_amino.consensus_nuc_index)
prev_consensus_nuc_index = seed_amino.consensus_nuc_index
if prev_consensus_nuc_index is None:
coordinate_inserts.clear()
else:
coordinate_inserts = {i
for i in coordinate_inserts
if i <= prev_consensus_nuc_index}
self.inserts[coordinate_name] = coordinate_inserts

self.reports[coordinate_name] = report_aminos

@@ -637,14 +647,13 @@ def align_deletions(self, aligned_reads):
:param aligned_reads: group of rows from the CSV reader of the aligned
reads.
:return: a list of rows from the CSV reader with codon deletions
:return: a generator of rows from the CSV reader with codon deletions
aligned to the codon boundaries.
"""
result = list(aligned_reads)
if self.remap_conseqs is None:
return result
yield from aligned_reads
reading_frames = None
for row in result:
for row in aligned_reads:
if reading_frames is None:
reading_frames = self.load_reading_frames(row['refname'])
seq = row['seq']
Expand Down Expand Up @@ -672,7 +681,7 @@ def align_deletions(self, aligned_reads):
chars.insert(pos, '-')
seq = ''.join(chars)
row['seq'] = seq
return result
yield row

def load_reading_frames(self, seed_name):
""" Calculate reading frames along a consensus sequence.
Expand Down Expand Up @@ -889,7 +898,19 @@ def __init__(self, insert_file):

# {(seed, region): {pos: insert_count}}
self.insert_pos_counts = defaultdict(Counter)
self.seed = self.qcut = self.nuc_seqs = None
self.seed = self.qcut = None
self.insert_file_name = getattr(insert_file, 'name', None)
self.nuc_seqs = self.create_counter('nuc_read_counts') # {nuc_seq: count}

def create_counter(self, base_name):
if self.insert_file_name is None:
counter = Counter()
else:
dirname = os.path.dirname(self.insert_file_name)
file_prefix = os.path.join(os.path.abspath(dirname),
base_name)
counter = BigCounter(file_prefix=file_prefix)
return counter

def start_group(self, seed, qcut):
""" Start a new group of reads.
@@ -899,7 +920,7 @@ def start_group(self, seed, qcut):
"""
self.seed = seed
self.qcut = qcut
self.nuc_seqs = Counter() # {nuc_seq: count}
self.nuc_seqs.clear()

def add_nuc_read(self, offset_sequence, count):
""" Add a read to the group.
@@ -963,7 +984,7 @@ def write(self, inserts, region, report_aminos=None):

# record insertions to CSV
for left, counts in insert_counts.items():
for insert_seq, count in counts.items():
for insert_seq, count in counts.most_common():
insert_before = insert_targets.get(left)
# Only care about insertions in the middle of the sequence,
# so ignore any that come before or after the reference.
61 changes: 48 additions & 13 deletions micall/tests/aln2counts_test.py
@@ -1405,6 +1405,41 @@ def testInsertionBetweenSeedAndCoordinateNucleotideReport(self):

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())

def testInsertionsSortedByCount(self):
# refname,qcut,rank,count,offset,seq
aligned_reads = self.prepareReads("""\
R3-seed,15,0,9,0,CATGAGCGAAAATTTCAGACTGGGCCCCGAGAGCATCAGTTTAAA
R3-seed,15,0,8,0,CATGAGCGAAAATTTCAGACTAAACCCCGAGAGCATCAGTTTAAA
""")
expected_insertions = """\
seed,region,qcut,left,insert,count,before
R3-seed,R3,15,22,G,9,5
R3-seed,R3,15,22,K,8,5
"""

self.report.read(aligned_reads)
self.report.write_insertions()

self.assertMultiLineEqual(expected_insertions,
self.insertion_file.getvalue())

def testInsertionsSortedByLeft(self):
# refname,qcut,rank,count,offset,seq
aligned_reads = self.prepareReads("""\
R3-seed,15,0,9,0,CATGAGCGAAAATTTCAGACTGGGCCCCGAAAAGAGCATCAGTTTAAA
""")
expected_insertions = """\
seed,region,qcut,left,insert,count,before
R3-seed,R3,15,22,G,9,5
R3-seed,R3,15,31,K,9,7
"""

self.report.read(aligned_reads)
self.report.write_insertions()

self.assertMultiLineEqual(expected_insertions,
self.insertion_file.getvalue())

def testInsertionInDifferentReadingFrame(self):
""" Delete part of the first codon to throw off the reading frame.
"""
@@ -1852,7 +1887,7 @@ def testAlignDeletionsWithoutDeletion(self):
offset='0',
seq='AAATTTAGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1867,7 +1902,7 @@ def testAlignDeletionsNoChange(self):
offset='0',
seq='AAA---AGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1882,7 +1917,7 @@ def testAlignDeletionsShiftedRight(self):
offset='0',
seq='AAA---AGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1897,7 +1932,7 @@ def testAlignDeletionsShiftedLeft(self):
offset='0',
seq='AAA---AGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1912,7 +1947,7 @@ def testAlignDeletionsTwoCodons(self):
offset='0',
seq='AAA------CCGAGA')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1927,7 +1962,7 @@ def testAlignDeletionsUsingOffset(self):
offset='1',
seq='AA---AGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -1967,7 +2002,7 @@ def testAlignDeletionsUsingReadingFrame1(self):
offset='2',
seq='AAA---AGG')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2016,7 +2051,7 @@ def testAlignDeletionsMultipleReadingFrames(self):
offset='1',
seq='AAA---CAGTTTTTTTT---CAGCAT')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2031,7 +2066,7 @@ def testCombineDeletions(self):
offset='0',
seq='AATCGC---CCGAGA')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2046,7 +2081,7 @@ def testCombineDeletionsTwoCodons(self):
offset='0',
seq='AAT------CCGAGA')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2061,7 +2096,7 @@ def testCombineDeletionsMaxSpread(self):
offset='0',
seq='AAT---TCAGACCCCCGAGAGCAT')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2076,7 +2111,7 @@ def testCombineDeletionsBeyondMaxSpread(self):
offset='0',
seq='AA--TTCAGACCCCA-GAGAGCAT')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

@@ -2095,7 +2130,7 @@ def testCombineDeletionsTooCrowded(self):
offset='0',
seq='AA--TTCAGACCCC-CGA---CAT')]

reads = self.report.align_deletions(aligned_reads)
reads = list(self.report.align_deletions(aligned_reads))

self.assertEqual(expected_reads, reads)

