Skip to content

Commit

Permalink
Exclude reads that have been trimmed to nothing, as part of #530.
Browse files Browse the repository at this point in the history
Also upgrade to Python 3.8.
  • Loading branch information
donkirkby committed Jun 17, 2020
1 parent 92681a3 commit c3cd591
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 29 deletions.
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,14 @@
# If you omit the `--target` tag altogether, `docker build` will build
# the development image.

FROM python:3.7 as production
FROM python:3.8 as production

MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/MiCall

## Prerequisites
RUN apt-get update -qq --fix-missing && apt-get install -qq -y \
unzip \
wget \
libpython3.7-dev \
&& rm -rf /var/lib/apt/lists/*

RUN wget -qO rustup.sh https://sh.rustup.rs && \
Expand Down
8 changes: 4 additions & 4 deletions Singularity
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,15 @@ From: centos:7
yum install -q -y unzip wget fontconfig bzip2-devel xz-devel openssl-devel libffi-devel

echo ===== Installing Python ===== >/dev/null
wget -q https://www.python.org/ftp/python/3.7.6/Python-3.7.6.tgz
tar xzf Python*
rm Python*.tgz
wget -q https://www.python.org/ftp/python/3.8.3/Python-3.8.3.tar.xz
tar xJf Python*
rm Python*.xz
cd Python*
./configure --enable-optimizations
make altinstall
cd ..
rm -rf Python*
ln -s /usr/local/bin/python3.7 /usr/local/bin/python3
ln -s /usr/local/bin/python3.8 /usr/local/bin/python3

echo ===== Installing Rust and merge-mates ===== >/dev/null
yum install -q -y rust cargo
Expand Down
52 changes: 29 additions & 23 deletions micall/core/trim_fastqs.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ def parse_args():
return parser.parse_args()


def trim(original_fastq_filenames,
bad_cycles_filename,
trimmed_fastq_filenames,
use_gzip=True,
summary_file=None,
def trim(original_fastq_filenames: typing.Sequence[str],
bad_cycles_filename: str,
trimmed_fastq_filenames: typing.Sequence[str],
use_gzip: bool = True,
summary_file: typing.TextIO = None,
skip: typing.Tuple[str] = ()):
"""
Expand Down Expand Up @@ -95,14 +95,14 @@ def trim(original_fastq_filenames,
cycle_sign = 1 - 2*i
with open(src_name, 'rb') as src, open(dest_name, 'w') as dest:
censor(src, bad_cycles, dest, use_gzip, summary_writer, cycle_sign)
logger.info('Finished censoring in %s for %s.',
datetime.now() - start_time,
trimmed_fastq_filenames[0])

cut_all(censored_filenames[0],
censored_filenames[1],
trimmed_fastq_filenames[0],
trimmed_fastq_filenames[1],
logger.debug('Finished censoring in %s for %s.',
datetime.now() - start_time,
trimmed_fastq_filenames[0])

cut_all(Path(censored_filenames[0]),
Path(censored_filenames[1]),
Path(trimmed_fastq_filenames[0]),
Path(trimmed_fastq_filenames[1]),
skip)
purge_temp_files(censored_filenames)

Expand All @@ -127,9 +127,9 @@ def cut_all(censored_fastq1: Path,
censored_fastq2,
dedapted_filenames[0],
dedapted_filenames[1])
logger.info('Trimmed adapters in %s for %s.',
datetime.now() - start_time,
trimmed_fastq1)
logger.debug('Trimmed adapters in %s for %s.',
datetime.now() - start_time,
trimmed_fastq1)
if TrimSteps.primers in skip:
shutil.copy(str(dedapted_filenames[0]), str(trimmed_fastq1))
shutil.copy(str(dedapted_filenames[1]), str(trimmed_fastq2))
Expand Down Expand Up @@ -246,13 +246,19 @@ def combine_primer_trimming(original_fastq1: Path,
rtrimmed_fastq2: Path,
trimmed_fastq1: Path,
trimmed_fastq2: Path):
for original_fastq, start_trimmed_fastq, end_trimmed_fastq, trimmed_fastq in (
(original_fastq1, ltrimmed_fastq1, rtrimmed_fastq1, trimmed_fastq1),
(original_fastq2, rtrimmed_fastq2, ltrimmed_fastq2, trimmed_fastq2)):
trimmed_sequences = cut_primer_dimer_sequences(original_fastq,
start_trimmed_fastq,
end_trimmed_fastq)
SeqIO.write(trimmed_sequences, trimmed_fastq, 'fastq')
trimmed_sequences1 = cut_primer_dimer_sequences(
original_fastq1,
start_trimmed_fastq=ltrimmed_fastq1,
end_trimmed_fastq=rtrimmed_fastq1)
trimmed_sequences2 = cut_primer_dimer_sequences(
original_fastq2,
start_trimmed_fastq=rtrimmed_fastq2,
end_trimmed_fastq=ltrimmed_fastq2)
with trimmed_fastq1.open('w') as f1, trimmed_fastq2.open('w') as f2:
for seq1, seq2 in zip(trimmed_sequences1, trimmed_sequences2):
if len(seq1) > 0 and len(seq2) > 0:
SeqIO.write([seq1], f1, 'fastq')
SeqIO.write([seq2], f2, 'fastq')


def cut_primer_dimer_sequences(original_fastq: Path,
Expand Down
14 changes: 14 additions & 0 deletions micall/tests/test_trim_fastqs.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,16 @@ def test_trim(tmpdir):
# Trimmed to nothing
'',
# Trimmed to nothing
),
('one empty read',
'TGGAAATACCCACAAGTTAATGGTTTAACCTGTCTCTTATACACATCTCCGAGCCCACGAGACACTACCTGGAA',
# nCoV-2019_18_LEFT ][ rev(ADAPTER2) ][ garbage
'CGTTGGGAGTGAATTAGCCCTTCCACTGTCTCTTATACACATCTGACGCTGCCGACGAAGGTTCTCAGGA',
# rev(REF) ][ rev(ADAPTER1) ][ garbage
'',
# Trimmed to nothing
'',
# Trimmed to nothing, because mate was.
)
])
def test_cut_adapters(tmpdir: str,
Expand Down Expand Up @@ -487,6 +497,10 @@ def test_cut_adapters(tmpdir: str,


def build_fastq(read_sequence):
if not read_sequence:
# Eliminate reads that get trimmed to nothing.
return ''

# Write two reads in the file to test primer dimer caching.
expected_quality1 = 'A' * len(read_sequence)
expected_trimmed1 = f'''\
Expand Down

0 comments on commit c3cd591

Please sign in to comment.