From c3cd591be4e9ef0ad98f96797a230639e52a5b5d Mon Sep 17 00:00:00 2001 From: Don Kirkby Date: Wed, 17 Jun 2020 10:21:18 -0700 Subject: [PATCH] Exclude reads that have been trimmed to nothing, as part of #530. Also upgrade to Python 3.8. --- Dockerfile | 3 +- Singularity | 8 ++--- micall/core/trim_fastqs.py | 52 ++++++++++++++++++-------------- micall/tests/test_trim_fastqs.py | 14 +++++++++ 4 files changed, 48 insertions(+), 29 deletions(-) diff --git a/Dockerfile b/Dockerfile index 34c3dba30..a1d595fb0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ # If you omit the `--target` tag altogether, `docker build` will build # the development image. -FROM python:3.7 as production +FROM python:3.8 as production MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/MiCall @@ -32,7 +32,6 @@ MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/MiCall RUN apt-get update -qq --fix-missing && apt-get install -qq -y \ unzip \ wget \ - libpython3.7-dev \ && rm -rf /var/lib/apt/lists/* RUN wget -qO rustup.sh https://sh.rustup.rs && \ diff --git a/Singularity b/Singularity index 6d5a10130..61b9a6003 100644 --- a/Singularity +++ b/Singularity @@ -62,15 +62,15 @@ From: centos:7 yum install -q -y unzip wget fontconfig bzip2-devel xz-devel openssl-devel libffi-devel echo ===== Installing Python ===== >/dev/null - wget -q https://www.python.org/ftp/python/3.7.6/Python-3.7.6.tgz - tar xzf Python* - rm Python*.tgz + wget -q https://www.python.org/ftp/python/3.8.3/Python-3.8.3.tar.xz + tar xJf Python* + rm Python*.xz cd Python* ./configure --enable-optimizations make altinstall cd .. rm -rf Python* - ln -s /usr/local/bin/python3.7 /usr/local/bin/python3 + ln -s /usr/local/bin/python3.8 /usr/local/bin/python3 echo ===== Installing Rust and merge-mates ===== >/dev/null yum install -q -y rust cargo diff --git a/micall/core/trim_fastqs.py b/micall/core/trim_fastqs.py index 661bdb8da..344c055c2 100755 --- a/micall/core/trim_fastqs.py +++ b/micall/core/trim_fastqs.py @@ -54,11 +54,11 @@ def parse_args(): return parser.parse_args() -def trim(original_fastq_filenames, - bad_cycles_filename, - trimmed_fastq_filenames, - use_gzip=True, - summary_file=None, +def trim(original_fastq_filenames: typing.Sequence[str], + bad_cycles_filename: str, + trimmed_fastq_filenames: typing.Sequence[str], + use_gzip: bool = True, + summary_file: typing.TextIO = None, skip: typing.Tuple[str] = ()): """ @@ -95,14 +95,14 @@ def trim(original_fastq_filenames, cycle_sign = 1 - 2*i with open(src_name, 'rb') as src, open(dest_name, 'w') as dest: censor(src, bad_cycles, dest, use_gzip, summary_writer, cycle_sign) - logger.info('Finished censoring in %s for %s.', - datetime.now() - start_time, - trimmed_fastq_filenames[0]) - - cut_all(censored_filenames[0], - censored_filenames[1], - trimmed_fastq_filenames[0], - trimmed_fastq_filenames[1], + logger.debug('Finished censoring in %s for %s.', + datetime.now() - start_time, + trimmed_fastq_filenames[0]) + + cut_all(Path(censored_filenames[0]), + Path(censored_filenames[1]), + Path(trimmed_fastq_filenames[0]), + Path(trimmed_fastq_filenames[1]), skip) purge_temp_files(censored_filenames) @@ -127,9 +127,9 @@ def cut_all(censored_fastq1: Path, censored_fastq2, dedapted_filenames[0], dedapted_filenames[1]) - logger.info('Trimmed adapters in %s for %s.', - datetime.now() - start_time, - trimmed_fastq1) + logger.debug('Trimmed adapters in %s for %s.', + datetime.now() - start_time, + trimmed_fastq1) if TrimSteps.primers in skip: shutil.copy(str(dedapted_filenames[0]), str(trimmed_fastq1)) shutil.copy(str(dedapted_filenames[1]), str(trimmed_fastq2)) @@ -246,13 +246,19 @@ def combine_primer_trimming(original_fastq1: Path, rtrimmed_fastq2: Path, trimmed_fastq1: Path, trimmed_fastq2: Path): - for original_fastq, start_trimmed_fastq, end_trimmed_fastq, trimmed_fastq in ( - (original_fastq1, ltrimmed_fastq1, rtrimmed_fastq1, trimmed_fastq1), - (original_fastq2, rtrimmed_fastq2, ltrimmed_fastq2, trimmed_fastq2)): - trimmed_sequences = cut_primer_dimer_sequences(original_fastq, - start_trimmed_fastq, - end_trimmed_fastq) - SeqIO.write(trimmed_sequences, trimmed_fastq, 'fastq') + trimmed_sequences1 = cut_primer_dimer_sequences( + original_fastq1, + start_trimmed_fastq=ltrimmed_fastq1, + end_trimmed_fastq=rtrimmed_fastq1) + trimmed_sequences2 = cut_primer_dimer_sequences( + original_fastq2, + start_trimmed_fastq=rtrimmed_fastq2, + end_trimmed_fastq=ltrimmed_fastq2) + with trimmed_fastq1.open('w') as f1, trimmed_fastq2.open('w') as f2: + for seq1, seq2 in zip(trimmed_sequences1, trimmed_sequences2): + if len(seq1) > 0 and len(seq2) > 0: + SeqIO.write([seq1], f1, 'fastq') + SeqIO.write([seq2], f2, 'fastq') def cut_primer_dimer_sequences(original_fastq: Path, diff --git a/micall/tests/test_trim_fastqs.py b/micall/tests/test_trim_fastqs.py index 7761d8f34..fe5743812 100644 --- a/micall/tests/test_trim_fastqs.py +++ b/micall/tests/test_trim_fastqs.py @@ -439,6 +439,16 @@ def test_trim(tmpdir): # Trimmed to nothing '', # Trimmed to nothing + ), + ('one empty read', + 'TGGAAATACCCACAAGTTAATGGTTTAACCTGTCTCTTATACACATCTCCGAGCCCACGAGACACTACCTGGAA', + # nCoV-2019_18_LEFT ][ rev(ADAPTER2) ][ garbage + 'CGTTGGGAGTGAATTAGCCCTTCCACTGTCTCTTATACACATCTGACGCTGCCGACGAAGGTTCTCAGGA', + # rev(REF) ][ rev(ADAPTER1) ][ garbage + '', + # Trimmed to nothing + '', + # Trimmed to nothing, because mate was. ) ]) def test_cut_adapters(tmpdir: str, @@ -487,6 +497,10 @@ def test_cut_adapters(tmpdir: str, def build_fastq(read_sequence): + if not read_sequence: + # Eliminate reads that get trimmed to nothing. + return '' + # Write two reads in the file to test primer dimer caching. expected_quality1 = 'A' * len(read_sequence) expected_trimmed1 = f'''\