From 6043010c9497f04868d4bed2fb2b92db7a9322ce Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 3 Feb 2021 18:21:25 -0500 Subject: [PATCH 1/4] fix parsing of priorities tsv file to allow spaces in sequence IDs When parsing priorities.tsv files, an error could be encountered if a sequence name contains spaces, as `.split()` breaks not only on the tabs delimiting the file, but any whitespace character. This replaces `.split()` with `.split('\t')` to address this and preserve sequence names containing spaces. --- augur/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/filter.py b/augur/filter.py index 7ef1d789e..5aa6cf431 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -60,7 +60,7 @@ def read_priority_scores(fname): with open(fname, encoding='utf-8') as pfile: return defaultdict(float, { elems[0]: float(elems[1]) - for elems in (line.strip().split() for line in pfile.readlines()) + for elems in (line.strip().split('\t') for line in pfile.readlines()) }) except Exception as e: print(f"ERROR: missing or malformed priority scores file {fname}", file=sys.stderr) From 9891eacbc397ce7550766e9e7be562f107ecf93a Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 11 Feb 2021 13:32:24 -0500 Subject: [PATCH 2/4] in filter.py::read_priority_scores(), only split by tab if a tab is present in the line in filter.py::read_priority_scores(), only split by tab if a tab is present in the line, to allow backward compatibility with space-delimited files --- augur/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/filter.py b/augur/filter.py index 5aa6cf431..ea0a9cf17 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -60,7 +60,7 @@ def read_priority_scores(fname): with open(fname, encoding='utf-8') as pfile: return defaultdict(float, { elems[0]: float(elems[1]) - for elems in (line.strip().split('\t') for line in pfile.readlines()) + for elems in (line.strip().split() if '\t' not in line 
else line.strip().split('\t') for line in pfile.readlines()) }) except Exception as e: print(f"ERROR: missing or malformed priority scores file {fname}", file=sys.stderr) From 8ce21a4a4a815a53fca4a07a93742886a277c2aa Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 11 Feb 2021 13:46:43 -0500 Subject: [PATCH 3/4] split on tabs if tabs are present as more common case --- augur/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/filter.py b/augur/filter.py index ea0a9cf17..463328ed8 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -60,7 +60,7 @@ def read_priority_scores(fname): with open(fname, encoding='utf-8') as pfile: return defaultdict(float, { elems[0]: float(elems[1]) - for elems in (line.strip().split() if '\t' not in line else line.strip().split('\t') for line in pfile.readlines()) + for elems in (line.strip().split('\t') if '\t' in line else line.strip().split() for line in pfile.readlines()) }) except Exception as e: print(f"ERROR: missing or malformed priority scores file {fname}", file=sys.stderr) From 190ae739cc614cad47a53860b09301084be79135 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 12 Feb 2021 00:49:39 -0500 Subject: [PATCH 4/4] add unit test for parsing tab-delimited priorities file add unit test for parsing tab-delimited priorities file, where column 1 has spaces in the values. Thanks to @huddlej for providing this. 
--- tests/test_filter.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_filter.py b/tests/test_filter.py index 3b6354474..1d4399bbb 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -57,6 +57,12 @@ def mock_run_shell_command(mocker): mocker.patch("augur.filter.run_shell_command") +@pytest.fixture +def mock_priorities_file_valid_with_spaces_and_tabs(mocker): + mocker.patch( + "builtins.open", mocker.mock_open(read_data="strain 1\t5\nstrain 2\t6\nstrain 3\t8\n") + ) + class TestFilter: def test_read_vcf_compressed(self): seq_keep, all_seq = augur.filter.read_vcf( @@ -89,6 +95,14 @@ def test_read_priority_scores_malformed(self, mock_priorities_file_malformed): # builtins.open is stubbed, but we need a valid file to satisfy the existence check augur.filter.read_priority_scores("tests/builds/tb/data/lee_2015.vcf") + def test_read_priority_scores_valid_with_spaces_and_tabs(self, mock_priorities_file_valid_with_spaces_and_tabs): + # builtins.open is stubbed, but we need a valid file to satisfy the existence check + priorities = augur.filter.read_priority_scores( + "tests/builds/tb/data/lee_2015.vcf" + ) + + assert priorities == {"strain 1": 5, "strain 2": 6, "strain 3": 8} + def test_read_priority_scores_does_not_exist(self): with pytest.raises(FileNotFoundError): augur.filter.read_priority_scores("/does/not/exist.txt")