Skip to content

Commit

Permalink
BED file coordinates should be treated as 0-start, half-open intervals
Browse files Browse the repository at this point in the history
In addition, tests have been added to confirm it works as expected.

Fixes nextstrain#511
  • Loading branch information
Julien BORDELLIER committed Apr 5, 2020
1 parent e03e1da commit 04dfdee
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 1 deletion.
3 changes: 2 additions & 1 deletion augur/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def load_excluded_sites(excluded_sites_file):
is_bed_format = True
bed = pd.read_csv(excluded_sites_file, sep='\t')
for index, row in bed.iterrows():
strip_pos.extend(list(range(row[1], row[2]+1)))
strip_pos.extend(list(range(row[1], row[2])))
else:
# Next, check for DRM-file format or site-per-line format.
with open(excluded_sites_file, 'r') as ifile:
Expand All @@ -246,6 +246,7 @@ def load_excluded_sites(excluded_sites_file):

# If the given sites are not in BED format, they are one-based positions and
# need to be internally adjusted to zero-based positions.
# http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/
if not is_bed_format:
strip_pos = strip_pos - 1

Expand Down
62 changes: 62 additions & 0 deletions tests/test_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""
Unit tests for building a tree
"""
import sys
from pathlib import Path
import pytest
import Bio
import pandas as pd

# we assume (and assert) that this script is running from the tests/ directory
sys.path.append(str(Path(__file__).parent.parent.parent))

from augur import tree

@pytest.fixture
def alignment():
return Bio.AlignIO.read("tests/data/aa-seq_h3n2_ha_2y_HA1.fasta", "fasta")

@pytest.fixture
def exclude_sites_bed(tmpdir):
df = pd.DataFrame([
[42, 45],
[40, 41],
[158367030, 158367031],
[40, 41],
])
filename = str(tmpdir / "exclude_sites.bed")
df.to_csv(filename, sep='\t')
return filename

@pytest.fixture
def exclude_sites_txt(tmpdir, mocker):
filename = str(tmpdir / "exclude_sites.txt")
with open(filename, "w") as f:
f.write("618\n")
f.write("617\n")
f.write("617\n")
return filename

@pytest.fixture
def exclude_sites_drm(tmpdir):
filename = str(tmpdir / "exclude_sites.drm")
with open(filename, "w") as f:
f.write("site\tvalue\n")
f.write("site1\t618\n")
f.write("site2\t617\n")
f.write("site3\t618\n")
return filename

def test_load_excluded_sites_empty_file():
assert tree.load_excluded_sites(None).tolist() == []

def test_load_excluded_sites_bed(exclude_sites_bed):
# load_excluded_sites should treat BED files as 0-start,half-open intervals
# http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/
assert tree.load_excluded_sites(exclude_sites_bed).tolist() == [40, 42, 43, 44, 158367030]

def test_load_excluded_sites_txt(exclude_sites_txt):
assert tree.load_excluded_sites(exclude_sites_txt).tolist() == [616, 617]

def test_load_excluded_sites_drm(exclude_sites_drm):
assert tree.load_excluded_sites(exclude_sites_drm).tolist() == [616, 617]

0 comments on commit 04dfdee

Please sign in to comment.