-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Lazy records are variant records that are lazily-evaluated. Their fields are not necessarily valid, but the buffer is guaranteed to be record-like.
- Loading branch information
Showing
8 changed files
with
411 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
//! Lazily-evaluated VCF record. | ||
|
||
pub mod record; | ||
|
||
pub use self::record::Record; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
//! Lazily-evaluated VCF record and fields. | ||
|
||
mod bounds; | ||
mod genotypes; | ||
|
||
use self::bounds::Bounds; | ||
pub use self::genotypes::Genotypes; | ||
|
||
/// An immutable, lazily-evaluated VCF record. | ||
pub struct Record { | ||
pub(crate) buf: String, | ||
pub(crate) bounds: Bounds, | ||
} | ||
|
||
impl Record { | ||
/// Returns the chromosome. | ||
pub fn chromosome(&self) -> &str { | ||
&self.buf[self.bounds.chromosome_range()] | ||
} | ||
|
||
/// Returns the position. | ||
pub fn position(&self) -> &str { | ||
&self.buf[self.bounds.position_range()] | ||
} | ||
|
||
/// Returns the IDs. | ||
pub fn ids(&self) -> &str { | ||
&self.buf[self.bounds.ids_range()] | ||
} | ||
|
||
/// Returns the reference bases. | ||
pub fn reference_bases(&self) -> &str { | ||
&self.buf[self.bounds.reference_bases_range()] | ||
} | ||
|
||
/// Returns the alternate bases. | ||
pub fn alternate_bases(&self) -> &str { | ||
&self.buf[self.bounds.alternatve_bases_range()] | ||
} | ||
|
||
/// Returns the quality score. | ||
pub fn quality_score(&self) -> &str { | ||
&self.buf[self.bounds.quality_score_range()] | ||
} | ||
|
||
/// Returns the filters. | ||
pub fn filters(&self) -> &str { | ||
&self.buf[self.bounds.filters_range()] | ||
} | ||
|
||
/// Returns the info. | ||
pub fn info(&self) -> &str { | ||
&self.buf[self.bounds.info_range()] | ||
} | ||
|
||
/// Returns the genotypes. | ||
pub fn genotypes(&self) -> Genotypes<'_> { | ||
let buf = &self.buf[self.bounds.genotypes_range()]; | ||
Genotypes::new(buf) | ||
} | ||
} | ||
|
||
impl Default for Record { | ||
fn default() -> Self { | ||
let buf = String::from("sq01.A...."); | ||
|
||
let bounds = Bounds { | ||
chromosome_end: 3, | ||
position_end: 4, | ||
ids_end: 5, | ||
reference_bases_end: 6, | ||
alternate_bases_end: 7, | ||
quality_score_end: 8, | ||
filters_end: 9, | ||
info_end: 10, | ||
}; | ||
|
||
Self { buf, bounds } | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
use std::ops::{Range, RangeFrom}; | ||
|
||
/// End offsets of the fixed fields of a raw record.
///
/// Each field starts where the previous one ends (the chromosome starts at offset 0), so only
/// the exclusive end offset of each field is stored. Everything from `info_end` onward is
/// treated as the genotypes.
pub struct Bounds {
    /// Exclusive end offset of the chromosome.
    pub chromosome_end: usize,
    /// Exclusive end offset of the position.
    pub position_end: usize,
    /// Exclusive end offset of the IDs.
    pub ids_end: usize,
    /// Exclusive end offset of the reference bases.
    pub reference_bases_end: usize,
    /// Exclusive end offset of the alternate bases.
    pub alternate_bases_end: usize,
    /// Exclusive end offset of the quality score.
    pub quality_score_end: usize,
    /// Exclusive end offset of the filters.
    pub filters_end: usize,
    /// Exclusive end offset of the info.
    pub info_end: usize,
}

impl Bounds {
    /// Returns the range of the chromosome: `0..chromosome_end`.
    pub fn chromosome_range(&self) -> Range<usize> {
        0..self.chromosome_end
    }

    /// Returns the range of the position: `chromosome_end..position_end`.
    pub fn position_range(&self) -> Range<usize> {
        self.chromosome_end..self.position_end
    }

    /// Returns the range of the IDs: `position_end..ids_end`.
    pub fn ids_range(&self) -> Range<usize> {
        self.position_end..self.ids_end
    }

    /// Returns the range of the reference bases: `ids_end..reference_bases_end`.
    pub fn reference_bases_range(&self) -> Range<usize> {
        self.ids_end..self.reference_bases_end
    }

    /// Returns the range of the alternate bases: `reference_bases_end..alternate_bases_end`.
    pub fn alternate_bases_range(&self) -> Range<usize> {
        self.reference_bases_end..self.alternate_bases_end
    }

    /// Misspelled alias of [`Self::alternate_bases_range`].
    ///
    /// Kept under its original name so that existing callers keep compiling.
    pub fn alternatve_bases_range(&self) -> Range<usize> {
        self.alternate_bases_range()
    }

    /// Returns the range of the quality score: `alternate_bases_end..quality_score_end`.
    pub fn quality_score_range(&self) -> Range<usize> {
        self.alternate_bases_end..self.quality_score_end
    }

    /// Returns the range of the filters: `quality_score_end..filters_end`.
    pub fn filters_range(&self) -> Range<usize> {
        self.quality_score_end..self.filters_end
    }

    /// Returns the range of the info: `filters_end..info_end`.
    pub fn info_range(&self) -> Range<usize> {
        // The info field follows the filters; it must not reuse the filters range.
        self.filters_end..self.info_end
    }

    /// Returns the open-ended range of the genotypes: `info_end..`.
    pub fn genotypes_range(&self) -> RangeFrom<usize> {
        self.info_end..
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
mod sample; | ||
|
||
use std::{io, iter}; | ||
|
||
pub use self::sample::Sample; | ||
use crate::record::{FIELD_DELIMITER, MISSING_FIELD}; | ||
|
||
/// Raw VCF record genotypes. | ||
#[derive(Debug, Eq, PartialEq)] | ||
pub struct Genotypes<'a>(&'a str); | ||
|
||
impl<'a> Genotypes<'a> { | ||
pub(super) fn new(buf: &'a str) -> Self { | ||
Self(buf) | ||
} | ||
|
||
/// Returns whether there may be any genotypes. | ||
pub fn is_empty(&self) -> bool { | ||
let is_missing = self | ||
.0 | ||
.split(FIELD_DELIMITER) | ||
.next() | ||
.map(|s| s == MISSING_FIELD) | ||
.unwrap_or_default(); | ||
|
||
self.0.is_empty() || is_missing | ||
} | ||
|
||
/// Returns an iterator over keys. | ||
pub fn keys(&self) -> io::Result<Box<dyn Iterator<Item = &str> + '_>> { | ||
const DELIMITER: char = ':'; | ||
|
||
if self.is_empty() { | ||
return Ok(Box::new(iter::empty())); | ||
} | ||
|
||
let (raw_format, _) = self | ||
.0 | ||
.split_once(FIELD_DELIMITER) | ||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "missing field separator"))?; | ||
|
||
Ok(Box::new(raw_format.split(DELIMITER))) | ||
} | ||
|
||
/// Returns an iterator over samples. | ||
pub fn samples(&self) -> io::Result<Box<dyn Iterator<Item = Option<Sample<'_>>> + '_>> { | ||
if self.is_empty() { | ||
return Ok(Box::new(iter::empty())); | ||
} | ||
|
||
let (_, raw_samples) = self | ||
.0 | ||
.split_once(FIELD_DELIMITER) | ||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "missing field separator"))?; | ||
|
||
Ok(Box::new(raw_samples.split(FIELD_DELIMITER).map( | ||
|s| match s { | ||
"." => None, | ||
_ => Some(Sample::new(s)), | ||
}, | ||
))) | ||
} | ||
} | ||
|
||
impl<'a> AsRef<str> for Genotypes<'a> { | ||
fn as_ref(&self) -> &str { | ||
self.0 | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    // An empty buffer and a buffer whose first field is `.` both count as empty.
    #[test]
    fn test_is_empty() {
        let blank = Genotypes::new("");
        assert!(blank.is_empty());

        let missing = Genotypes::new(".\t.");
        assert!(missing.is_empty());

        let populated = Genotypes::new("GT:GQ\t0|0:13");
        assert!(!populated.is_empty());
    }

    // Keys come from the first field; empty genotypes yield none.
    #[test]
    fn test_keys() -> io::Result<()> {
        let blank = Genotypes::new("");
        assert!(blank.keys()?.next().is_none());

        let missing = Genotypes::new(".\t.");
        assert!(missing.keys()?.next().is_none());

        let populated = Genotypes::new("GT:GQ\t0|0:13");
        assert_eq!(populated.keys()?.collect::<Vec<_>>(), ["GT", "GQ"]);

        Ok(())
    }

    // Samples come from the fields after the keys; a `.` sample is `None`.
    #[test]
    fn test_samples() -> io::Result<()> {
        let blank = Genotypes::new("");
        assert!(blank.samples()?.next().is_none());

        let missing = Genotypes::new(".\t.");
        assert!(missing.samples()?.next().is_none());

        let populated = Genotypes::new("GT:GQ\t0|0:13\t.");
        assert_eq!(
            populated.samples()?.collect::<Vec<_>>(),
            [Some(Sample::new("0|0:13")), None]
        );

        Ok(())
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
use std::iter; | ||
|
||
/// A raw VCF record genotypes sample. | ||
#[derive(Debug, Eq, PartialEq)] | ||
pub struct Sample<'a>(&'a str); | ||
|
||
impl<'a> Sample<'a> { | ||
pub(super) fn new(buf: &'a str) -> Self { | ||
Self(buf) | ||
} | ||
|
||
pub fn values(&self) -> Box<dyn Iterator<Item = Option<&str>> + '_> { | ||
const MISSING: &str = "."; | ||
const DELIMITER: char = ':'; | ||
|
||
if self.0 == MISSING { | ||
return Box::new(iter::empty()); | ||
} | ||
|
||
Box::new(self.0.split(DELIMITER).map(|s| match s { | ||
"." => None, | ||
_ => Some(s), | ||
})) | ||
} | ||
} | ||
|
||
impl<'a> AsRef<str> for Sample<'a> { | ||
fn as_ref(&self) -> &str { | ||
self.0 | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    // A wholly missing sample has no values; a `.` value inside a sample is `None`.
    #[test]
    fn test_values() {
        let missing = Sample::new(".");
        assert!(missing.values().next().is_none());

        let partial = Sample::new("0|0:.");
        assert_eq!(partial.values().collect::<Vec<_>>(), [Some("0|0"), None]);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.