Skip to content

Commit

Permalink
vcf/lazy: Add a lazy record
Browse files Browse the repository at this point in the history
Lazy records are variant records that are lazily-evaluated. Their fields
are not necessarily valid, but the buffer is guaranteed to be
record-like.
  • Loading branch information
zaeleus committed Aug 16, 2023
1 parent e92696c commit 24cade2
Show file tree
Hide file tree
Showing 8 changed files with 411 additions and 1 deletion.
7 changes: 7 additions & 0 deletions noodles-vcf/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
* vcf/header/record/value/map/contig/builder: Add URL setter
(`Builder::set_url`).

* vcf/lazy: Add a lazy record (`lazy::Record`).

Lazy records are variant records that are lazily-evaluated. Their fields
are not necessarily valid, but the buffer is guaranteed to be record-like.

* vcf/reader: Add `Reader::read_lazy_record` to read lazy records.

* vcf/record/position: Implement `PartialEq<core::Position>` and
`PartialOrd<core::Position>` for `Position` ([#191]).

Expand Down
5 changes: 5 additions & 0 deletions noodles-vcf/src/lazy.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
//! Lazily-evaluated VCF record.

pub mod record;

pub use self::record::Record;
80 changes: 80 additions & 0 deletions noodles-vcf/src/lazy/record.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
//! Lazily-evaluated VCF record and fields.

mod bounds;
mod genotypes;

use self::bounds::Bounds;
pub use self::genotypes::Genotypes;

/// An immutable VCF record whose fields are evaluated lazily.
///
/// Each accessor returns the raw text of one field, sliced out of the record's buffer.
pub struct Record {
    // Raw record text that every accessor slices into.
    pub(crate) buf: String,
    // End offsets of the individual fields within `buf`.
    pub(crate) bounds: Bounds,
}

impl Record {
    /// Returns the raw chromosome field.
    pub fn chromosome(&self) -> &str {
        let range = self.bounds.chromosome_range();
        &self.buf[range]
    }

    /// Returns the raw position field.
    pub fn position(&self) -> &str {
        let range = self.bounds.position_range();
        &self.buf[range]
    }

    /// Returns the raw IDs field.
    pub fn ids(&self) -> &str {
        let range = self.bounds.ids_range();
        &self.buf[range]
    }

    /// Returns the raw reference bases field.
    pub fn reference_bases(&self) -> &str {
        let range = self.bounds.reference_bases_range();
        &self.buf[range]
    }

    /// Returns the raw alternate bases field.
    pub fn alternate_bases(&self) -> &str {
        let range = self.bounds.alternatve_bases_range();
        &self.buf[range]
    }

    /// Returns the raw quality score field.
    pub fn quality_score(&self) -> &str {
        let range = self.bounds.quality_score_range();
        &self.buf[range]
    }

    /// Returns the raw filters field.
    pub fn filters(&self) -> &str {
        let range = self.bounds.filters_range();
        &self.buf[range]
    }

    /// Returns the raw info field.
    pub fn info(&self) -> &str {
        let range = self.bounds.info_range();
        &self.buf[range]
    }

    /// Returns the genotypes, wrapping everything in the buffer from the genotypes offset
    /// onward.
    pub fn genotypes(&self) -> Genotypes<'_> {
        let range = self.bounds.genotypes_range();
        Genotypes::new(&self.buf[range])
    }
}

impl Default for Record {
    /// Builds a placeholder record.
    ///
    /// The buffer stores the field values back to back, and the end offsets partition it
    /// into `sq0`, `1`, `.`, `A`, `.`, `.`, `.` and `.`.
    fn default() -> Self {
        Self {
            buf: String::from("sq01.A...."),
            bounds: Bounds {
                chromosome_end: 3,
                position_end: 4,
                ids_end: 5,
                reference_bases_end: 6,
                alternate_bases_end: 7,
                quality_score_end: 8,
                filters_end: 9,
                info_end: 10,
            },
        }
    }
}
50 changes: 50 additions & 0 deletions noodles-vcf/src/lazy/record/bounds.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use std::ops::{Range, RangeFrom};

/// End offsets of the fixed fields inside a lazy record's buffer.
///
/// Every field spans from the end offset of the field before it (or 0 for the first field)
/// up to, but not including, its own end offset. Everything from `info_end` onward is
/// treated as the genotypes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Bounds {
    /// Exclusive end offset of the chromosome field.
    pub chromosome_end: usize,
    /// Exclusive end offset of the position field.
    pub position_end: usize,
    /// Exclusive end offset of the IDs field.
    pub ids_end: usize,
    /// Exclusive end offset of the reference bases field.
    pub reference_bases_end: usize,
    /// Exclusive end offset of the alternate bases field.
    pub alternate_bases_end: usize,
    /// Exclusive end offset of the quality score field.
    pub quality_score_end: usize,
    /// Exclusive end offset of the filters field.
    pub filters_end: usize,
    /// Exclusive end offset of the info field.
    pub info_end: usize,
}

impl Bounds {
    /// Returns the range of the chromosome field, which starts at offset 0.
    pub fn chromosome_range(&self) -> Range<usize> {
        0..self.chromosome_end
    }

    /// Returns the range of the position field.
    pub fn position_range(&self) -> Range<usize> {
        self.chromosome_end..self.position_end
    }

    /// Returns the range of the IDs field.
    pub fn ids_range(&self) -> Range<usize> {
        self.position_end..self.ids_end
    }

    /// Returns the range of the reference bases field.
    pub fn reference_bases_range(&self) -> Range<usize> {
        self.ids_end..self.reference_bases_end
    }

    /// Returns the range of the alternate bases field.
    ///
    /// NOTE: the misspelled method name is kept as is because existing callers use it.
    pub fn alternatve_bases_range(&self) -> Range<usize> {
        self.reference_bases_end..self.alternate_bases_end
    }

    /// Returns the range of the quality score field.
    pub fn quality_score_range(&self) -> Range<usize> {
        self.alternate_bases_end..self.quality_score_end
    }

    /// Returns the range of the filters field.
    pub fn filters_range(&self) -> Range<usize> {
        self.quality_score_end..self.filters_end
    }

    /// Returns the range of the info field.
    ///
    /// The info field starts where the filters field ends and runs up to `info_end`.
    pub fn info_range(&self) -> Range<usize> {
        self.filters_end..self.info_end
    }

    /// Returns the open-ended range of the genotypes, i.e., everything after the info field.
    pub fn genotypes_range(&self) -> RangeFrom<usize> {
        self.info_end..
    }
}
113 changes: 113 additions & 0 deletions noodles-vcf/src/lazy/record/genotypes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
mod sample;

use std::{io, iter};

pub use self::sample::Sample;
use crate::record::{FIELD_DELIMITER, MISSING_FIELD};

/// The raw, unparsed genotypes of a VCF record.
#[derive(Debug, Eq, PartialEq)]
pub struct Genotypes<'a>(&'a str);

impl<'a> Genotypes<'a> {
    /// Wraps the given raw genotypes text without inspecting it.
    pub(super) fn new(buf: &'a str) -> Self {
        Genotypes(buf)
    }

    /// Returns whether the genotypes are empty.
    ///
    /// This is the case when the buffer is empty or when its first field, i.e., the text
    /// before the first field delimiter, is the missing field marker.
    pub fn is_empty(&self) -> bool {
        if self.0.is_empty() {
            return true;
        }

        self.0
            .split(FIELD_DELIMITER)
            .next()
            .map_or(false, |first| first == MISSING_FIELD)
    }

    /// Returns an iterator over keys.
    ///
    /// The keys are the `:`-separated parts of the text before the first field delimiter.
    /// Empty genotypes yield no keys.
    ///
    /// # Errors
    ///
    /// Returns an [`io::ErrorKind::InvalidData`] error when the genotypes are not empty but
    /// contain no field delimiter.
    pub fn keys(&self) -> io::Result<Box<dyn Iterator<Item = &str> + '_>> {
        const DELIMITER: char = ':';

        if self.is_empty() {
            return Ok(Box::new(iter::empty()));
        }

        let raw_format = match self.0.split_once(FIELD_DELIMITER) {
            Some((format, _)) => format,
            None => {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "missing field separator",
                ))
            }
        };

        Ok(Box::new(raw_format.split(DELIMITER)))
    }

    /// Returns an iterator over samples.
    ///
    /// The samples are the delimiter-separated fields after the first field. A sample that
    /// is exactly `.` is yielded as `None`. Empty genotypes yield no samples.
    ///
    /// # Errors
    ///
    /// Returns an [`io::ErrorKind::InvalidData`] error when the genotypes are not empty but
    /// contain no field delimiter.
    pub fn samples(&self) -> io::Result<Box<dyn Iterator<Item = Option<Sample<'_>>> + '_>> {
        if self.is_empty() {
            return Ok(Box::new(iter::empty()));
        }

        let raw_samples = match self.0.split_once(FIELD_DELIMITER) {
            Some((_, rest)) => rest,
            None => {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "missing field separator",
                ))
            }
        };

        let samples = raw_samples.split(FIELD_DELIMITER).map(|raw| {
            if raw == "." {
                None
            } else {
                Some(Sample::new(raw))
            }
        });

        Ok(Box::new(samples))
    }
}

impl AsRef<str> for Genotypes<'_> {
    /// Returns the wrapped raw genotypes text.
    fn as_ref(&self) -> &str {
        self.0
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Both an empty buffer and a leading missing field count as empty.
    #[test]
    fn test_is_empty() {
        let empty = Genotypes::new("");
        assert!(empty.is_empty());

        let missing = Genotypes::new(".\t.");
        assert!(missing.is_empty());

        let populated = Genotypes::new("GT:GQ\t0|0:13");
        assert!(!populated.is_empty());
    }

    // Empty genotypes have no keys; otherwise the first field is split on `:`.
    #[test]
    fn test_keys() -> io::Result<()> {
        let empty = Genotypes::new("");
        let mut keys = empty.keys()?;
        assert!(keys.next().is_none());

        let missing = Genotypes::new(".\t.");
        let mut keys = missing.keys()?;
        assert!(keys.next().is_none());

        let populated = Genotypes::new("GT:GQ\t0|0:13");
        let keys: Vec<_> = populated.keys()?.collect();
        assert_eq!(keys, ["GT", "GQ"]);

        Ok(())
    }

    // Empty genotypes have no samples; a `.` sample is reported as `None`.
    #[test]
    fn test_samples() -> io::Result<()> {
        let empty = Genotypes::new("");
        let mut samples = empty.samples()?;
        assert!(samples.next().is_none());

        let missing = Genotypes::new(".\t.");
        let mut samples = missing.samples()?;
        assert!(samples.next().is_none());

        let populated = Genotypes::new("GT:GQ\t0|0:13\t.");
        let samples: Vec<_> = populated.samples()?.collect();
        assert_eq!(samples, [Some(Sample::new("0|0:13")), None]);

        Ok(())
    }
}
47 changes: 47 additions & 0 deletions noodles-vcf/src/lazy/record/genotypes/sample.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
use std::iter;

/// The raw, unparsed text of a single sample in a VCF record's genotypes.
#[derive(Debug, Eq, PartialEq)]
pub struct Sample<'a>(&'a str);

impl<'a> Sample<'a> {
    /// Wraps the given raw sample text without inspecting it.
    pub(super) fn new(buf: &'a str) -> Self {
        Sample(buf)
    }

    /// Returns an iterator over the sample's values.
    ///
    /// The values are the `:`-separated parts of the sample. A value that is exactly `.`
    /// is yielded as `None`. A sample that is exactly `.` yields no values at all.
    pub fn values(&self) -> Box<dyn Iterator<Item = Option<&str>> + '_> {
        const MISSING: &str = ".";
        const DELIMITER: char = ':';

        if self.0 == MISSING {
            Box::new(iter::empty())
        } else {
            Box::new(
                self.0
                    .split(DELIMITER)
                    .map(|raw| Some(raw).filter(|value| *value != MISSING)),
            )
        }
    }
}

impl AsRef<str> for Sample<'_> {
    /// Returns the wrapped raw sample text.
    fn as_ref(&self) -> &str {
        self.0
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // A `.` sample has no values; a `.` value inside a sample is reported as `None`.
    #[test]
    fn test_values() {
        let missing = Sample::new(".");
        let mut values = missing.values();
        assert!(values.next().is_none());

        let populated = Sample::new("0|0:.");
        let values: Vec<_> = populated.values().collect();
        assert_eq!(values, [Some("0|0"), None]);
    }
}
1 change: 1 addition & 0 deletions noodles-vcf/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ mod r#async;

pub mod header;
pub mod indexed_reader;
pub mod lazy;
pub mod reader;
pub mod record;
mod variant_reader;
Expand Down
Loading

0 comments on commit 24cade2

Please sign in to comment.