From 24cade2b2990e1106587eacfe56f8583634ddc18 Mon Sep 17 00:00:00 2001 From: Michael Macias Date: Wed, 16 Aug 2023 15:32:15 -0500 Subject: [PATCH] vcf/lazy: Add a lazy record Lazy records are variant records that are lazily-evaluated. Their fields are not necessarily valid, but the buffer is guaranteed to be record-like. --- noodles-vcf/CHANGELOG.md | 7 ++ noodles-vcf/src/lazy.rs | 5 + noodles-vcf/src/lazy/record.rs | 80 +++++++++++++ noodles-vcf/src/lazy/record/bounds.rs | 50 ++++++++ noodles-vcf/src/lazy/record/genotypes.rs | 113 ++++++++++++++++++ .../src/lazy/record/genotypes/sample.rs | 47 ++++++++ noodles-vcf/src/lib.rs | 1 + noodles-vcf/src/reader.rs | 109 ++++++++++++++++- 8 files changed, 411 insertions(+), 1 deletion(-) create mode 100644 noodles-vcf/src/lazy.rs create mode 100644 noodles-vcf/src/lazy/record.rs create mode 100644 noodles-vcf/src/lazy/record/bounds.rs create mode 100644 noodles-vcf/src/lazy/record/genotypes.rs create mode 100644 noodles-vcf/src/lazy/record/genotypes/sample.rs diff --git a/noodles-vcf/CHANGELOG.md b/noodles-vcf/CHANGELOG.md index bf6181daf..8a887eb66 100644 --- a/noodles-vcf/CHANGELOG.md +++ b/noodles-vcf/CHANGELOG.md @@ -9,6 +9,13 @@ * vcf/header/record/value/map/contig/builder: Add URL setter (`Builder::set_url`). + * vcf/lazy: Add a lazy record (`lazy::Record`). + + Lazy records are variant records that are lazily-evaluated. Their fields + are not necessarily valid, but the buffer is guaranteed to be record-like. + + * vcf/reader: Add `Reader::read_lazy_record` to read lazy records. + * vcf/record/position: Implement `PartialEq` and `PartialOrd` for `Position` ([#191]). diff --git a/noodles-vcf/src/lazy.rs b/noodles-vcf/src/lazy.rs new file mode 100644 index 000000000..669e43d1a --- /dev/null +++ b/noodles-vcf/src/lazy.rs @@ -0,0 +1,5 @@ +//! Lazily-evaluated VCF record. 
+ +pub mod record; + +pub use self::record::Record; diff --git a/noodles-vcf/src/lazy/record.rs b/noodles-vcf/src/lazy/record.rs new file mode 100644 index 000000000..5c56e97c7 --- /dev/null +++ b/noodles-vcf/src/lazy/record.rs @@ -0,0 +1,80 @@ +//! Lazily-evaluated VCF record and fields. + +mod bounds; +mod genotypes; + +use self::bounds::Bounds; +pub use self::genotypes::Genotypes; + +/// An immutable, lazily-evaluated VCF record. +pub struct Record { + pub(crate) buf: String, + pub(crate) bounds: Bounds, +} + +impl Record { + /// Returns the chromosome. + pub fn chromosome(&self) -> &str { + &self.buf[self.bounds.chromosome_range()] + } + + /// Returns the position. + pub fn position(&self) -> &str { + &self.buf[self.bounds.position_range()] + } + + /// Returns the IDs. + pub fn ids(&self) -> &str { + &self.buf[self.bounds.ids_range()] + } + + /// Returns the reference bases. + pub fn reference_bases(&self) -> &str { + &self.buf[self.bounds.reference_bases_range()] + } + + /// Returns the alternate bases. + pub fn alternate_bases(&self) -> &str { + &self.buf[self.bounds.alternatve_bases_range()] + } + + /// Returns the quality score. + pub fn quality_score(&self) -> &str { + &self.buf[self.bounds.quality_score_range()] + } + + /// Returns the filters. + pub fn filters(&self) -> &str { + &self.buf[self.bounds.filters_range()] + } + + /// Returns the info. + pub fn info(&self) -> &str { + &self.buf[self.bounds.info_range()] + } + + /// Returns the genotypes. 
+ pub fn genotypes(&self) -> Genotypes<'_> { + let buf = &self.buf[self.bounds.genotypes_range()]; + Genotypes::new(buf) + } +} + +impl Default for Record { + fn default() -> Self { + let buf = String::from("sq01.A...."); + + let bounds = Bounds { + chromosome_end: 3, + position_end: 4, + ids_end: 5, + reference_bases_end: 6, + alternate_bases_end: 7, + quality_score_end: 8, + filters_end: 9, + info_end: 10, + }; + + Self { buf, bounds } + } +} diff --git a/noodles-vcf/src/lazy/record/bounds.rs b/noodles-vcf/src/lazy/record/bounds.rs new file mode 100644 index 000000000..26290fa38 --- /dev/null +++ b/noodles-vcf/src/lazy/record/bounds.rs @@ -0,0 +1,50 @@ +use std::ops::{Range, RangeFrom}; + +pub struct Bounds { + pub chromosome_end: usize, + pub position_end: usize, + pub ids_end: usize, + pub reference_bases_end: usize, + pub alternate_bases_end: usize, + pub quality_score_end: usize, + pub filters_end: usize, + pub info_end: usize, +} + +impl Bounds { + pub fn chromosome_range(&self) -> Range { + 0..self.chromosome_end + } + + pub fn position_range(&self) -> Range { + self.chromosome_end..self.position_end + } + + pub fn ids_range(&self) -> Range { + self.position_end..self.ids_end + } + + pub fn reference_bases_range(&self) -> Range { + self.ids_end..self.reference_bases_end + } + + pub fn alternatve_bases_range(&self) -> Range { + self.reference_bases_end..self.alternate_bases_end + } + + pub fn quality_score_range(&self) -> Range { + self.alternate_bases_end..self.quality_score_end + } + + pub fn filters_range(&self) -> Range { + self.quality_score_end..self.filters_end + } + + pub fn info_range(&self) -> Range { + self.filters_end..self.info_end + } + + pub fn genotypes_range(&self) -> RangeFrom { + self.info_end..
+ } +} diff --git a/noodles-vcf/src/lazy/record/genotypes.rs b/noodles-vcf/src/lazy/record/genotypes.rs new file mode 100644 index 000000000..9cc092982 --- /dev/null +++ b/noodles-vcf/src/lazy/record/genotypes.rs @@ -0,0 +1,113 @@ +mod sample; + +use std::{io, iter}; + +pub use self::sample::Sample; +use crate::record::{FIELD_DELIMITER, MISSING_FIELD}; + +/// Raw VCF record genotypes. +#[derive(Debug, Eq, PartialEq)] +pub struct Genotypes<'a>(&'a str); + +impl<'a> Genotypes<'a> { + pub(super) fn new(buf: &'a str) -> Self { + Self(buf) + } + + /// Returns whether there may be any genotypes. + pub fn is_empty(&self) -> bool { + let is_missing = self + .0 + .split(FIELD_DELIMITER) + .next() + .map(|s| s == MISSING_FIELD) + .unwrap_or_default(); + + self.0.is_empty() || is_missing + } + + /// Returns an iterator over keys. + pub fn keys(&self) -> io::Result + '_>> { + const DELIMITER: char = ':'; + + if self.is_empty() { + return Ok(Box::new(iter::empty())); + } + + let (raw_format, _) = self + .0 + .split_once(FIELD_DELIMITER) + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "missing field separator"))?; + + Ok(Box::new(raw_format.split(DELIMITER))) + } + + /// Returns an iterator over samples. + pub fn samples(&self) -> io::Result>> + '_>> { + if self.is_empty() { + return Ok(Box::new(iter::empty())); + } + + let (_, raw_samples) = self + .0 + .split_once(FIELD_DELIMITER) + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "missing field separator"))?; + + Ok(Box::new(raw_samples.split(FIELD_DELIMITER).map( + |s| match s { + "." 
=> None, + _ => Some(Sample::new(s)), + }, + ))) + } +} + +impl<'a> AsRef for Genotypes<'a> { + fn as_ref(&self) -> &str { + self.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_is_empty() { + assert!(Genotypes::new("").is_empty()); + assert!(Genotypes::new(".\t.").is_empty()); + assert!(!Genotypes::new("GT:GQ\t0|0:13").is_empty()); + } + + #[test] + fn test_keys() -> io::Result<()> { + let genotypes = Genotypes::new(""); + assert!(genotypes.keys()?.next().is_none()); + + let genotypes = Genotypes::new(".\t."); + assert!(genotypes.keys()?.next().is_none()); + + let genotypes = Genotypes::new("GT:GQ\t0|0:13"); + let actual: Vec<_> = genotypes.keys()?.collect(); + let expected = ["GT", "GQ"]; + assert_eq!(actual, expected); + + Ok(()) + } + + #[test] + fn test_samples() -> io::Result<()> { + let genotypes = Genotypes::new(""); + assert!(genotypes.samples()?.next().is_none()); + + let genotypes = Genotypes::new(".\t."); + assert!(genotypes.samples()?.next().is_none()); + + let genotypes = Genotypes::new("GT:GQ\t0|0:13\t."); + let actual: Vec<_> = genotypes.samples()?.collect(); + let expected = [Some(Sample::new("0|0:13")), None]; + assert_eq!(actual, expected); + + Ok(()) + } +} diff --git a/noodles-vcf/src/lazy/record/genotypes/sample.rs b/noodles-vcf/src/lazy/record/genotypes/sample.rs new file mode 100644 index 000000000..ca31eac83 --- /dev/null +++ b/noodles-vcf/src/lazy/record/genotypes/sample.rs @@ -0,0 +1,47 @@ +use std::iter; + +/// A raw VCF record genotypes sample. +#[derive(Debug, Eq, PartialEq)] +pub struct Sample<'a>(&'a str); + +impl<'a> Sample<'a> { + pub(super) fn new(buf: &'a str) -> Self { + Self(buf) + } + + pub fn values(&self) -> Box> + '_> { + const MISSING: &str = "."; + const DELIMITER: char = ':'; + + if self.0 == MISSING { + return Box::new(iter::empty()); + } + + Box::new(self.0.split(DELIMITER).map(|s| match s { + "." 
=> None, + _ => Some(s), + })) + } +} + +impl<'a> AsRef for Sample<'a> { + fn as_ref(&self) -> &str { + self.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_values() { + let sample = Sample::new("."); + assert!(sample.values().next().is_none()); + + let sample = Sample::new("0|0:."); + let actual: Vec<_> = sample.values().collect(); + let expected = [Some("0|0"), None]; + assert_eq!(actual, expected); + } +} diff --git a/noodles-vcf/src/lib.rs b/noodles-vcf/src/lib.rs index b7b8c9279..31a415212 100644 --- a/noodles-vcf/src/lib.rs +++ b/noodles-vcf/src/lib.rs @@ -24,6 +24,7 @@ mod r#async; pub mod header; pub mod indexed_reader; +pub mod lazy; pub mod reader; pub mod record; mod variant_reader; diff --git a/noodles-vcf/src/reader.rs b/noodles-vcf/src/reader.rs index 092f6b870..a59b57348 100644 --- a/noodles-vcf/src/reader.rs +++ b/noodles-vcf/src/reader.rs @@ -6,10 +6,15 @@ pub(crate) mod query; pub mod record; mod records; +use crate::lazy; + pub(crate) use self::record::parse_record; pub use self::{builder::Builder, query::Query, records::Records}; -use std::io::{self, BufRead, Read, Seek}; +use std::{ + io::{self, BufRead, Read, Seek}, + str, +}; use noodles_bgzf as bgzf; use noodles_core::Region; @@ -209,6 +214,38 @@ where pub fn records<'r, 'h: 'r>(&'r mut self, header: &'h Header) -> Records<'r, 'h, R> { Records::new(self, header) } + + /// Reads a single record without eagerly parsing its fields. + /// + /// This reads VCF record fields from the underlying stream into the given record's buffer until + /// a newline is reached. No fields are parsed, meaning the record is not necessarily valid. + /// However, the structure of the line is guaranteed to be record-like. + /// + /// The stream is expected to be directly after the header or at the start of another record. + /// + /// If successful, the number of bytes read is returned. If the number of bytes read is 0, the + /// stream reached EOF.
+ /// + /// # Examples + /// + /// ``` + /// use noodles_vcf as vcf; + /// + /// let data = b"##fileformat=VCFv4.3 + /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO + /// sq0\t1\t.\tA\t.\t.\tPASS\t. + /// "; + /// + /// let mut reader = vcf::Reader::new(&data[..]); + /// reader.read_header()?; + /// + /// let mut record = vcf::lazy::Record::default(); + /// reader.read_lazy_record(&mut record)?; + /// # Ok::<_, std::io::Error>(()) + /// ``` + pub fn read_lazy_record(&mut self, record: &mut lazy::Record) -> io::Result { + read_lazy_record(&mut self.inner, record) + } } impl Reader> @@ -352,6 +389,79 @@ where } } +fn read_lazy_record(reader: &mut R, record: &mut lazy::Record) -> io::Result +where + R: BufRead, +{ + // Reset the buffer: fields are appended, and each bound is taken from `buf.len()`. + record.buf.clear(); + + let mut len = 0; + + len += read_field(reader, &mut record.buf)?; + record.bounds.chromosome_end = record.buf.len(); + + len += read_field(reader, &mut record.buf)?; + record.bounds.position_end = record.buf.len(); + + len += read_field(reader, &mut record.buf)?; + record.bounds.ids_end = record.buf.len(); + + len += read_field(reader, &mut record.buf)?; + record.bounds.reference_bases_end = record.buf.len(); + + len += read_field(reader, &mut record.buf)?; + record.bounds.alternate_bases_end = record.buf.len(); + + len += read_field(reader, &mut record.buf)?; + record.bounds.quality_score_end = record.buf.len(); + + len += read_field(reader, &mut record.buf)?; + record.bounds.filters_end = record.buf.len(); + + len += read_field(reader, &mut record.buf)?; + record.bounds.info_end = record.buf.len(); + + len += read_line(reader, &mut record.buf)?; + + Ok(len) +} + +fn read_field(reader: &mut R, dst: &mut String) -> io::Result +where + R: BufRead, +{ + const DELIMITER: u8 = b'\t'; + + let mut is_delimiter = false; + let mut len = 0; + + loop { + let src = reader.fill_buf()?; + + if is_delimiter || src.is_empty() { + break; + } + + let (buf, n) = match src.iter().position(|&b| b == DELIMITER) { + Some(i) => { + is_delimiter = true; + (&src[..i], i + 1) + }
+ None => (src, src.len()), + }; + + let s = str::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + dst.push_str(s); + + len += n; + + reader.consume(n); + } + + Ok(len) +} + pub(crate) fn resolve_region(index: &csi::Index, region: &Region) -> io::Result<(usize, String)> { let header = index .header()