Skip to content

Commit

Permalink
fix: Lazy csv + projection; respect null values arg (#16077)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored May 6, 2024
1 parent 3600b16 commit 575e917
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 20 deletions.
11 changes: 0 additions & 11 deletions crates/polars-io/src/csv/read/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,17 +133,6 @@ pub(super) enum NullValuesCompiled {
}

impl NullValuesCompiled {
/// Re-aligns the per-column null values with a column projection.
///
/// When `self` is the `Columns` variant, the stored list is rebuilt so that entry `k`
/// holds the value previously stored at index `projections[k]`. Any other variant is
/// left untouched.
///
/// # Panics
///
/// Panics if an index in `projections` is out of bounds for the stored list.
pub(super) fn apply_projection(&mut self, projections: &[usize]) {
    if let Self::Columns(per_column) = self {
        let mut projected = Vec::with_capacity(projections.len());
        for &col_idx in projections {
            // `take` moves the entry out and leaves its `Default` behind, so an index
            // that appears more than once yields the default value on later visits.
            projected.push(std::mem::take(&mut per_column[col_idx]));
        }

        *self = NullValuesCompiled::Columns(projected);
    }
}

/// # Safety
///
/// The caller must ensure that `index` is in bounds
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-io/src/csv/read/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ pub(super) fn parse_lines(

// SAFETY:
// process fields is in bounds
add_null = unsafe { null_values.is_null(field, processed_fields) }
add_null = unsafe { null_values.is_null(field, idx as usize) }
}
if add_null {
buf.add_null(!missing_is_null && field.is_empty())
Expand Down
10 changes: 2 additions & 8 deletions crates/polars-io/src/csv/read/read_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,21 +224,15 @@ impl<'a> CoreReader<'a> {
}
}

// create a null value for every column
let mut null_values = null_values.map(|nv| nv.compile(&schema)).transpose()?;
// Create a null value for every column
let null_values = null_values.map(|nv| nv.compile(&schema)).transpose()?;

if let Some(cols) = columns {
let mut prj = Vec::with_capacity(cols.len());
for col in cols {
let i = schema.try_index_of(&col)?;
prj.push(i);
}

// update null values with projection
if let Some(nv) = null_values.as_mut() {
nv.apply_projection(&prj);
}

projection = Some(prj);
}

Expand Down
23 changes: 23 additions & 0 deletions py-polars/tests/unit/io/test_lazy_csv.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import tempfile
from collections import OrderedDict
from typing import TYPE_CHECKING

Expand Down Expand Up @@ -285,3 +286,25 @@ def test_scan_empty_csv_with_row_index(tmp_path: Path) -> None:

read = pl.scan_csv(file_path).with_row_index("idx")
assert read.collect().schema == OrderedDict([("idx", pl.UInt32), ("a", pl.String)])


# Checks that a per-column `null_values` mapping given to `scan_csv` is still honoured
# when the lazy query only selects a subset of the CSV columns.
# NOTE(review): the `15515` suffix presumably refers to the originating issue - confirm.
@pytest.mark.write_disk()
def test_csv_null_values_with_projection_15515() -> None:
# One header row plus one data row; the "SireCode" field contains ".", which is the
# value registered as null for that column below.
data = """IndCode,SireCode,BirthDate,Flag
ID00316,.,19940315,
"""

with tempfile.NamedTemporaryFile() as f:
f.write(data.encode())
# Rewind after writing; the file is subsequently scanned via its name (`f.name`).
f.seek(0)

# Only "SireCode" gets a null marker; the query then aliases it to "SireKey" and
# selects just "SireKey" and "BirthDate", leaving out "IndCode" and "Flag".
q = (
pl.scan_csv(f.name, null_values={"SireCode": "."})
.with_columns(pl.col("SireCode").alias("SireKey"))
.select("SireKey", "BirthDate")
)

# "." must come back as null in the aliased column, while "BirthDate" keeps its
# integer value.
assert q.collect().to_dict(as_series=False) == {
"SireKey": [None],
"BirthDate": [19940315],
}

0 comments on commit 575e917

Please sign in to comment.