Skip to content

Commit

Permalink
perf: Use two-pass algorithm for csv to ensure correctness and SIMDize more `~17%` (pola-rs#19088)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Oct 5, 2024
1 parent 60a6465 commit f7de80c
Show file tree
Hide file tree
Showing 13 changed files with 656 additions and 451 deletions.
11 changes: 1 addition & 10 deletions crates/polars-io/src/csv/read/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ pub struct CsvReadOptions {
// CSV-specific options
pub parse_options: Arc<CsvParseOptions>,
pub has_header: bool,
- pub sample_size: usize,
pub chunk_size: usize,
pub skip_rows: usize,
pub skip_rows_after_header: usize,
Expand Down Expand Up @@ -60,7 +59,7 @@ impl Default for CsvReadOptions {
Self {
path: None,

- rechunk: true,
+ rechunk: false,
n_threads: None,
low_memory: false,

Expand All @@ -75,7 +74,6 @@ impl Default for CsvReadOptions {

parse_options: Default::default(),
has_header: true,
- sample_size: 1024,
chunk_size: 1 << 18,
skip_rows: 0,
skip_rows_after_header: 0,
Expand Down Expand Up @@ -193,13 +191,6 @@ impl CsvReadOptions {
self
}

- /// Sets the number of rows sampled from the file to determine approximately
- /// how much memory to use for the initial allocation.
- pub fn with_sample_size(mut self, sample_size: usize) -> Self {
- self.sample_size = sample_size;
- self
- }

/// Sets the chunk size used by the parser. This influences performance.
pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
self.chunk_size = chunk_size;
Expand Down
Loading

0 comments on commit f7de80c

Please sign in to comment.