From c4ddad18c2037fe3455b2a73892ba4f4305fefae Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 12 Oct 2021 10:07:04 -0400 Subject: [PATCH] ARROW-14025: [R][C++] PreBuffer is not enabled when scanning parquet via exec nodes --- r/R/dataset-format.R | 2 +- r/src/dataset.cpp | 4 ++++ r/tests/testthat/test-dataset-csv.R | 11 +++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R index 983b6f614a720..2e1c673274261 100644 --- a/r/R/dataset-format.R +++ b/r/R/dataset-format.R @@ -291,7 +291,7 @@ CsvFragmentScanOptions$create <- function(..., ParquetFragmentScanOptions <- R6Class("ParquetFragmentScanOptions", inherit = FragmentScanOptions) ParquetFragmentScanOptions$create <- function(use_buffered_stream = FALSE, buffer_size = 8196, - pre_buffer = FALSE) { + pre_buffer = TRUE) { dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, pre_buffer) } diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index b92e3ad427626..1fc87ec140075 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -335,6 +335,10 @@ dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buf } options->reader_properties->set_buffer_size(buffer_size); options->arrow_reader_properties->set_pre_buffer(pre_buffer); + if (pre_buffer) { + options->arrow_reader_properties->set_cache_options( + arrow::io::CacheOptions::LazyDefaults()); + } return options; } diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index 0294128035b47..8d14085332bf5 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -108,6 +108,17 @@ test_that("CSV scan options", { chr = c("foo", NA), chr2 = c("bar", "baz") )) + expect_equal( + ds %>% + group_by(chr2) %>% + summarize(na = all(is.na(chr))) %>% + arrange(chr2) %>% + collect(), + tibble( + chr2 = c("bar", "baz"), + na = c(FALSE, TRUE) + ) + ) }) test_that("compressed CSV dataset", {