From 44b6ded1340d8f3c1bfece18e871ed3be4c394d6 Mon Sep 17 00:00:00 2001 From: Eric Fredine Date: Thu, 10 Oct 2024 13:54:46 -0700 Subject: [PATCH] Adds documentation and example recommending Vec<ArrayRef> over ChunkedArray (#6527) * Adds documentation and example recommending Vec<ArrayRef> as an alternative to a ChunkedArray abstraction. * Remove link to example. * Reduce width of doc example * Move documentation to arrow-array. Simplify doc example. Remove top-level example. * Update arrow-array/src/lib.rs --------- Co-authored-by: Eric Fredine Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-array/src/lib.rs | 47 ++++++++++++++++++++++++++++++++++++++++ arrow/examples/README.md | 2 +- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 90bc5e31205a..0fc9d30ab6e3 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -161,7 +161,52 @@ //! array.as_primitive::<Float32Type>().values() //! } //! ``` +//! # Alternatives to ChunkedArray Support //! +//! The Rust implementation does not provide the ChunkedArray abstraction implemented by the Python +//! and C++ Arrow implementations. The recommended alternative is to use one of the following: +//! - `Vec<ArrayRef>` a simple, eager version of a `ChunkedArray` +//! - `impl Iterator<Item=ArrayRef>` a lazy version of a `ChunkedArray` +//! - `impl Stream<Item=ArrayRef>` a lazy async version of a `ChunkedArray` +//! +//! Similar patterns can be applied at the `RecordBatch` level. For example, [DataFusion] makes +//! extensive use of [RecordBatchStream]. +//! +//! This approach integrates well into the Rust ecosystem, simplifies the implementation and +//! encourages the use of performant lazy and async patterns. +//! ```rust +//! use std::sync::Arc; +//! use arrow_array::{ArrayRef, Float32Array, RecordBatch, StringArray}; +//! use arrow_array::cast::AsArray; +//! use arrow_array::types::Float32Type; +//! use arrow_schema::DataType; +//! +//! let batches = [ +//! 
RecordBatch::try_from_iter(vec![ +//! ("label", Arc::new(StringArray::from(vec!["A", "B", "C"])) as ArrayRef), +//! ("value", Arc::new(Float32Array::from(vec![0.1, 0.2, 0.3])) as ArrayRef), +//! ]).unwrap(), +//! RecordBatch::try_from_iter(vec![ +//! ("label", Arc::new(StringArray::from(vec!["D", "E"])) as ArrayRef), +//! ("value", Arc::new(Float32Array::from(vec![0.4, 0.5])) as ArrayRef), +//! ]).unwrap(), +//! ]; +//! +//! let labels: Vec<&str> = batches +//! .iter() +//! .flat_map(|batch| batch.column(0).as_string::<i32>()) +//! .map(Option::unwrap) +//! .collect(); +//! +//! let values: Vec<f32> = batches +//! .iter() +//! .flat_map(|batch| batch.column(1).as_primitive::<Float32Type>().values()) +//! .copied() +//! .collect(); +//! +//! assert_eq!(labels, ["A", "B", "C", "D", "E"]); +//! assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]); +//!``` //! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer //! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer //! [`OffsetBuffer`]: arrow_buffer::OffsetBuffer @@ -173,6 +218,8 @@ //! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html //! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html //! [`csv`]: https://docs.rs/arrow/latest/arrow/csv/index.html +//! [DataFusion]: https://github.com/apache/arrow-datafusion +//! 
[RecordBatchStream]: https://docs.rs/datafusion/latest/datafusion/execution/trait.RecordBatchStream.html #![deny(rustdoc::broken_intra_doc_links)] #![warn(missing_docs)] diff --git a/arrow/examples/README.md b/arrow/examples/README.md index 5c57ec00cd76..87aa6ee0475b 100644 --- a/arrow/examples/README.md +++ b/arrow/examples/README.md @@ -21,7 +21,7 @@ - [`builders.rs`](builders.rs): Using the Builder API - [`collect.rs`](collect.rs): Using the `FromIter` API -- [`dynamic_types.rs`](dynamic_types.rs): +- [`dynamic_types.rs`](dynamic_types.rs): Dealing with mixed types dynamically at runtime - [`read_csv.rs`](read_csv.rs): Reading CSV files with explicit schema, pretty printing Arrays - [`read_csv_infer_schema.rs`](read_csv_infer_schema.rs): Reading CSV files, pretty printing Arrays - [`tensor_builder.rs`](tensor_builder.rs): Using tensor builder