feat: Add allow_missing_columns option to read/scan_parquet (#18922)

pola-rs · Sep 27, 2024 · 13e9717 · 13e9717
1 parent 79fcd53
commit 13e9717
Show file tree

Hide file tree

Showing 23 changed files with 256 additions and 59 deletions.
diff --git a/crates/polars-io/src/parquet/read/async_impl.rs b/crates/polars-io/src/parquet/read/async_impl.rs
@@ -178,12 +178,15 @@ async fn download_projection(
     let mut offsets = Vec::with_capacity(fields.len());
     fields.iter().for_each(|name| {
         // A single column can have multiple matches (structs).
-        let iter = row_group.columns_under_root_iter(name).map(|meta| {
-            let byte_range = meta.byte_range();
-            let offset = byte_range.start;
-            let byte_range = byte_range.start as usize..byte_range.end as usize;
-            (offset, byte_range)
-        });
+        let iter = row_group
+            .columns_under_root_iter(name)
+            .unwrap()
+            .map(|meta| {
+                let byte_range = meta.byte_range();
+                let offset = byte_range.start;
+                let byte_range = byte_range.start as usize..byte_range.end as usize;
+                (offset, byte_range)
+            });
 
         for (offset, range) in iter {
             offsets.push(offset);

diff --git a/crates/polars-io/src/parquet/read/predicates.rs b/crates/polars-io/src/parquet/read/predicates.rs
@@ -24,7 +24,7 @@ pub(crate) fn collect_statistics(
     let stats = schema
         .iter_values()
         .map(|field| {
-            let iter = md.columns_under_root_iter(&field.name);
+            let iter = md.columns_under_root_iter(&field.name).unwrap();
 
             Ok(if iter.len() == 0 {
                 ColumnStats::new(field.into(), None, None, None)

diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs
@@ -326,12 +326,19 @@ fn rg_to_dfs_prefiltered(
                     .map(|i| {
                         let col_idx = live_idx_to_col_idx[i];
 
-                        let name = schema.get_at_index(col_idx).unwrap().0;
-                        let field_md = file_metadata.row_groups[rg_idx]
-                            .columns_under_root_iter(name)
-                            .collect::<Vec<_>>();
+                        let (name, field) = schema.get_at_index(col_idx).unwrap();
+
+                        let Some(iter) = md.columns_under_root_iter(name) else {
+                            return Ok(Column::full_null(
+                                name.clone(),
+                                md.num_rows(),
+                                &DataType::from_arrow(&field.dtype, true),
+                            ));
+                        };
+
+                        let part = iter.collect::<Vec<_>>();
 
-                        column_idx_to_series(col_idx, field_md.as_slice(), None, schema, store)
+                        column_idx_to_series(col_idx, part.as_slice(), None, schema, store)
                             .map(Column::from)
                     })
                     .collect::<PolarsResult<Vec<_>>>()?;
@@ -384,20 +391,30 @@ fn rg_to_dfs_prefiltered(
                     .then(|| calc_prefilter_cost(&filter_mask))
                     .unwrap_or_default();
 
+                #[cfg(debug_assertions)]
+                {
+                    let md = &file_metadata.row_groups[rg_idx];
+                    debug_assert_eq!(md.num_rows(), mask.len());
+                }
+
+                let n_rows_in_result = filter_mask.set_bits();
+
                 let mut dead_columns = (0..num_dead_columns)
                     .into_par_iter()
                     .map(|i| {
                         let col_idx = dead_idx_to_col_idx[i];
-                        let name = schema.get_at_index(col_idx).unwrap().0;
 
-                        #[cfg(debug_assertions)]
-                        {
-                            let md = &file_metadata.row_groups[rg_idx];
-                            debug_assert_eq!(md.num_rows(), mask.len());
-                        }
-                        let field_md = file_metadata.row_groups[rg_idx]
-                            .columns_under_root_iter(name)
-                            .collect::<Vec<_>>();
+                        let (name, field) = schema.get_at_index(col_idx).unwrap();
+
+                        let Some(iter) = md.columns_under_root_iter(name) else {
+                            return Ok(Column::full_null(
+                                name.clone(),
+                                n_rows_in_result,
+                                &DataType::from_arrow(&field.dtype, true),
+                            ));
+                        };
+
+                        let field_md = iter.collect::<Vec<_>>();
 
                         let pre = || {
                             column_idx_to_series(
@@ -556,8 +573,17 @@ fn rg_to_dfs_optionally_par_over_columns(
                 projection
                     .par_iter()
                     .map(|column_i| {
-                        let name = schema.get_at_index(*column_i).unwrap().0;
-                        let part = md.columns_under_root_iter(name).collect::<Vec<_>>();
+                        let (name, field) = schema.get_at_index(*column_i).unwrap();
+
+                        let Some(iter) = md.columns_under_root_iter(name) else {
+                            return Ok(Column::full_null(
+                                name.clone(),
+                                rg_slice.1,
+                                &DataType::from_arrow(&field.dtype, true),
+                            ));
+                        };
+
+                        let part = iter.collect::<Vec<_>>();
 
                         column_idx_to_series(
                             *column_i,
@@ -574,8 +600,17 @@ fn rg_to_dfs_optionally_par_over_columns(
             projection
                 .iter()
                 .map(|column_i| {
-                    let name = schema.get_at_index(*column_i).unwrap().0;
-                    let part = md.columns_under_root_iter(name).collect::<Vec<_>>();
+                    let (name, field) = schema.get_at_index(*column_i).unwrap();
+
+                    let Some(iter) = md.columns_under_root_iter(name) else {
+                        return Ok(Column::full_null(
+                            name.clone(),
+                            rg_slice.1,
+                            &DataType::from_arrow(&field.dtype, true),
+                        ));
+                    };
+
+                    let part = iter.collect::<Vec<_>>();
 
                     column_idx_to_series(
                         *column_i,
@@ -672,12 +707,21 @@ fn rg_to_dfs_par_over_rg(
                 let columns = projection
                     .iter()
                     .map(|column_i| {
-                        let name = schema.get_at_index(*column_i).unwrap().0;
-                        let field_md = md.columns_under_root_iter(name).collect::<Vec<_>>();
+                        let (name, field) = schema.get_at_index(*column_i).unwrap();
+
+                        let Some(iter) = md.columns_under_root_iter(name) else {
+                            return Ok(Column::full_null(
+                                name.clone(),
+                                md.num_rows(),
+                                &DataType::from_arrow(&field.dtype, true),
+                            ));
+                        };
+
+                        let part = iter.collect::<Vec<_>>();
 
                         column_idx_to_series(
                             *column_i,
-                            field_md.as_slice(),
+                            part.as_slice(),
                             Some(Filter::new_ranged(slice.0, slice.0 + slice.1)),
                             schema,
                             store,

diff --git a/crates/polars-io/src/parquet/read/reader.rs b/crates/polars-io/src/parquet/read/reader.rs
@@ -85,9 +85,14 @@ impl<R: MmapBytesReader> ParquetReader<R> {
     /// dtype, and sets the projection indices.
     pub fn with_arrow_schema_projection(
         mut self,
-        first_schema: &ArrowSchema,
+        first_schema: &Arc<ArrowSchema>,
         projected_arrow_schema: Option<&ArrowSchema>,
+        allow_missing_columns: bool,
     ) -> PolarsResult<Self> {
+        if allow_missing_columns {
+            self.schema.replace(first_schema.clone());
+        }
+
         let schema = self.schema()?;
 
         if let Some(projected_arrow_schema) = projected_arrow_schema {
@@ -301,9 +306,14 @@ impl ParquetAsyncReader {
 
     pub async fn with_arrow_schema_projection(
         mut self,
-        first_schema: &ArrowSchema,
+        first_schema: &Arc<ArrowSchema>,
         projected_arrow_schema: Option<&ArrowSchema>,
+        allow_missing_columns: bool,
     ) -> PolarsResult<Self> {
+        if allow_missing_columns {
+            self.schema.replace(first_schema.clone());
+        }
+
         let schema = self.schema().await?;
 
         if let Some(projected_arrow_schema) = projected_arrow_schema {

diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs
@@ -137,6 +137,7 @@ impl LazyFileListReader for LazyJsonLineReader {
             },
             glob: true,
             include_file_paths: self.include_file_paths,
+            allow_missing_columns: false,
         };
 
         let options = NDJsonReadOptions {

diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs
@@ -21,6 +21,7 @@ pub struct ScanArgsParquet {
     /// Expand path given via globbing rules.
     pub glob: bool,
     pub include_file_paths: Option<PlSmallStr>,
+    pub allow_missing_columns: bool,
 }
 
 impl Default for ScanArgsParquet {
@@ -37,6 +38,7 @@ impl Default for ScanArgsParquet {
             cache: true,
             glob: true,
             include_file_paths: None,
+            allow_missing_columns: false,
         }
     }
 }
@@ -74,6 +76,7 @@ impl LazyFileListReader for LazyParquetReader {
             self.args.hive_options,
             self.args.glob,
             self.args.include_file_paths,
+            self.args.allow_missing_columns,
         )?
         .build()
         .into();

diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs
@@ -202,6 +202,8 @@ impl ParquetExec {
                 })
                 .collect::<Vec<_>>();
 
+            let allow_missing_columns = self.file_options.allow_missing_columns;
+
             let out = POOL.install(|| {
                 readers_and_metadata
                     .into_par_iter()
@@ -217,8 +219,9 @@ impl ParquetExec {
                             .with_row_index(row_index)
                             .with_predicate(predicate.clone())
                             .with_arrow_schema_projection(
-                                first_schema.as_ref(),
+                                &first_schema,
                                 projected_arrow_schema.as_deref(),
+                                allow_missing_columns,
                             )?
                             .finish()?;
 
@@ -395,6 +398,7 @@ impl ParquetExec {
             let first_schema = first_schema.clone();
             let projected_arrow_schema = projected_arrow_schema.clone();
             let predicate = predicate.clone();
+            let allow_missing_columns = self.file_options.allow_missing_columns;
 
             if verbose {
                 eprintln!("reading of {}/{} file...", processed, paths.len());
@@ -422,8 +426,9 @@ impl ParquetExec {
                             .with_slice(Some(slice))
                             .with_row_index(row_index)
                             .with_arrow_schema_projection(
-                                first_schema.as_ref(),
+                                &first_schema,
                                 projected_arrow_schema.as_deref(),
+                                allow_missing_columns,
                             )
                             .await?
                             .use_statistics(use_statistics)

diff --git a/crates/polars-parquet/src/arrow/read/deserialize/utils/filter.rs b/crates/polars-parquet/src/arrow/read/deserialize/utils/filter.rs
@@ -29,7 +29,7 @@ impl Filter {
         }
     }
 
-    pub(crate) fn num_rows(&self) -> usize {
+    pub fn num_rows(&self) -> usize {
         match self {
             Filter::Range(range) => range.len(),
             Filter::Mask(bitmap) => bitmap.set_bits(),

diff --git a/crates/polars-parquet/src/parquet/metadata/row_metadata.rs b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs
@@ -49,16 +49,14 @@ impl RowGroupMetadata {
         self.columns.len()
     }
 
-    /// Fetch all columns under this root name.
+    /// Fetch all columns under this root name, or `None` if the root name does not exist.
     pub fn columns_under_root_iter(
         &self,
         root_name: &str,
-    ) -> impl ExactSizeIterator<Item = &ColumnChunkMetadata> + DoubleEndedIterator {
+    ) -> Option<impl ExactSizeIterator<Item = &ColumnChunkMetadata> + DoubleEndedIterator> {
         self.column_lookup
             .get(root_name)
-            .unwrap()
-            .iter()
-            .map(|&x| &self.columns[x])
+            .map(|x| x.iter().map(|&x| &self.columns[x]))
     }
 
     /// Number of rows in this row group.

diff --git a/crates/polars-parquet/src/parquet/read/column/mod.rs b/crates/polars-parquet/src/parquet/read/column/mod.rs
@@ -23,6 +23,7 @@ pub fn get_column_iterator<'a>(
 ) -> ColumnIterator<'a> {
     let columns = row_group
         .columns_under_root_iter(field_name)
+        .unwrap()
         .rev()
         .collect::<UnitVec<_>>();
     ColumnIterator::new(reader, columns, max_page_size)

diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs
@@ -134,6 +134,7 @@ impl ParquetSource {
                 .with_arrow_schema_projection(
                     &self.first_schema,
                     self.projected_arrow_schema.as_deref(),
+                    self.file_options.allow_missing_columns,
                 )?
                 .with_row_index(file_options.row_index)
                 .with_predicate(predicate.clone())
@@ -199,6 +200,7 @@ impl ParquetSource {
                     .with_arrow_schema_projection(
                         &self.first_schema,
                         self.projected_arrow_schema.as_deref(),
+                        self.file_options.allow_missing_columns,
                     )
                     .await?
                     .with_predicate(predicate.clone())

diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs
@@ -54,6 +54,7 @@ impl DslBuilder {
             },
             glob: false,
             include_file_paths: None,
+            allow_missing_columns: false,
         };
 
         Ok(DslPlan::Scan {
@@ -87,6 +88,7 @@ impl DslBuilder {
         hive_options: HiveOptions,
         glob: bool,
         include_file_paths: Option<PlSmallStr>,
+        allow_missing_columns: bool,
     ) -> PolarsResult<Self> {
         let options = FileScanOptions {
             with_columns: None,
@@ -98,6 +100,7 @@ impl DslBuilder {
             hive_options,
             glob,
             include_file_paths,
+            allow_missing_columns,
         };
         Ok(DslPlan::Scan {
             sources,
@@ -143,6 +146,7 @@ impl DslBuilder {
                 hive_options,
                 glob: true,
                 include_file_paths,
+                allow_missing_columns: false,
             },
             scan_type: FileScan::Ipc {
                 options,
@@ -181,6 +185,7 @@ impl DslBuilder {
             },
             glob,
             include_file_paths,
+            allow_missing_columns: false,
         };
         Ok(DslPlan::Scan {
             sources,

diff --git a/crates/polars-plan/src/plans/options.rs b/crates/polars-plan/src/plans/options.rs
@@ -39,6 +39,7 @@ pub struct FileScanOptions {
     pub hive_options: HiveOptions,
     pub glob: bool,
     pub include_file_paths: Option<PlSmallStr>,
+    pub allow_missing_columns: bool,
 }
 
 #[derive(Clone, Debug, Copy, Default, Eq, PartialEq, Hash)]

diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs
@@ -240,7 +240,7 @@ impl PyLazyFrame {
     #[cfg(feature = "parquet")]
     #[staticmethod]
     #[pyo3(signature = (source, sources, n_rows, cache, parallel, rechunk, row_index,
-        low_memory, cloud_options, use_statistics, hive_partitioning, hive_schema, try_parse_hive_dates, retries, glob, include_file_paths)
+        low_memory, cloud_options, use_statistics, hive_partitioning, hive_schema, try_parse_hive_dates, retries, glob, include_file_paths, allow_missing_columns)
     )]
     fn new_from_parquet(
         source: Option<PyObject>,
@@ -259,6 +259,7 @@ impl PyLazyFrame {
         retries: usize,
         glob: bool,
         include_file_paths: Option<String>,
+        allow_missing_columns: bool,
     ) -> PyResult<Self> {
         let parallel = parallel.0;
         let hive_schema = hive_schema.map(|s| Arc::new(s.0));
@@ -287,6 +288,7 @@ impl PyLazyFrame {
             hive_options,
             glob,
             include_file_paths: include_file_paths.map(|x| x.into()),
+            allow_missing_columns,
         };
 
         let sources = sources.0;

diff --git a/crates/polars-python/src/lazyframe/visit.rs b/crates/polars-python/src/lazyframe/visit.rs
@@ -57,7 +57,7 @@ impl NodeTraverser {
     // Increment major on breaking changes to the IR (e.g. renaming
     // fields, reordering tuples), minor on backwards compatible
     // changes (e.g. exposing a new expression node).
-    const VERSION: Version = (2, 0);
+    const VERSION: Version = (2, 1);
 
     pub fn new(root: Node, lp_arena: Arena<IR>, expr_arena: Arena<AExpr>) -> Self {
         Self {