diff --git a/crates/polars-io/src/parquet/read/reader.rs b/crates/polars-io/src/parquet/read/reader.rs index 3cef93a89d73..eb3609c127ac 100644 --- a/crates/polars-io/src/parquet/read/reader.rs +++ b/crates/polars-io/src/parquet/read/reader.rs @@ -95,22 +95,38 @@ impl ParquetReader { let schema = self.schema()?; - if let Some(projected_arrow_schema) = projected_arrow_schema { - self.projection = projected_arrow_schema_to_projection_indices( - schema.as_ref(), - projected_arrow_schema, - )?; - } else { - if schema.len() > first_schema.len() { - polars_bail!( - SchemaMismatch: - "parquet file contained extra columns and no selection was given" - ) + (|| { + if let Some(projected_arrow_schema) = projected_arrow_schema { + self.projection = projected_arrow_schema_to_projection_indices( + schema.as_ref(), + projected_arrow_schema, + )?; + } else { + if schema.len() > first_schema.len() { + polars_bail!( + SchemaMismatch: + "parquet file contained extra columns and no selection was given" + ) + } + + self.projection = + projected_arrow_schema_to_projection_indices(schema.as_ref(), first_schema)?; + }; + Ok(()) + })() + .map_err(|e| { + if !allow_missing_columns && matches!(e, PolarsError::ColumnNotFound(_)) { + e.wrap_msg(|s| { + format!( + "error with column selection, \ + consider enabling `allow_missing_columns`: {}", + s + ) + }) + } else { + e } - - self.projection = - projected_arrow_schema_to_projection_indices(schema.as_ref(), first_schema)?; - } + })?; Ok(self) } @@ -316,22 +332,38 @@ impl ParquetAsyncReader { let schema = self.schema().await?; - if let Some(projected_arrow_schema) = projected_arrow_schema { - self.projection = projected_arrow_schema_to_projection_indices( - schema.as_ref(), - projected_arrow_schema, - )?; - } else { - if schema.len() > first_schema.len() { - polars_bail!( - SchemaMismatch: - "parquet file contained extra columns and no selection was given" - ) + (|| { + if let Some(projected_arrow_schema) = projected_arrow_schema { + self.projection = projected_arrow_schema_to_projection_indices( + schema.as_ref(), + projected_arrow_schema, + )?; + } else { + if schema.len() > first_schema.len() { + polars_bail!( + SchemaMismatch: + "parquet file contained extra columns and no selection was given" + ) + } + + self.projection = + projected_arrow_schema_to_projection_indices(schema.as_ref(), first_schema)?; + }; + Ok(()) + })() + .map_err(|e| { + if !allow_missing_columns && matches!(e, PolarsError::ColumnNotFound(_)) { + e.wrap_msg(|s| { + format!( + "error with column selection, \ + consider enabling `allow_missing_columns`: {}", + s + ) + }) + } else { + e } - - self.projection = - projected_arrow_schema_to_projection_indices(schema.as_ref(), first_schema)?; - } + })?; Ok(self) } diff --git a/crates/polars-io/src/parquet/read/utils.rs b/crates/polars-io/src/parquet/read/utils.rs index 62df261237eb..7ce183088aee 100644 --- a/crates/polars-io/src/parquet/read/utils.rs +++ b/crates/polars-io/src/parquet/read/utils.rs @@ -40,7 +40,7 @@ pub(super) fn projected_arrow_schema_to_projection_indices( for (i, field) in projected_arrow_schema.iter_values().enumerate() { let dtype = { let Some((idx, _, field)) = schema.get_full(&field.name) else { - polars_bail!(SchemaMismatch: "did not find column in file: {}", field.name) + polars_bail!(ColumnNotFound: "did not find column in file: {}", field.name) }; projection_indices.push(idx); diff --git a/crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs b/crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs index 3e4d03a3a270..61db45d54a0a 100644 --- a/crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs +++ b/crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs @@ -130,7 +130,7 @@ pub(super) fn ensure_schema_has_projected_fields( let expected_dtype = DataType::from_arrow(&field.dtype, true); let dtype = { let Some(field) = schema.get(&field.name) else { - polars_bail!(SchemaMismatch: "did not find column: {}", field.name) + polars_bail!(ColumnNotFound: "error with column selection, consider enabling `allow_missing_columns`: did not find column in file: {}", field.name) }; DataType::from_arrow(&field.dtype, true) }; diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py index 111289408dea..9604793c7baf 100644 --- a/py-polars/tests/unit/io/test_lazy_parquet.py +++ b/py-polars/tests/unit/io/test_lazy_parquet.py @@ -449,7 +449,7 @@ def test_parquet_schema_mismatch_panic_17067(tmp_path: Path, streaming: bool) -> pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).write_parquet(tmp_path / "1.parquet") pl.DataFrame({"c": [1, 2, 3], "d": [4, 5, 6]}).write_parquet(tmp_path / "2.parquet") - with pytest.raises(pl.exceptions.SchemaError): + with pytest.raises(pl.exceptions.ColumnNotFoundError): pl.scan_parquet(tmp_path).collect(streaming=streaming) @@ -642,5 +642,8 @@ def test_parquet_unaligned_schema_read_missing_cols_from_first( lf = pl.scan_parquet(paths) - with pytest.raises(pl.exceptions.SchemaError, match="did not find column"): + with pytest.raises( + pl.exceptions.ColumnNotFoundError, + match="did not find column in file: a", + ): lf.collect(streaming=streaming) diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 3f7e8b977505..8431c659cce0 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -1932,10 +1932,16 @@ def test_allow_missing_columns( expected = pl.DataFrame({"a": [1, 2], "b": [1, None]}).select(projection) - with pytest.raises(pl.exceptions.SchemaError, match="did not find column"): + with pytest.raises( + pl.exceptions.ColumnNotFoundError, + match="error with column selection, consider enabling `allow_missing_columns`: did not find column in file: b", + ): pl.read_parquet(paths, parallel=parallel) # type: ignore[arg-type] - with pytest.raises(pl.exceptions.SchemaError, match="did not find column"): + with pytest.raises( + pl.exceptions.ColumnNotFoundError, + match="error with column selection, consider enabling `allow_missing_columns`: did not find column in file: b", + ): pl.scan_parquet(paths, parallel=parallel).select(projection).collect( # type: ignore[arg-type] streaming=streaming )