pola-rs · ritchie46 · Sep 10, 2024 · Sep 8, 2024 · Sep 8, 2024 · Sep 8, 2024
@@ -15,27 +15,17 @@ pub(crate) fn materialize_hive_partitions<D>(
     num_rows: usize,
 ) {
     if let Some(hive_columns) = hive_partition_columns {
-        let Some(first) = hive_columns.first() else {
-            return;
-        };
+        // Insert these hive columns in the order they are stored in the file.
+        for s in hive_columns {
+            let i = match df.get_columns().binary_search_by_key(
+                &reader_schema.index_of(s.name()).unwrap_or(usize::MAX),
+                |df_col| reader_schema.index_of(df_col.name()).unwrap_or(usize::MIN),
+            ) {
+                Ok(i) => i,
+                Err(i) => i,
+            };
 
-        if reader_schema.index_of(first.name()).is_some() {
-            // Insert these hive columns in the order they are stored in the file.
-            for s in hive_columns {
-                let i = match df.get_columns().binary_search_by_key(
-                    &reader_schema.index_of(s.name()).unwrap_or(usize::MAX),
-                    |s| reader_schema.index_of(s.name()).unwrap_or(usize::MIN),
-                ) {
-                    Ok(i) => i,
-                    Err(i) => i,
-                };
-
-                df.insert_column(i, s.new_from_index(0, num_rows)).unwrap();
-            }
-        } else {
-            for s in hive_columns {
-                unsafe { df.with_column_unchecked(s.new_from_index(0, num_rows)) };
-            }
+            df.insert_column(i, s.new_from_index(0, num_rows)).unwrap();
         }
     }
 }
@@ -1068,27 +1068,26 @@ pub(crate) fn maybe_init_projection_excluding_hive(
     // Update `with_columns` with a projection so that hive columns aren't loaded from the
     // file
     let hive_parts = hive_parts?;
-
     let hive_schema = hive_parts.schema();
 
-    let (first_hive_name, _) = hive_schema.get_at_index(0)?;
-
-    // TODO: Optimize this
-    let names = match reader_schema {
-        Either::Left(ref v) => v
-            .contains(first_hive_name.as_str())
-            .then(|| v.iter_names_cloned().collect::<Vec<_>>()),
-        Either::Right(ref v) => v
-            .contains(first_hive_name.as_str())
-            .then(|| v.iter_names_cloned().collect()),
-    };
-
-    let names = names?;
-
-    Some(
-        names
-            .into_iter()
-            .filter(|x| !hive_schema.contains(x))
-            .collect::<Arc<[_]>>(),
-    )
+    match &reader_schema {
+        Either::Left(reader_schema) => hive_schema
+            .iter_names()
+            .any(|x| reader_schema.contains(x))
+            .then(|| {
+                reader_schema
+                    .iter_names_cloned()
+                    .filter(|x| !hive_schema.contains(x))
+                    .collect::<Arc<[_]>>()
+            }),
+        Either::Right(reader_schema) => hive_schema
+            .iter_names()
+            .any(|x| reader_schema.contains(x))
+            .then(|| {
+                reader_schema
+                    .iter_names_cloned()
+                    .filter(|x| !hive_schema.contains(x))
+                    .collect::<Arc<[_]>>()
+            }),
+    }
 }
@@ -554,6 +554,42 @@ def assert_with_projections(lf: pl.LazyFrame, df: pl.DataFrame) -> None:
     )
     assert_with_projections(lf, rhs)
 
+    # partial cols in file
+    partial_path = tmp_path / "a=1/b=2/partial_data.bin"
+    df = pl.DataFrame(
+        {"x": 1, "b": 2, "y": 1},
+        schema={"x": pl.Int32, "b": pl.Int16, "y": pl.Int32},
+    )
+    write_func(df, partial_path)
+
+    rhs = rhs.select(
+        pl.col("x").cast(pl.Int32),
+        pl.col("b").cast(pl.Int16),
+        pl.col("y").cast(pl.Int32),
+        pl.col("a").cast(pl.Int64),
+    )
+
+    lf = scan_func(partial_path, hive_partitioning=True)  # type: ignore[call-arg]
+    assert_frame_equal(lf.collect(projection_pushdown=projection_pushdown), rhs)
+    assert_with_projections(lf, rhs)
+
+    lf = scan_func(  # type: ignore[call-arg]
+        partial_path,
+        hive_schema={"a": pl.String, "b": pl.String},
+        hive_partitioning=True,
+    )
+    rhs = rhs.select(
+        pl.col("x").cast(pl.Int32),
+        pl.col("b").cast(pl.String),
+        pl.col("y").cast(pl.Int32),
+        pl.col("a").cast(pl.String),
+    )
+    assert_frame_equal(
+        lf.collect(projection_pushdown=projection_pushdown),
+        rhs,
+    )
+    assert_with_projections(lf, rhs)
+
 
 @pytest.mark.write_disk
 def test_hive_partition_dates(tmp_path: Path) -> None: