diff --git a/polars/polars-core/src/schema.rs b/polars/polars-core/src/schema.rs index aae30a200273..aed70212943a 100644 --- a/polars/polars-core/src/schema.rs +++ b/polars/polars-core/src/schema.rs @@ -131,6 +131,10 @@ impl Schema { .ok_or_else(|| PolarsError::NotFound(name.to_string().into())) } + pub fn remove(&mut self, name: &str) -> Option { + self.inner.remove(name) + } + pub fn get_full(&self, name: &str) -> Option<(usize, &String, &DataType)> { self.inner.get_full(name) } @@ -170,8 +174,18 @@ impl Schema { Some(()) } - pub fn with_column(&mut self, name: String, dtype: DataType) { - self.inner.insert(name, dtype); + /// Insert a new column in the [`Schema`] + /// + /// If an equivalent name already exists in the schema: the name remains and + /// retains in its place in the order, its corresponding value is updated + /// with [`DataType`] and the older dtype is returned inside `Some(_)`. + /// + /// If no equivalent key existed in the map: the new name-dtype pair is + /// inserted, last in order, and `None` is returned. + /// + /// Computes in **O(1)** time (amortized average). + pub fn with_column(&mut self, name: String, dtype: DataType) -> Option { + self.inner.insert(name, dtype) } pub fn merge(&mut self, other: Self) { diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/builder.rs b/polars/polars-lazy/polars-plan/src/logical_plan/builder.rs index f19e4a65c391..1c5cbf5a0669 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/builder.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/builder.rs @@ -378,7 +378,7 @@ impl LogicalPlanBuilder { for fld in other_schema.iter_fields() { if schema.get(fld.name()).is_none() { - schema.with_column(fld.name, fld.dtype) + schema.with_column(fld.name, fld.dtype); } } } @@ -550,7 +550,7 @@ impl LogicalPlanBuilder { if let Expr::Column(name) = e { if let Some(DataType::List(inner)) = schema.get(name) { let inner = *inner.clone(); - schema.with_column(name.to_string(), inner) + schema.with_column(name.to_string(), inner); } (**name).to_owned() diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs b/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs index 1491ae98c803..5024b722692f 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs @@ -107,7 +107,7 @@ impl FunctionNode { if let DataType::Struct(flds) = dtype { for fld in flds { new_schema - .with_column(fld.name().clone(), fld.data_type().clone()) + .with_column(fld.name().clone(), fld.data_type().clone()); } } else { return Err(PolarsError::ComputeError( @@ -115,7 +115,7 @@ impl FunctionNode { )); } } else { - new_schema.with_column(name.clone(), dtype.clone()) + new_schema.with_column(name.clone(), dtype.clone()); } } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs index df0217cd14fa..f707b7934f03 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs @@ -134,7 +134,7 @@ fn update_scan_schema( new_cols.sort_unstable_by_key(|item| item.0); } for item in new_cols { - new_schema.with_column(item.1.clone(), item.2.clone()) + new_schema.with_column(item.1.clone(), item.2.clone()); } Ok(new_schema) } @@ -718,7 +718,7 @@ impl ProjectionPushDown { let other_schema = lp_arena.get(*node).schema(lp_arena); for fld in other_schema.iter_fields() { if new_schema.get(fld.name()).is_none() { - new_schema.with_column(fld.name, fld.dtype) + new_schema.with_column(fld.name, fld.dtype); } } } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/schema.rs b/polars/polars-lazy/polars-plan/src/logical_plan/schema.rs index 9598921ded95..d1b83b7e8c9a 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/schema.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/schema.rs @@ -197,7 +197,7 @@ pub(crate) fn det_join_schema( for (name, dtype) in schema_left.iter() { names.insert(name.as_str()); - new_schema.with_column(name.to_string(), dtype.clone()) + new_schema.with_column(name.to_string(), dtype.clone()); } // make sure that expression are assigned to the schema diff --git a/polars/polars-lazy/src/frame/csv.rs b/polars/polars-lazy/src/frame/csv.rs index 52fc9a6f4bad..5788610b41b2 100644 --- a/polars/polars-lazy/src/frame/csv.rs +++ b/polars/polars-lazy/src/frame/csv.rs @@ -255,7 +255,7 @@ impl<'a> LazyCsvReader<'a> { // the dtypes set may be for the new names, so update again if let Some(overwrite_schema) = self.schema_overwrite { for (name, dtype) in overwrite_schema.iter() { - schema.with_column(name.clone(), dtype.clone()) + schema.with_column(name.clone(), dtype.clone()); } } diff --git a/polars/polars-lazy/src/frame/mod.rs b/polars/polars-lazy/src/frame/mod.rs index 22190c170716..ba3299ce8011 100644 --- a/polars/polars-lazy/src/frame/mod.rs +++ b/polars/polars-lazy/src/frame/mod.rs @@ -278,7 +278,9 @@ impl LazyFrame { // schema after renaming for (old, new) in existing2.iter().zip(new2.iter()) { let dtype = old_schema.try_get(old)?; - new_schema.with_column(new.clone(), dtype.clone()); + if new_schema.with_column(new.clone(), dtype.clone()).is_none() { + new_schema.remove(old); + } } Ok(Arc::new(new_schema)) }; @@ -297,7 +299,12 @@ impl LazyFrame { let columns = std::mem::take(df.get_columns_mut()); DataFrame::new(columns) }, - None, + // Don't allow optimizations. Swapping names are opaque to the optimizer + AllowedOptimizations { + projection_pushdown: false, + predicate_pushdown: false, + ..Default::default() + }, Some(Arc::new(udf_schema)), Some("RENAME_SWAPPING"), ) @@ -343,7 +350,7 @@ impl LazyFrame { cols.truncate(cols.len() - (existing.len() - removed_count)); Ok(df) }, - None, + Default::default(), Some(Arc::new(udf_schema)), Some("RENAME"), ) @@ -1118,7 +1125,7 @@ impl LazyFrame { pub fn map( self, function: F, - optimizations: Option, + optimizations: AllowedOptimizations, schema: Option>, name: Option<&'static str>, ) -> LazyFrame @@ -1130,7 +1137,7 @@ impl LazyFrame { .get_plan_builder() .map( function, - optimizations.unwrap_or_default(), + optimizations, schema, name.unwrap_or("ANONYMOUS UDF"), ) @@ -1206,7 +1213,7 @@ impl LazyFrame { Ok(df) } }, - Some(opt), + opt, Some(Arc::new(udf_schema)), Some("WITH ROW COUNT"), ) diff --git a/polars/polars-lazy/src/tests/predicate_queries.rs b/polars/polars-lazy/src/tests/predicate_queries.rs index 6f4450f07421..5f774d030aab 100644 --- a/polars/polars-lazy/src/tests/predicate_queries.rs +++ b/polars/polars-lazy/src/tests/predicate_queries.rs @@ -107,7 +107,7 @@ fn filter_blocked_by_map() -> PolarsResult<()> { }; let q = df .lazy() - .map(|df| Ok(df), Some(allowed), None, None) + .map(|df| Ok(df), allowed, None, None) .filter(col("A").gt(lit(2i32))); assert!(!predicate_at_scan(q.clone())); diff --git a/py-polars/polars/internals/lazyframe/frame.py b/py-polars/polars/internals/lazyframe/frame.py index 62bd75c508a9..ae1898dac461 100644 --- a/py-polars/polars/internals/lazyframe/frame.py +++ b/py-polars/polars/internals/lazyframe/frame.py @@ -2736,6 +2736,11 @@ def rename(self: LDF, mapping: dict[str, str]) -> LDF: mapping Key value pairs that map from old name to new name. + Notes + ----- + If names are swapped. E.g. 'A' points to 'B' and 'B' points to 'A', polars + will block projection and predicate pushdowns at this node. + Examples -------- >>> df = pl.DataFrame( diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 1eecb55f1932..27f8acaffe43 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -77,7 +77,7 @@ impl PyDataFrame { *dtype_ = dtype; } } else { - schema.with_column(name, dtype) + schema.with_column(name, dtype); } } } diff --git a/py-polars/src/lazy/dataframe.rs b/py-polars/src/lazy/dataframe.rs index 2cc99e71129a..3062cea50935 100644 --- a/py-polars/src/lazy/dataframe.rs +++ b/py-polars/src/lazy/dataframe.rs @@ -879,7 +879,7 @@ impl PyLazyFrame { let udf_schema = schema.map(move |s| Arc::new(move |_: &Schema| Ok(s.clone())) as Arc); - ldf.map(function, Some(opt), udf_schema, None).into() + ldf.map(function, opt, udf_schema, None).into() } pub fn drop_columns(&self, cols: Vec) -> Self { diff --git a/py-polars/tests/unit/test_df.py b/py-polars/tests/unit/test_df.py index 15eeed8d02b5..77ce81090734 100644 --- a/py-polars/tests/unit/test_df.py +++ b/py-polars/tests/unit/test_df.py @@ -1915,6 +1915,41 @@ def test_rename_swap() -> None: ) assert out.frame_equal(expected) + # 6195 + ldf = pl.DataFrame( + { + "weekday": [ + 1, + ], + "priority": [ + 2, + ], + "roundNumber": [ + 3, + ], + "flag": [ + 4, + ], + } + ).lazy() + + # Rename some columns (note: swapping two columns) + rename_dict = { + "weekday": "priority", + "priority": "weekday", + "roundNumber": "round_number", + } + ldf = ldf.rename(rename_dict) + + # Select some columns + ldf = ldf.select(["priority", "weekday", "round_number"]) + + assert ldf.collect().to_dict(False) == { + "priority": [1], + "weekday": [2], + "round_number": [3], + } + def test_rename_same_name() -> None: df = pl.DataFrame(