diff --git a/Cargo.lock b/Cargo.lock index 92664f2cf0d9..0de39dc632cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2864,6 +2864,7 @@ dependencies = [ "serde", "serde_json", "smartstring", + "unicode-reverse", "version_check", ] @@ -4276,6 +4277,21 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-reverse" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bea5dacebb0d2d0a69a6700a05b59b3908bf801bf563a49bd27a1b60122962c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + [[package]] name = "unicode-width" version = "0.1.11" diff --git a/Cargo.toml b/Cargo.toml index 74bb4659e092..05ca60d3aab6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,6 +73,7 @@ strum_macros = "0.25" thiserror = "1" tokio = "1.26" tokio-util = "0.7.8" +unicode-reverse = "1.0.8" url = "2.4" version_check = "0.9.4" xxhash-rust = { version = "0.8.6", features = ["xxh3"] } diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index 6c2cf6dae55f..91bf184a72e4 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -123,6 +123,7 @@ list_to_struct = ["polars-plan/list_to_struct"] python = ["pyo3", "polars-plan/python", "polars-core/python", "polars-io/python"] row_hash = ["polars-plan/row_hash"] string_pad = ["polars-plan/string_pad"] +string_reverse = ["polars-plan/string_reverse"] string_to_integer = ["polars-plan/string_to_integer"] arg_where = ["polars-plan/arg_where"] search_sorted = ["polars-plan/search_sorted"] @@ -251,6 +252,7 @@ features = [ "top_k", "approx_unique", "concat_str", + "string_reverse", "string_to_integer", "cse", "dot_diagram", diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 4b8d620fae8c..1be1ecbb045c 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -35,6 +35,7 @@ regex = { workspace = true } serde = { workspace = true, features = ["derive"], optional = true } serde_json = { workspace = true, optional = true } smartstring = { workspace = true } +unicode-reverse = { workspace = true, optional = true } [dev-dependencies] rand = { workspace = true } @@ -87,6 +88,7 @@ diff = [] pct_change = ["diff"] strings = ["polars-core/strings"] string_pad = ["polars-core/strings"] +string_reverse = ["polars-core/strings", "unicode-reverse"] string_to_integer = ["polars-core/strings"] extract_jsonpath = ["serde_json", "jsonpath_lib", "polars-json"] log = [] diff --git a/crates/polars-ops/src/chunked_array/strings/mod.rs b/crates/polars-ops/src/chunked_array/strings/mod.rs index fe1d4c459681..48a224c1575a 100644 --- a/crates/polars-ops/src/chunked_array/strings/mod.rs +++ b/crates/polars-ops/src/chunked_array/strings/mod.rs @@ -33,6 +33,8 @@ use polars_core::prelude::*; pub use split::*; #[cfg(feature = "strings")] pub use strip::*; +#[cfg(feature = "string_reverse")] +mod reverse; pub trait AsUtf8 { fn as_utf8(&self) -> &Utf8Chunked; diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 49ee0a980ee3..009660bc92cb 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -520,6 +520,14 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { ca + other } + /// Reverses the string values + #[must_use] + #[cfg(feature = "string_reverse")] + fn str_reverse(&self) -> Utf8Chunked { + let ca = self.as_utf8(); + reverse::reverse(ca) + } + /// Slice the string values. /// /// Determines a substring starting from `start` and with optional length `length` of each of the elements in `array`. diff --git a/crates/polars-ops/src/chunked_array/strings/reverse.rs b/crates/polars-ops/src/chunked_array/strings/reverse.rs new file mode 100644 index 000000000000..0dba38611cbf --- /dev/null +++ b/crates/polars-ops/src/chunked_array/strings/reverse.rs @@ -0,0 +1,14 @@ +use polars_core::prelude::Utf8Chunked; +use unicode_reverse::reverse_grapheme_clusters_in_place; + +fn to_reverse_helper(s: Option<&str>) -> Option { + s.map(|v| { + let mut text = v.to_string(); + reverse_grapheme_clusters_in_place(&mut text); + text + }) +} + +pub fn reverse(ca: &Utf8Chunked) -> Utf8Chunked { + ca.apply_generic(to_reverse_helper) +} diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index 54cc927bd99a..8a219a0567e7 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -125,6 +125,7 @@ chunked_ids = ["polars-core/chunked_ids"] list_to_struct = ["polars-ops/list_to_struct"] row_hash = ["polars-core/row_hash", "polars-ops/hash"] string_pad = ["polars-ops/string_pad"] +string_reverse = ["polars-ops/string_reverse"] string_to_integer = ["polars-ops/string_to_integer"] arg_where = [] search_sorted = ["polars-ops/search_sorted"] diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 9a850e51ca02..d38e7926921d 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -64,6 +64,8 @@ pub enum StringFunction { n: i64, literal: bool, }, + #[cfg(feature = "string_reverse")] + Reverse, #[cfg(feature = "string_pad")] PadStart { length: usize, @@ -131,6 +133,8 @@ impl StringFunction { LenChars => mapper.with_dtype(DataType::UInt32), #[cfg(feature = "regex")] Replace { .. } => mapper.with_same_dtype(), + #[cfg(feature = "string_reverse")] + Reverse => mapper.with_same_dtype(), #[cfg(feature = "temporal")] Strptime(dtype, _) => mapper.with_dtype(dtype.clone()), Split(_) => mapper.with_dtype(DataType::List(Box::new(DataType::Utf8))), @@ -202,6 +206,8 @@ impl Display for StringFunction { PadStart { .. } => "pad_start", #[cfg(feature = "regex")] Replace { .. } => "replace", + #[cfg(feature = "string_reverse")] + Reverse => "reverse", #[cfg(feature = "string_encoding")] HexEncode => "hex_encode", #[cfg(feature = "binary_encoding")] @@ -303,6 +309,8 @@ impl From for SpecialEq> { ConcatHorizontal(delimiter) => map_as_slice!(strings::concat_hor, &delimiter), #[cfg(feature = "regex")] Replace { n, literal } => map_as_slice!(strings::replace, literal, n), + #[cfg(feature = "string_reverse")] + Reverse => map!(strings::reverse), Uppercase => map!(strings::uppercase), Lowercase => map!(strings::lowercase), #[cfg(feature = "nightly")] @@ -802,6 +810,12 @@ pub(super) fn replace(s: &[Series], literal: bool, n: i64) -> PolarsResult PolarsResult { + let ca = s.utf8()?; + Ok(ca.str_reverse().into_series()) +} + #[cfg(feature = "string_to_integer")] pub(super) fn to_integer(s: &Series, base: u32, strict: bool) -> PolarsResult { let ca = s.utf8()?; diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index 1a4f314f077d..e1171dc17485 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ b/crates/polars-plan/src/dsl/string.rs @@ -330,6 +330,17 @@ impl StringNameSpace { ) } + #[cfg(feature = "string_reverse")] + /// Reverse each string + pub fn reverse(self) -> Expr { + self.0.map_many_private( + FunctionExpr::StringExpr(StringFunction::Reverse), + &[], + false, + false, + ) + } + /// Remove leading and trailing characters, or whitespace if matches is None. pub fn strip_chars(self, matches: Expr) -> Expr { self.0.map_many_private( diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index e4ab592402f5..08de3e3e6751 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -172,6 +172,7 @@ list_gather = ["polars-ops/list_gather", "polars-lazy?/list_gather"] describe = ["polars-core/describe"] timezones = ["polars-core/timezones", "polars-lazy?/timezones", "polars-io/timezones"] string_pad = ["polars-lazy?/string_pad", "polars-ops/string_pad"] +string_reverse = ["polars-lazy?/string_reverse", "polars-ops/string_reverse"] string_to_integer = ["polars-lazy?/string_to_integer", "polars-ops/string_to_integer"] arg_where = ["polars-lazy?/arg_where"] search_sorted = ["polars-lazy?/search_sorted"] @@ -315,6 +316,7 @@ docs-selection = [ "asof_join", "cross_join", "concat_str", + "string_reverse", "string_to_integer", "decompress", "mode", diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 2ac84cff929c..12d6f4e04ccd 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -76,6 +76,7 @@ features = [ "semi_anti_join", "serde-lazy", "string_encoding", + "string_reverse", "string_to_integer", "string_pad", "strings", diff --git a/py-polars/docs/source/reference/expressions/string.rst b/py-polars/docs/source/reference/expressions/string.rst index 824ea71c7c1a..fa19a7460e25 100644 --- a/py-polars/docs/source/reference/expressions/string.rst +++ b/py-polars/docs/source/reference/expressions/string.rst @@ -33,6 +33,7 @@ The following methods are available under the `expr.str` attribute. Expr.str.pad_start Expr.str.replace Expr.str.replace_all + Expr.str.reverse Expr.str.rjust Expr.str.rstrip Expr.str.slice diff --git a/py-polars/docs/source/reference/series/string.rst b/py-polars/docs/source/reference/series/string.rst index 44d917b23f50..a4a774b5ce3c 100644 --- a/py-polars/docs/source/reference/series/string.rst +++ b/py-polars/docs/source/reference/series/string.rst @@ -33,6 +33,7 @@ The following methods are available under the `Series.str` attribute. Series.str.pad_start Series.str.replace Series.str.replace_all + Series.str.reverse Series.str.rjust Series.str.rstrip Series.str.slice diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 3b8e2e0adfd8..406fc99072f4 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -1944,6 +1944,27 @@ def replace_all( value = parse_as_expression(value, str_as_lit=True) return wrap_expr(self._pyexpr.str_replace_all(pattern, value, literal)) + def reverse(self) -> Expr: + """ + Returns string values in reversed order. + + Examples + -------- + >>> df = pl.DataFrame({"text": ["foo", "bar", "man\u0303ana"]}) + >>> df.with_columns(pl.col("text").str.reverse().alias("reversed")) + shape: (3, 2) + ┌────────┬──────────┐ + │ text ┆ reversed │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════╪══════════╡ + │ foo ┆ oof │ + │ bar ┆ rab │ + │ mañana ┆ anañam │ + └────────┴──────────┘ + """ + return wrap_expr(self._pyexpr.str_reverse()) + def slice(self, offset: int, length: int | None = None) -> Expr: """ Create subslices of the string values of a Utf8 Series. diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index 91937f5767f0..ae1a6b742fce 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -1441,6 +1441,23 @@ def to_titlecase(self) -> Series: """ + def reverse(self) -> Series: + """ + Returns string values in reversed order. + + Examples + -------- + >>> s = pl.Series("text", ["foo", "bar", "man\u0303ana"]) + >>> s.str.reverse() + shape: (3,) + Series: 'text' [str] + [ + "oof" + "rab" + "anañam" + ] + """ + def slice(self, offset: int, length: int | None = None) -> Series: """ Create subslices of the string values of a Utf8 Series. diff --git a/py-polars/src/expr/string.rs b/py-polars/src/expr/string.rs index a82e1dbccc94..08493d1898b8 100644 --- a/py-polars/src/expr/string.rs +++ b/py-polars/src/expr/string.rs @@ -141,6 +141,10 @@ impl PyExpr { .into() } + fn str_reverse(&self) -> Self { + self.inner.clone().str().reverse().into() + } + fn str_pad_start(&self, length: usize, fill_char: char) -> Self { self.inner.clone().str().pad_start(length, fill_char).into() } diff --git a/py-polars/tests/unit/namespaces/string/test_string.py b/py-polars/tests/unit/namespaces/string/test_string.py index b76f84a0409e..d61d195f478b 100644 --- a/py-polars/tests/unit/namespaces/string/test_string.py +++ b/py-polars/tests/unit/namespaces/string/test_string.py @@ -1137,3 +1137,23 @@ def test_string_extract_groups_lazy_schema_10305() -> None: ) assert df.schema == {"candidate": pl.Utf8, "ref": pl.Utf8} + + +def test_string_reverse() -> None: + df = pl.DataFrame( + { + "text": [None, "foo", "bar", "i like pizza&#", None, "man\u0303ana"], + } + ) + expected = pl.DataFrame( + [ + pl.Series( + "text", + [None, "oof", "rab", "#&azzip ekil i", None, "anan\u0303am"], + dtype=pl.Utf8, + ), + ] + ) + + result = df.select(pl.col("text").str.reverse()) + assert_frame_equal(result, expected)