Skip to content

Commit

Permalink
fix(rust,python): handle edge-case with string-literal replacement when the replace value looks like a capture group (#6765)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored Feb 10, 2023
1 parent aad4aa3 commit 4607eb6
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 13 deletions.
27 changes: 19 additions & 8 deletions polars/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use polars_arrow::export::arrow::compute::substring::substring;
use polars_arrow::export::arrow::{self};
use polars_arrow::kernels::string::*;
use polars_core::export::num::Num;
use polars_core::export::regex::{escape, Regex};
use polars_core::export::regex::{escape, NoExpand, Regex};

use super::*;
#[cfg(feature = "string_encoding")]
Expand Down Expand Up @@ -207,8 +207,8 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
/// Replace the leftmost regex-matched (sub)string with another string; take
/// fast-path for small (<= 32 chars) strings (otherwise regex faster).
fn replace<'a>(&'a self, pat: &str, val: &str) -> PolarsResult<Utf8Chunked> {
let lit = pat.chars().all(|c| !c.is_ascii_punctuation());
let ca = self.as_utf8();
let lit = !(pat.chars().any(|c| c.is_ascii_punctuation())
| val.chars().any(|c| c.is_ascii_punctuation()));
let reg = Regex::new(pat)?;
let f = |s: &'a str| {
if lit && (s.len() <= 32) {
Expand All @@ -217,25 +217,36 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
reg.replace(s, val)
}
};
let ca = self.as_utf8();
Ok(ca.apply(f))
}

/// Replace the leftmost literal (sub)string with another string
// NOTE(review): this listing is a rendered diff without +/- markers, so two signatures and two
// bodies appear back to back. Presumably the one-line delegation to `replace` is the removed
// version and the `NoExpand`-based body is its replacement -- confirm against the commit.
fn replace_literal(&self, pat: &str, val: &str) -> PolarsResult<Utf8Chunked> {
// Escapes `pat` and forwards to `replace`; `val` is handed over unchanged.
self.replace(escape(pat).as_str(), val)
fn replace_literal<'a>(&'a self, pat: &str, val: &str) -> PolarsResult<Utf8Chunked> {
// Compile the escaped pattern once, outside the per-string closure.
let reg = Regex::new(escape(pat).as_str())?;
let f = |s: &'a str| {
// Inputs of at most 32 bytes skip the regex and use `str::replacen`, limited to one replacement.
if s.len() <= 32 {
Cow::Owned(s.replacen(pat, val, 1))
} else {
// `NoExpand` wraps `val` so it is inserted as-is rather than treated as a replacement
// template (e.g. `$1`).
reg.replace(s, NoExpand(val))
}
};
let ca = self.as_utf8();
Ok(ca.apply(f))
}

/// Replace all regex-matched (sub)strings with another string
// NOTE(review): rendered diff without +/- markers -- the named closure `f` followed by
// `Ok(ca.apply(f))` and the inline-closure `Ok(ca.apply(|s| ...))` look like the before/after
// forms of the same statement, not two consecutive returns. Confirm against the commit.
fn replace_all(&self, pat: &str, val: &str) -> PolarsResult<Utf8Chunked> {
let ca = self.as_utf8();
// `pat` is compiled as a regex; a compile error is propagated to the caller by `?`.
let reg = Regex::new(pat)?;
// `val` is passed as a plain `&str`, so `$`-style group references in it are expanded.
let f = |s| reg.replace_all(s, val);
Ok(ca.apply(f))
Ok(ca.apply(|s| reg.replace_all(s, val)))
}

/// Replace all matching literal (sub)strings with another string
// NOTE(review): rendered diff without +/- markers -- the delegation to `replace_all` is
// presumably the removed line and the three statements after it its replacement. Confirm
// against the commit.
fn replace_literal_all(&self, pat: &str, val: &str) -> PolarsResult<Utf8Chunked> {
// Forwards the escaped pattern to `replace_all`, leaving `val` unchanged.
self.replace_all(escape(pat).as_str(), val)
let ca = self.as_utf8();
// Escape `pat` before compiling it as a regex.
let reg = Regex::new(escape(pat).as_str())?;
// `NoExpand(val)` inserts `val` as-is for every match instead of using it as a template.
Ok(ca.apply(|s| reg.replace_all(s, NoExpand(val))))
}

/// Extract the nth capture group from pattern
Expand Down
41 changes: 36 additions & 5 deletions py-polars/tests/unit/namespaces/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,13 @@ def test_replace() -> None:
(r"^\(", "[", True, ["* * text", "(with) special\n * chars **etc...?$"]),
(r"t$", "an", False, ["* * texan", "(with) special\n * chars **etc...?$"]),
(r"t$", "an", True, ["* * text", "(with) special\n * chars **etc...?$"]),
(r"(with) special", "$1", True, ["* * text", "$1\n * chars **etc...?$"]),
(
r"\((with)\) special",
":$1:",
False,
["* * text", ":with:\n * chars **etc...?$"],
),
):
# series
assert (
Expand All @@ -315,23 +322,38 @@ def test_replace() -> None:
)["text"].to_list()
)

assert pl.Series(["."]).str.replace(".", "$0", literal=True)[0] == "$0"
assert pl.Series(["(.)(?)"]).str.replace(".", "$1", literal=True)[0] == "($1)(?)"


def test_replace_all() -> None:
df = pl.DataFrame(
data=[(1, "* * text"), (2, "(with) special * chars **etc...?$")],
data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],
schema=["idx", "text"],
orient="row",
)
for pattern, replacement, as_literal, expected in (
(r"\*", "-", False, ["- - text", "(with) special - chars --etc...?$"]),
(r"*", "-", True, ["- - text", "(with) special - chars --etc...?$"]),
(r"\*", "-", False, ["- - text", "(with) special\n - chars --etc...?$"]),
(r"*", "-", True, ["- - text", "(with) special\n - chars --etc...?$"]),
(r"\W", "", False, ["text", "withspecialcharsetc"]),
(r".?$", "", True, ["* * text", "(with) special * chars **etc.."]),
(r".?$", "", True, ["* * text", "(with) special\n * chars **etc.."]),
(
r"(with) special",
"$1",
True,
["* * text", "$1\n * chars **etc...?$"],
),
(
r"\((with)\) special",
":$1:",
False,
["* * text", ":with:\n * chars **etc...?$"],
),
(
r"(\b)[\w\s]{2,}(\b)",
"$1(blah)$3",
False,
["* * (blah)", "((blah)) (blah) * (blah) **(blah)...?$"],
["* * (blah)", "((blah)) (blah)\n * (blah) **(blah)...?$"],
),
):
# series
Expand All @@ -352,6 +374,15 @@ def test_replace_all() -> None:
with pytest.raises(pl.ComputeError):
df["text"].str.replace_all("*", "")

assert (
pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\?", "$0", literal=True)[0]
== "(.)($0)($0)"
)
assert (
pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\?", "$0", literal=False)[0]
== "(.)(\?)(\?)"
)


def test_replace_expressions() -> None:
df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"], "value": ["A", "B"]})
Expand Down

0 comments on commit 4607eb6

Please sign in to comment.