Skip to content

Commit

Permalink
feat: Expressify str.json_path_match (#15764)
Browse files Browse the repository at this point in the history
  • Loading branch information
reswqa authored Apr 19, 2024
1 parent 4a870cc commit 7540f98
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 22 deletions.
45 changes: 36 additions & 9 deletions crates/polars-ops/src/chunked_array/strings/json_path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,21 @@ use std::borrow::Cow;

use arrow::array::ValueSize;
use jsonpath_lib::PathCompiled;
use polars_core::prelude::arity::{broadcast_try_binary_elementwise, unary_elementwise};
use serde_json::Value;

use super::*;

/// Returns the first value matched by the compiled JSONPath `expr` in `json_str`.
///
/// Yields `None` when `json_str` cannot be parsed as JSON, when the selection
/// fails or matches nothing, or when the first match is JSON `null`. A JSON
/// string match is returned as its inner text; any other value is rendered
/// with `to_string()`.
pub fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option<Cow<'a, str>> {
pub fn extract_json(expr: &PathCompiled, json_str: &str) -> Option<String> {
serde_json::from_str(json_str).ok().and_then(|value| {
// TODO: a lot of heap allocations here. Improve json path by adding a take?
let result = expr.select(&value).ok()?;
let first = *result.first()?;

match first {
Value::String(s) => Some(Cow::Owned(s.clone())),
Value::String(s) => Some(s.clone()),
Value::Null => None,
v => Some(Cow::Owned(v.to_string())),
v => Some(v.to_string()),
}
})
}
Expand All @@ -41,12 +42,38 @@ pub fn select_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option<Cow<'a,
pub trait Utf8JsonPathImpl: AsString {
/// Extract json path, first match
/// Refer to <https://goessner.net/articles/JsonPath/>
fn json_path_match(&self, json_path: &str) -> PolarsResult<StringChunked> {
let pat = PathCompiled::compile(json_path)
.map_err(|e| polars_err!(ComputeError: "error compiling JSONpath expression {}", e))?;
Ok(self
.as_string()
.apply(|opt_s| opt_s.and_then(|s| extract_json(&pat, s))))
fn json_path_match(&self, json_path: &StringChunked) -> PolarsResult<StringChunked> {
let ca = self.as_string();
match (ca.len(), json_path.len()) {
(_, 1) => {
// SAFETY: `json_path` was verified to have exactly 1 element.
let opt_path = unsafe { json_path.get_unchecked(0) };
let out = if let Some(path) = opt_path {
let pat = PathCompiled::compile(path).map_err(
|e| polars_err!(ComputeError: "error compiling JSON path expression {}", e),
)?;
unary_elementwise(ca, |opt_s| opt_s.and_then(|s| extract_json(&pat, s)))
} else {
StringChunked::full_null(ca.name(), ca.len())
};
Ok(out)
},
(len_ca, len_path) if len_ca == 1 || len_ca == len_path => {
broadcast_try_binary_elementwise(ca, json_path, |opt_str, opt_path| {
match (opt_str, opt_path) {
(Some(str_val), Some(path)) => {
PathCompiled::compile(path)
.map_err(|e| polars_err!(ComputeError: "error compiling JSON path expression {}", e))
.map(|path| extract_json(&path, str_val))
},
_ => Ok(None),
}
})
},
(len_ca, len_path) => {
polars_bail!(ComputeError: "The length of `ca` and `json_path` should either 1 or the same, but `{}`, `{}` founded", len_ca, len_path)
},
}
}

/// Returns the inferred DataType for JSON values for each row
Expand Down
13 changes: 7 additions & 6 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ pub enum StringFunction {
infer_schema_len: Option<usize>,
},
#[cfg(feature = "extract_jsonpath")]
JsonPathMatch(String),
JsonPathMatch,
#[cfg(feature = "regex")]
Replace {
// negative is replace all
Expand Down Expand Up @@ -149,7 +149,7 @@ impl StringFunction {
#[cfg(feature = "extract_jsonpath")]
JsonDecode { dtype, .. } => mapper.with_opt_dtype(dtype.clone()),
#[cfg(feature = "extract_jsonpath")]
JsonPathMatch(_) => mapper.with_dtype(DataType::String),
JsonPathMatch => mapper.with_dtype(DataType::String),
LenBytes => mapper.with_dtype(DataType::UInt32),
LenChars => mapper.with_dtype(DataType::UInt32),
#[cfg(feature = "regex")]
Expand Down Expand Up @@ -221,7 +221,7 @@ impl Display for StringFunction {
#[cfg(feature = "extract_jsonpath")]
JsonDecode { .. } => "json_decode",
#[cfg(feature = "extract_jsonpath")]
JsonPathMatch(_) => "json_path_match",
JsonPathMatch => "json_path_match",
LenBytes => "len_bytes",
Lowercase => "lowercase",
LenChars => "len_chars",
Expand Down Expand Up @@ -374,7 +374,7 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
infer_schema_len,
} => map!(strings::json_decode, dtype.clone(), infer_schema_len),
#[cfg(feature = "extract_jsonpath")]
JsonPathMatch(pat) => map!(strings::json_path_match, &pat),
JsonPathMatch => map_as_slice!(strings::json_path_match),
#[cfg(feature = "find_many")]
ContainsMany {
ascii_case_insensitive,
Expand Down Expand Up @@ -994,7 +994,8 @@ pub(super) fn json_decode(
}

#[cfg(feature = "extract_jsonpath")]
pub(super) fn json_path_match(s: &Series, pat: &str) -> PolarsResult<Series> {
let ca = s.str()?;
/// Applies `str.json_path_match` to the function inputs: `s[0]` holds the JSON
/// strings and `s[1]` the JSONPath expression(s). Both must be string columns.
pub(super) fn json_path_match(s: &[Series]) -> PolarsResult<Series> {
    let json = s[0].str()?;
    let paths = s[1].str()?;
    json.json_path_match(paths).map(|out| out.into_series())
}
10 changes: 7 additions & 3 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -562,8 +562,12 @@ impl StringNameSpace {
}

#[cfg(feature = "extract_jsonpath")]
pub fn json_path_match(self, pat: String) -> Expr {
self.0
.map_private(FunctionExpr::StringExpr(StringFunction::JsonPathMatch(pat)))
/// Build the `json_path_match` string expression, taking the JSONPath as an
/// expression so it can be a literal or a column.
pub fn json_path_match(self, pat: Expr) -> Expr {
    let function = FunctionExpr::StringExpr(StringFunction::JsonPathMatch);
    let path_input = [pat];
    self.0
        .map_many_private(function, &path_input, false, false)
}
}
3 changes: 2 additions & 1 deletion py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1304,7 +1304,7 @@ def json_decode(
dtype = py_type_to_dtype(dtype)
return wrap_expr(self._pyexpr.str_json_decode(dtype, infer_schema_length))

def json_path_match(self, json_path: str) -> Expr:
def json_path_match(self, json_path: IntoExprColumn) -> Expr:
"""
Extract the first match of JSON string with the provided JSONPath expression.
Expand Down Expand Up @@ -1345,6 +1345,7 @@ def json_path_match(self, json_path: str) -> Expr:
│ {"a":true} ┆ true │
└────────────┴─────────┘
"""
json_path = parse_as_expression(json_path, str_as_lit=True)
return wrap_expr(self._pyexpr.str_json_path_match(json_path))

def decode(self, encoding: TransferEncoding, *, strict: bool = True) -> Expr:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ def json_decode(
]
"""

def json_path_match(self, json_path: str) -> Series:
def json_path_match(self, json_path: IntoExprColumn) -> Series:
"""
Extract the first match of JSON string with the provided JSONPath expression.
Expand Down
4 changes: 2 additions & 2 deletions py-polars/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,8 @@ impl PyExpr {
}

#[cfg(feature = "extract_jsonpath")]
fn str_json_path_match(&self, pat: String) -> Self {
self.inner.clone().str().json_path_match(pat).into()
/// Python binding for `str.json_path_match`; `pat` is the JSONPath expression.
fn str_json_path_match(&self, pat: Self) -> Self {
    let matched = self.inner.clone().str().json_path_match(pat.inner);
    matched.into()
}

fn str_extract(&self, pat: Self, group_index: usize) -> Self {
Expand Down
28 changes: 28 additions & 0 deletions py-polars/tests/unit/namespaces/string/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,34 @@ def test_jsonpath_single() -> None:
assert_series_equal(s.str.json_path_match("$.a"), expected)


def test_json_path_match() -> None:
    """Check `str.json_path_match` with the path given as a column and as a literal.

    Covers three combinations: column strings with column paths, column strings
    with a literal path, and a literal JSON string with column paths.
    """
    json_values = [
        '{"a":"1"}',
        None,
        '{"b":2}',
        '{"a":2.1, "b": "hello"}',
        '{"a":true}',
    ]
    json_paths = ["$.a", "$.a", "$.b", "$.b", None]
    df = pl.DataFrame({"str": json_values, "pat": json_paths})

    str_col = pl.col("str")
    pat_col = pl.col("pat")
    json_literal = pl.lit('{"a": 1.1, "b": 10}')

    out = df.select(
        all_expr=str_col.str.json_path_match(pat_col),
        str_expr=str_col.str.json_path_match("$.a"),
        pat_expr=json_literal.str.json_path_match(pat_col),
    )

    # A null on either side (string or path) yields a null result.
    expected_columns = {
        "all_expr": ["1", None, "2", "hello", None],
        "str_expr": ["1", None, None, "2.1", "true"],
        "pat_expr": ["1.1", "1.1", "10", "10", None],
    }
    expected = pl.DataFrame(expected_columns)
    assert_frame_equal(out, expected)


def test_extract_regex() -> None:
s = pl.Series(
[
Expand Down

0 comments on commit 7540f98

Please sign in to comment.