Skip to content

Commit

Permalink
bytes: escape invalid UTF-8 bytes in debug output for Match
Browse files Browse the repository at this point in the history
  • Loading branch information
notJoon committed Jun 9, 2024
1 parent ab4c8d1 commit 1f9f9cc
Showing 1 changed file with 91 additions and 11 deletions.
102 changes: 91 additions & 11 deletions src/regex/bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1555,18 +1555,13 @@ impl<'h> Match<'h> {

impl<'h> core::fmt::Debug for Match<'h> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
use regex_automata::util::escape::DebugHaystack;

let mut fmt = f.debug_struct("Match");
fmt.field("start", &self.start).field("end", &self.end);
if let Ok(s) = core::str::from_utf8(self.as_bytes()) {
fmt.field("bytes", &s);
} else {
// FIXME: It would be nice if this could be printed as a string
// with invalid UTF-8 replaced with hex escapes. A alloc would
// probably okay if that makes it easier, but regex-automata does
// (at time of writing) have internal routines that do this. So
// maybe we should expose them.
fmt.field("bytes", &self.as_bytes());
}
fmt.field("start", &self.start)
.field("end", &self.end)
.field("bytes", &DebugHaystack(&self.as_bytes()));

fmt.finish()
}
}
Expand Down Expand Up @@ -2620,3 +2615,88 @@ fn no_expansion<T: AsRef<[u8]>>(replacement: &T) -> Option<Cow<'_, [u8]>> {
None => Some(Cow::Borrowed(replacement)),
}
}

#[cfg(test)]
mod tests {
use super::*;
use alloc::format;

#[test]
fn test_match_properties() {
let haystack = b"Hello, world!";
let m = Match::new(haystack, 7, 12);

assert_eq!(m.start(), 7);
assert_eq!(m.end(), 12);
assert_eq!(m.is_empty(), false);
assert_eq!(m.len(), 5);
assert_eq!(m.as_bytes(), b"world");
}

#[test]
fn test_empty_match() {
let haystack = b"";
let m = Match::new(haystack, 0, 0);

assert_eq!(m.is_empty(), true);
assert_eq!(m.len(), 0);
}

#[test]
fn test_debug_output_valid_utf8() {
let haystack = b"Hello, world!";
let m = Match::new(haystack, 7, 12);
let debug_str = format!("{:?}", m);

assert_eq!(
debug_str,
r#"Match { start: 7, end: 12, bytes: "world" }"#
);
}

#[test]
fn test_debug_output_invalid_utf8() {
let haystack = b"Hello, \xFFworld!";
let m = Match::new(haystack, 7, 13);
let debug_str = format!("{:?}", m);

assert_eq!(
debug_str,
r#"Match { start: 7, end: 13, bytes: "\xffworld" }"#
);
}

#[test]
fn test_debug_output_various_unicode() {
let haystack =
"Hello, 😊 world! 안녕하세요? مرحبا بالعالم!".as_bytes();
let m = Match::new(haystack, 0, haystack.len());
let debug_str = format!("{:?}", m);

assert_eq!(
debug_str,
r#"Match { start: 0, end: 62, bytes: "Hello, 😊 world! 안녕하세요? مرحبا بالعالم!" }"#
);
}

#[test]
fn test_debug_output_ascii_escape() {
let haystack = b"Hello,\tworld!\nThis is a \x1b[31mtest\x1b[0m.";
let m = Match::new(haystack, 0, haystack.len());
let debug_str = format!("{:?}", m);

assert_eq!(
debug_str,
r#"Match { start: 0, end: 38, bytes: "Hello,\tworld!\nThis is a \u{1b}[31mtest\u{1b}[0m." }"#
);
}

#[test]
fn test_debug_output_match_in_middle() {
let haystack = b"The quick brown fox jumps over the lazy dog.";
let m = Match::new(haystack, 16, 19);
let debug_str = format!("{:?}", m);

assert_eq!(debug_str, r#"Match { start: 16, end: 19, bytes: "fox" }"#);
}
}

0 comments on commit 1f9f9cc

Please sign in to comment.