From 7f74f37f49ac24ff2e425b4d4edb5b63e56fed42 Mon Sep 17 00:00:00 2001 From: A-Walrus Date: Fri, 16 Jun 2023 16:17:10 +0300 Subject: [PATCH 1/3] Fix unicode bug in LanguageLayer Call `as_bytes()` before slicing, so that the slice may start at a byte offset that is not aligned to a char boundary (slicing the `&str` first panics on such an offset). This should also be slightly faster, since no char-boundary check is needed. --- helix-core/src/syntax.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helix-core/src/syntax.rs b/helix-core/src/syntax.rs index 2a5bb974d249..f43b03ade7ac 100644 --- a/helix-core/src/syntax.rs +++ b/helix-core/src/syntax.rs @@ -1402,7 +1402,7 @@ impl LanguageLayer { &mut |byte, _| { if byte <= source.len_bytes() { let (chunk, start_byte, _, _) = source.chunk_at_byte(byte); - chunk[byte - start_byte..].as_bytes() + &chunk.as_bytes()[byte - start_byte..] } else { // out of range &[] From 8d2c6d2aa4afd43ec480e83a3f59660b7bf57442 Mon Sep 17 00:00:00 2001 From: A-Walrus Date: Wed, 21 Jun 2023 16:15:38 +0300 Subject: [PATCH 2/3] Fix byte grapheme UTF-8 bug --- helix-core/src/graphemes.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/helix-core/src/graphemes.rs b/helix-core/src/graphemes.rs index 15ef3eb043e8..0ee79f8608ff 100644 --- a/helix-core/src/graphemes.rs +++ b/helix-core/src/graphemes.rs @@ -114,6 +114,12 @@ pub fn grapheme_width(g: &str) -> usize { } } +/// Byte index aligned to start of the char that it is in. If byte is the start +/// of a char it will remain unchanged. 
+fn aligned_byte_index(slice: RopeSlice, byte_idx: usize) -> usize { + slice.char_to_byte(slice.byte_to_char(byte_idx)) +} + #[must_use] pub fn nth_prev_grapheme_boundary(slice: RopeSlice, char_idx: usize, n: usize) -> usize { // Bounds check @@ -204,7 +210,9 @@ pub fn nth_next_grapheme_boundary(slice: RopeSlice, char_idx: usize, n: usize) - } #[must_use] -pub fn nth_next_grapheme_boundary_byte(slice: RopeSlice, mut byte_idx: usize, n: usize) -> usize { +pub fn nth_next_grapheme_boundary_byte(slice: RopeSlice, byte_idx: usize, n: usize) -> usize { + let mut byte_idx = aligned_byte_index(slice, byte_idx); + // Bounds check debug_assert!(byte_idx <= slice.len_bytes()); @@ -326,6 +334,11 @@ pub fn is_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> bool { /// Returns whether the given byte position is a grapheme boundary. #[must_use] pub fn is_grapheme_boundary_byte(slice: RopeSlice, byte_idx: usize) -> bool { + if aligned_byte_index(slice, byte_idx) != byte_idx { + // byte is not start of char, so definently not start of grapheme + return false; + } + // Bounds check debug_assert!(byte_idx <= slice.len_bytes()); From 82ce2f3d2e28f0bc1478cc54caeb7d14804cdcdc Mon Sep 17 00:00:00 2001 From: A-Walrus Date: Wed, 21 Jun 2023 17:33:44 +0300 Subject: [PATCH 3/3] Revert "Fix byte grapheme UTF-8 bug" This reverts commit 8d2c6d2aa4afd43ec480e83a3f59660b7bf57442. --- helix-core/src/graphemes.rs | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/helix-core/src/graphemes.rs b/helix-core/src/graphemes.rs index 0ee79f8608ff..15ef3eb043e8 100644 --- a/helix-core/src/graphemes.rs +++ b/helix-core/src/graphemes.rs @@ -114,12 +114,6 @@ pub fn grapheme_width(g: &str) -> usize { } } -/// Byte index aligned to start of the char that it is in. If byte is the start -/// of a char it will remain unchanged. 
-fn aligned_byte_index(slice: RopeSlice, byte_idx: usize) -> usize { - slice.char_to_byte(slice.byte_to_char(byte_idx)) -} - #[must_use] pub fn nth_prev_grapheme_boundary(slice: RopeSlice, char_idx: usize, n: usize) -> usize { // Bounds check @@ -210,9 +204,7 @@ pub fn nth_next_grapheme_boundary(slice: RopeSlice, char_idx: usize, n: usize) - } #[must_use] -pub fn nth_next_grapheme_boundary_byte(slice: RopeSlice, byte_idx: usize, n: usize) -> usize { - let mut byte_idx = aligned_byte_index(slice, byte_idx); - +pub fn nth_next_grapheme_boundary_byte(slice: RopeSlice, mut byte_idx: usize, n: usize) -> usize { // Bounds check debug_assert!(byte_idx <= slice.len_bytes()); @@ -334,11 +326,6 @@ pub fn is_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> bool { /// Returns whether the given byte position is a grapheme boundary. #[must_use] pub fn is_grapheme_boundary_byte(slice: RopeSlice, byte_idx: usize) -> bool { - if aligned_byte_index(slice, byte_idx) != byte_idx { - // byte is not start of char, so definently not start of grapheme - return false; - } - // Bounds check debug_assert!(byte_idx <= slice.len_bytes());