Unicode computed location fix

Reviewed By: captbaritone
Differential Revision: D63879176
fbshipit-source-id: 9093a8916524dad92cab7a4d91bf6917218ee95c
facebook · Oct 4, 2024 · 524f5c4 · 524f5c4
1 parent b02a3f9
commit 524f5c4
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 8 deletions.
diff --git a/compiler/crates/common/src/text_source.rs b/compiler/crates/common/src/text_source.rs
@@ -74,6 +74,7 @@ impl TextSource {
     /**
      * Converts span, which is the relative indices of characters within this text source,
      * into the equivalent line and character number range.
+     * Note: the span's start and end are UTF-8 byte offsets into the text, not character counts.
      */
     pub fn to_span_range(&self, span: Span) -> lsp_types::Range {
         let start = span.start as usize;
@@ -84,13 +85,15 @@ impl TextSource {
         let mut character = self.column_index;
         let mut start_position = lsp_types::Position::default();
         let mut end_position = lsp_types::Position::default();
-        let mut chars = self.text.chars().enumerate().peekable();
+        let mut chars = self.text.chars().peekable();
+
+        let mut bytes_seen = 0;
 
-        while let Some((index, chr)) = chars.next() {
-            if index == start {
+        while let Some(chr) = chars.next() {
+            if bytes_seen == start {
                 start_position = lsp_types::Position::new(line as u32, character as u32);
             }
-            if index == end {
+            if bytes_seen == end {
                 end_position = lsp_types::Position::new(line as u32, character as u32);
                 break;
             }
@@ -99,7 +102,7 @@ impl TextSource {
                 // Line terminators: https://www.ecma-international.org/ecma-262/#sec-line-terminators
                 '\u{000A}' | '\u{000D}' | '\u{2028}' | '\u{2029}' => {
                     // <CRLF>
-                    !matches!((chr, chars.peek()), ('\u{000D}', Some((_, '\u{000D}'))))
+                    !matches!((chr, chars.peek()), ('\u{000D}', Some('\u{000D}')))
                 }
                 _ => false,
             };
@@ -112,6 +115,7 @@ impl TextSource {
             } else {
                 character += 1;
             }
+            bytes_seen += chr.len_utf8();
         }
 
         if start_position != lsp_types::Position::default()
@@ -138,6 +142,15 @@ mod test {
         assert_eq!(range.end, lsp_types::Position::new(0, 5));
     }
 
+    #[test]
+    fn to_range_unicode_test() {
+        let span = Span::new(0, 5);
+        let text_source = TextSource::new("☃ource", 0, 0);
+        let range = text_source.to_span_range(span);
+        assert_eq!(range.start, lsp_types::Position::new(0, 0));
+        assert_eq!(range.end, lsp_types::Position::new(0, 3));
+    }
+
     #[test]
     fn to_range_multi_line_test() {
         // this range contains all characters of `fn foo ...`

diff --git a/compiler/crates/docblock-syntax/src/lib.rs b/compiler/crates/docblock-syntax/src/lib.rs
@@ -85,7 +85,7 @@ pub fn parse_docblock(
  * strings with quotation marks.
  *
  * To account for this, we parse in a single pass, essentially treating each
- * character as a token. This allows us to easily intemperate characters
+ * character as a token. This allows us to easily interpret characters
  * differently in different contexts.
  */
 struct DocblockParser<'a> {
@@ -283,7 +283,7 @@ impl<'a> DocblockParser<'a> {
 
     fn next(&mut self) {
         self.chars.next();
-        self.offset += 1;
+        self.offset += 1; // TODO: advances by 1 per char; confirm this is correct for multi-byte (non-ASCII) characters
     }
 
     /// Advance over a string of characters matching predicate.
@@ -307,7 +307,7 @@ impl<'a> DocblockParser<'a> {
                 break;
             }
         }
-        self.offset += result.len() as u32;
+        self.offset += result.len() as u32; // result.len() returns byte length
         result
     }