Add parsing for single quote with identifier (for units)

goncalo · Oct 4, 2022 · b275754 · b275754
1 parent d87cf12
commit b275754
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 10 deletions.
diff --git a/src/compiler/Stark.Compiler.Tests/TestLexer.cs b/src/compiler/Stark.Compiler.Tests/TestLexer.cs
@@ -334,6 +334,40 @@ public void TestRuneEscapeSequences(string escaped, string real)
         });
     }
 
+    [Test]
+    public void TestSingleQuoteWithIdentifier()
+    {
+        //           01234
+        Lexer($" 'a ", new()
+        {
+            (TokenKind.WhiteSpace, new TokenSpan(0, 1, 0, 0), null),
+            (TokenKind.SingleQuote, new TokenSpan(1, 1, 0, 1), null),
+            (TokenKind.Identifier, new TokenSpan(2, 1, 0, 2), "a"),
+            (TokenKind.WhiteSpace, new TokenSpan(3, 1, 0, 3), null),
+            (TokenKind.Eof, new TokenSpan(4, 0, 0, 4), null),
+        });
+
+        //           0123456
+        Lexer($" 'AbC ", new()
+        {
+            (TokenKind.WhiteSpace, new TokenSpan(0, 1, 0, 0), null),
+            (TokenKind.SingleQuote, new TokenSpan(1, 1, 0, 1), null),
+            (TokenKind.Identifier, new TokenSpan(2, 3, 0, 2), "AbC"),
+            (TokenKind.WhiteSpace, new TokenSpan(5, 1, 0, 5), null),
+            (TokenKind.Eof, new TokenSpan(6, 0, 0, 6), null),
+        });
+
+        //           01234
+        Lexer($" '_ ", new()
+        {
+            (TokenKind.WhiteSpace, new TokenSpan(0, 1, 0, 0), null),
+            (TokenKind.SingleQuote, new TokenSpan(1, 1, 0, 1), null),
+            (TokenKind.Underscore, new TokenSpan(2, 1, 0, 2), null),
+            (TokenKind.WhiteSpace, new TokenSpan(3, 1, 0, 3), null),
+            (TokenKind.Eof, new TokenSpan(4, 0, 0, 4), null),
+        });
+    }
+
     [Test]
     public void TestRuneInvalid()
     {
@@ -353,8 +387,21 @@ public void TestRuneInvalid()
         Lexer($" 'ab' ", new()
         {
             (TokenKind.WhiteSpace, new TokenSpan(0, 1, 0, 0), null),
-            (TokenKind.Rune, new TokenSpan(1, 4, 0, 1), (int)'a'),
-            (TokenKind.WhiteSpace, new TokenSpan(5, 1, 0, 5), null),
+            (TokenKind.SingleQuote, new TokenSpan(1, 1, 0, 1), null),
+            (TokenKind.Identifier, new TokenSpan(2, 2, 0, 2), "ab"),
+            (TokenKind.Rune, new TokenSpan(4, 2, 0, 4), (int)' '),
+            (TokenKind.Eof, new TokenSpan(6, 0, 0, 6), null),
+        }, new()
+        {
+            (DiagnosticId.ERR_UnexpectedEndOfRune, new TextSpan(new TextLocation(6, 0, 6)))
+        });
+
+        //           0123456
+        Lexer($" '12' ", new()
+        {
+            (TokenKind.WhiteSpace, new TokenSpan(0, 1, 0, 0), null),
+            (TokenKind.Rune, new TokenSpan(1, 4, 0, 1), (int)'1'),
+            (TokenKind.WhiteSpace, new TokenSpan(5, 1, 0, 5), null),
             (TokenKind.Eof, new TokenSpan(6, 0, 0, 6), null),
         }, new()
         {

diff --git a/src/compiler/Stark.Compiler/Diagnostics/DiagnosticId.cs b/src/compiler/Stark.Compiler/Diagnostics/DiagnosticId.cs
@@ -30,6 +30,7 @@ public enum DiagnosticId
     ERR_InvalidUtf8InRune = 115, // (int c) => Invalid Unicode found in rune `\\U{c:x8}` must be between `\\U00000000` to `\\U0010FFFF`.
     ERR_InvalidRuneTooManyCharacters = 116, // Invalid rune. Too many characters. Expecting a single rune.
     ERR_InvalidRuneCannotBeEmpty = 117, // Invalid rune. A rune cannot be empty.
+    ERR_UnexpectedEndOfRune = 118, // Unexpected end of rune without a terminating \'.
 
     ERR_UnexpectedUnderscoreAfterDigit = 120, // Unexpected underscore found after digit. They can only be enclosed by digits.
     ERR_UnexpectedCharacterAfterDot = 121, // (string c) => Unexpected character `{c}` found after a dot while parsing a float. Expecting a digit 0-9.

diff --git a/src/compiler/Stark.Compiler/Helpers/Utf8Helper.cs b/src/compiler/Stark.Compiler/Helpers/Utf8Helper.cs
@@ -19,6 +19,9 @@ internal static class Utf8Helper
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static bool IsLetter(byte b) => b >= (byte)'A' && b <= (byte)'Z' || b >= (byte)'a' && b <= (byte)'z';
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static bool IsLetterOrUnderscore(byte b) => IsLetter(b) || b == (byte)'_';
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static bool IsLetterContinuationForIdentifier(byte b) => IsDigit(b) || IsLetter(b) || b == (byte)'_';

diff --git a/src/compiler/Stark.Compiler/Parsing/Lexer.cs b/src/compiler/Stark.Compiler/Parsing/Lexer.cs
@@ -524,8 +524,15 @@ private static void ParseMultiLineStringContent(Lexer lexer, int startStringToke
         }
     }
 
-    private static unsafe byte* ParseRune(Lexer lexer, byte* ptr, byte c)
+    private static unsafe byte* ParseSingleQuote(Lexer lexer, byte* ptr, byte c)
     {
+        // A single quote followed by an identifier start that is not a one-character rune such as 'a'
+        if (Utf8Helper.IsLetterOrUnderscore(ptr[1]) && ptr[2] != '\'')
+        {
+            return ParseSymbol1Byte(lexer, ptr, c);
+        }
+
+        // This must be a rune character
         return ParseSingleLineString(lexer, ptr, ptr, c, 0);
     }
 
@@ -759,7 +766,7 @@ private static void ParseMultiLineStringContent(Lexer lexer, int startStringToke
             }
             else if (c == Eof || c == '\r' || c == '\n')
             {
-                lexer.LogError(ERR_UnexpectedEndOfString(), ptr, column);
+                lexer.LogError(startChar == '\'' ? ERR_UnexpectedEndOfRune() : ERR_UnexpectedEndOfString(), ptr, column);
                 break;
             }
             else
@@ -1665,11 +1672,10 @@ private static unsafe bool TryParseUtf8(ref byte* ptr, out Rune result)
     private static unsafe byte* ParseSymbol1Byte(Lexer lexer, byte* ptr, byte c)
     {
         var offset = (uint)(ptr - lexer._originalPtr);
-        ptr++;
         var kind = Symbol1ByteToTokenKind[(byte)Utf8Helper.GetClassFromByte(c)];
         Debug.Assert(kind != TokenKind.Invalid);
         lexer.AddToken(kind, new TokenSpan(offset, 1, lexer._line, lexer._column));
-
+        ptr++;
         lexer._column++;
         return ptr;
     }
@@ -1907,7 +1913,7 @@ private int GetHashCode(ReadOnlySpan<byte> data)
         &ParseDollar, // DollarSign,            // $
         &ParseSymbolMultiBytes, // PercentSign,           // %
         &ParseSymbolMultiBytes, // Ampersand,             // &
-        &ParseRune, // SingleQuote,           // '
+        &ParseSingleQuote, // SingleQuote,           // '
         &ParseSymbol1Byte, // LeftParenthesis,       // (
         &ParseSymbol1Byte, // RightParenthesis,      // )
         &ParseSymbolMultiBytes, // Asterisk,              // *
@@ -1956,7 +1962,7 @@ private int GetHashCode(ReadOnlySpan<byte> data)
         TokenKind.Invalid, // DollarSign,            // $                               DollarSign,            // $
         TokenKind.Invalid, // PercentSign,           // %                                        PercentSign,           // %
         TokenKind.Invalid, // Ampersand,             // &                                        Ampersand,             // &
-        TokenKind.Invalid, // SingleQuote,           // '                                        SingleQuote,           // '
+        TokenKind.SingleQuote, // SingleQuote,           // '                                        SingleQuote,           // '
         TokenKind.LeftParent, // LeftParenthesis,       // (                          LeftParenthesis,       // (
         TokenKind.RightParent, // RightParenthesis,      // )                         RightParenthesis,      // )
         TokenKind.Invalid, // Asterisk,              // *                                        Asterisk,              // *

diff --git a/src/compiler/Stark.Compiler/Syntax/TokenKind.cs b/src/compiler/Stark.Compiler/Syntax/TokenKind.cs
@@ -4,10 +4,24 @@
 
 namespace Stark.Compiler.Syntax;
 
+/// <summary>
+/// The kind of a token.
+/// </summary>
 public enum TokenKind : byte
 {
+    /// <summary>
+    /// An invalid character.
+    /// </summary>
     Invalid = 0,
+
+    /// <summary>
+    /// An invalid UTF8 character.
+    /// </summary>
     InvalidUtf8,
+
+    /// <summary>
+    /// An invalid TAB character.
+    /// </summary>
     InvalidTab,
 
     Eof,
@@ -33,12 +47,12 @@ public enum TokenKind : byte
 
     // 1 byte symbols
     Exclamation,        // !
-    DoubleQuote,        // "
+    // DoubleQuote,        // " not used alone, only used through string
     Number,             // #
     Dollar,             // $
     Percent,            // %
     Ampersand,          // &
-    SingleQuote,        // '
+    SingleQuote,        // ' used for unit
     LeftParent,         // (
     RightParent,        // )
     Star,               // *