Skip to content

Commit

Permalink
Add parsing for single quote with identifier (for units)
Browse files Browse the repository at this point in the history
  • Loading branch information
xoofx committed Oct 4, 2022
1 parent d87cf12 commit b275754
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 10 deletions.
51 changes: 49 additions & 2 deletions src/compiler/Stark.Compiler.Tests/TestLexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,40 @@ public void TestRuneEscapeSequences(string escaped, string real)
});
}

/// <summary>
/// Checks that a single quote directly followed by an identifier (or a lone underscore) is lexed as
/// a <c>SingleQuote</c> token followed by the identifier/underscore token, and not as a rune.
/// </summary>
/// <remarks>
/// Each expected entry is (token kind, span, value). The span arguments appear to be
/// (offset, length, line, column); the digit ruler above each input gives the byte offsets,
/// with the last digit being the offset at which <c>Eof</c> is reported.
/// </remarks>
[Test]
public void TestSingleQuoteWithIdentifier()
{
// Single-letter identifier after the quote.
// 01234
Lexer($" 'a ", new()
{
(TokenKind.WhiteSpace, new TokenSpan(0, 1, 0, 0), null),
(TokenKind.SingleQuote, new TokenSpan(1, 1, 0, 1), null),
(TokenKind.Identifier, new TokenSpan(2, 1, 0, 2), "a"),
(TokenKind.WhiteSpace, new TokenSpan(3, 1, 0, 3), null),
(TokenKind.Eof, new TokenSpan(4, 0, 0, 4), null),
});

// Multi-letter, mixed-case identifier after the quote.
// 0123456
Lexer($" 'AbC ", new()
{
(TokenKind.WhiteSpace, new TokenSpan(0, 1, 0, 0), null),
(TokenKind.SingleQuote, new TokenSpan(1, 1, 0, 1), null),
(TokenKind.Identifier, new TokenSpan(2, 3, 0, 2), "AbC"),
(TokenKind.WhiteSpace, new TokenSpan(5, 1, 0, 5), null),
(TokenKind.Eof, new TokenSpan(6, 0, 0, 6), null),
});

// A lone underscore after the quote yields an Underscore token rather than an Identifier.
// 01234
Lexer($" '_ ", new()
{
(TokenKind.WhiteSpace, new TokenSpan(0, 1, 0, 0), null),
(TokenKind.SingleQuote, new TokenSpan(1, 1, 0, 1), null),
(TokenKind.Underscore, new TokenSpan(2, 1, 0, 2), null),
(TokenKind.WhiteSpace, new TokenSpan(3, 1, 0, 3), null),
(TokenKind.Eof, new TokenSpan(4, 0, 0, 4), null),
});
}

[Test]
public void TestRuneInvalid()
{
Expand All @@ -353,8 +387,21 @@ public void TestRuneInvalid()
Lexer($" 'ab' ", new()
{
(TokenKind.WhiteSpace, new TokenSpan(0, 1, 0, 0), null),
(TokenKind.Rune, new TokenSpan(1, 4, 0, 1), (int)'a'),
(TokenKind.WhiteSpace, new TokenSpan(5, 1, 0, 5), null),
(TokenKind.SingleQuote, new TokenSpan(1, 1, 0, 1), null),
(TokenKind.Identifier, new TokenSpan(2, 2, 0, 2), "ab"),
(TokenKind.Rune, new TokenSpan(4, 2, 0, 4), (int)' '),
(TokenKind.Eof, new TokenSpan(6, 0, 0, 6), null),
}, new()
{
(DiagnosticId.ERR_UnexpectedEndOfRune, new TextSpan(new TextLocation(6, 0, 6)))
});

// 0123456
Lexer($" '12' ", new()
{
(TokenKind.WhiteSpace, new TokenSpan(0, 1, 0, 0), null),
(TokenKind.Rune, new TokenSpan(1, 4, 0, 1), (int)'1'),
(TokenKind.WhiteSpace, new TokenSpan(5, 1, 0, 5), "ab"),
(TokenKind.Eof, new TokenSpan(6, 0, 0, 6), null),
}, new()
{
Expand Down
1 change: 1 addition & 0 deletions src/compiler/Stark.Compiler/Diagnostics/DiagnosticId.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ public enum DiagnosticId
ERR_InvalidUtf8InRune = 115, // (int c) => Invalid Unicode found in rune `\\U{c:x8}` must be between `\\U00000000` to `\\U0010FFFF`.
ERR_InvalidRuneTooManyCharacters = 116, // Invalid rune. Too many characters. Expecting a single rune.
ERR_InvalidRuneCannotBeEmpty = 117, // Invalid rune. A rune cannot be empty.
ERR_UnexpectedEndOfRune = 118, // Unexpected end of rune without a terminating \'.

ERR_UnexpectedUnderscoreAfterDigit = 120, // Unexpected underscore found after digit. They can only be enclosed by digits.
ERR_UnexpectedCharacterAfterDot = 121, // (string c) => Unexpected character `{c}` found after a dot while parsing a float. Expecting a digit 0-9.
Expand Down
3 changes: 3 additions & 0 deletions src/compiler/Stark.Compiler/Helpers/Utf8Helper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ internal static class Utf8Helper

/// <summary>
/// Tells whether <paramref name="b"/> is an ASCII letter, i.e. in <c>A</c>-<c>Z</c> or <c>a</c>-<c>z</c>.
/// </summary>
/// <param name="b">The byte to classify.</param>
/// <returns><c>true</c> for an upper-case or lower-case ASCII letter; otherwise <c>false</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsLetter(byte b) =>
    b is (>= (byte)'A' and <= (byte)'Z') or (>= (byte)'a' and <= (byte)'z');

/// <summary>
/// Tells whether <paramref name="b"/> is an underscore or a byte accepted by <see cref="IsLetter"/>.
/// </summary>
/// <param name="b">The byte to classify.</param>
/// <returns><c>true</c> for <c>_</c> or a letter; otherwise <c>false</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsLetterOrUnderscore(byte b) => b is (byte)'_' || IsLetter(b);

/// <summary>
/// Tells whether <paramref name="b"/> may continue an identifier: a byte accepted by
/// <c>IsDigit</c>, a letter, or an underscore.
/// </summary>
/// <param name="b">The byte to classify.</param>
/// <returns><c>true</c> when the byte is a digit, a letter or <c>_</c>; otherwise <c>false</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsLetterContinuationForIdentifier(byte b) => IsDigit(b) || IsLetterOrUnderscore(b);
Expand Down
18 changes: 12 additions & 6 deletions src/compiler/Stark.Compiler/Parsing/Lexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -524,8 +524,15 @@ private static void ParseMultiLineStringContent(Lexer lexer, int startStringToke
}
}

/// <summary>
/// Handles a <c>'</c> byte: emits it as a standalone single-quote symbol when it prefixes an
/// identifier (used for units), otherwise parses it as the start of a rune literal.
/// </summary>
/// <param name="lexer">The lexer receiving the tokens.</param>
/// <param name="ptr">Pointer to the <c>'</c> byte in the input.</param>
/// <param name="c">The byte at <paramref name="ptr"/>.</param>
/// <returns>The pointer returned by the parser that was selected.</returns>
private static unsafe byte* ParseSingleQuote(Lexer lexer, byte* ptr, byte c)
{
    // This is a single quote followed by an identifier.
    // A closing quote right after the first letter (e.g. 'a') still means a rune, hence the ptr[2] check.
    // Thanks to short-circuiting, ptr[2] is only read when ptr[1] is a letter or underscore.
    if (Utf8Helper.IsLetterOrUnderscore(ptr[1]) && ptr[2] != '\'')
    {
        return ParseSymbol1Byte(lexer, ptr, c);
    }

    // This must be a rune character
    return ParseSingleLineString(lexer, ptr, ptr, c, 0);
}

Expand Down Expand Up @@ -759,7 +766,7 @@ private static void ParseMultiLineStringContent(Lexer lexer, int startStringToke
}
else if (c == Eof || c == '\r' || c == '\n')
{
lexer.LogError(ERR_UnexpectedEndOfString(), ptr, column);
lexer.LogError(startChar == '\'' ? ERR_UnexpectedEndOfRune() : ERR_UnexpectedEndOfString(), ptr, column);
break;
}
else
Expand Down Expand Up @@ -1665,11 +1672,10 @@ private static unsafe bool TryParseUtf8(ref byte* ptr, out Rune result)
/// <summary>
/// Emits the token for a one-byte symbol and advances past it.
/// </summary>
/// <param name="lexer">The lexer receiving the token; its column is advanced by one.</param>
/// <param name="ptr">Pointer to the symbol byte in the input.</param>
/// <param name="c">The symbol byte, used to look up the token kind.</param>
/// <returns>The pointer to the byte following the symbol.</returns>
private static unsafe byte* ParseSymbol1Byte(Lexer lexer, byte* ptr, byte c)
{
    var offset = (uint)(ptr - lexer._originalPtr);
    var kind = Symbol1ByteToTokenKind[(byte)Utf8Helper.GetClassFromByte(c)];
    Debug.Assert(kind != TokenKind.Invalid);
    lexer.AddToken(kind, new TokenSpan(offset, 1, lexer._line, lexer._column));

    // Consume exactly one byte and one column. The pointer must be advanced only once,
    // otherwise the byte following the symbol would be skipped.
    ptr++;
    lexer._column++;
    return ptr;
}
Expand Down Expand Up @@ -1907,7 +1913,7 @@ private int GetHashCode(ReadOnlySpan<byte> data)
&ParseDollar, // DollarSign, // $
&ParseSymbolMultiBytes, // PercentSign, // %
&ParseSymbolMultiBytes, // Ampersand, // &
&ParseRune, // SingleQuote, // '
&ParseSingleQuote, // SingleQuote, // '
&ParseSymbol1Byte, // LeftParenthesis, // (
&ParseSymbol1Byte, // RightParenthesis, // )
&ParseSymbolMultiBytes, // Asterisk, // *
Expand Down Expand Up @@ -1956,7 +1962,7 @@ private int GetHashCode(ReadOnlySpan<byte> data)
TokenKind.Invalid, // DollarSign, // $ DollarSign, // $
TokenKind.Invalid, // PercentSign, // % PercentSign, // %
TokenKind.Invalid, // Ampersand, // & Ampersand, // &
TokenKind.Invalid, // SingleQuote, // ' SingleQuote, // '
TokenKind.SingleQuote, // SingleQuote, // ' SingleQuote, // '
TokenKind.LeftParent, // LeftParenthesis, // ( LeftParenthesis, // (
TokenKind.RightParent, // RightParenthesis, // ) RightParenthesis, // )
TokenKind.Invalid, // Asterisk, // * Asterisk, // *
Expand Down
18 changes: 16 additions & 2 deletions src/compiler/Stark.Compiler/Syntax/TokenKind.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,24 @@

namespace Stark.Compiler.Syntax;

/// <summary>
/// The kind of a token.
/// </summary>
public enum TokenKind : byte
{
/// <summary>
/// An invalid character.
/// </summary>
Invalid = 0,

/// <summary>
/// An invalid UTF8 character.
/// </summary>
InvalidUtf8,

/// <summary>
/// An invalid TAB character.
/// </summary>
InvalidTab,

Eof,
Expand All @@ -33,12 +47,12 @@ public enum TokenKind : byte

// 1 byte symbols
Exclamation, // !
DoubleQuote, // "
// DoubleQuote, // " not used alone, only used through string
Number, // #
Dollar, // $
Percent, // %
Ampersand, // &
SingleQuote, // '
SingleQuote, // ' used for unit
LeftParent, // (
RightParent, // )
Star, // *
Expand Down

0 comments on commit b275754

Please sign in to comment.