Add an initial Bison parser

thebesttv · Apr 7, 2022 · 7bb7ec1 · 7bb7ec1
1 parent d074303
commit 7bb7ec1
Show file tree

Hide file tree

Showing 4 changed files with 223 additions and 108 deletions.
diff --git a/build0.sh b/build0.sh
@@ -13,5 +13,9 @@ python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
 # Generate a Fortran ASR from ASR.asdl (C++)
 python grammar/asdl_cpp.py grammar/ASR.asdl src/libasr/asr.h
 
-# Generate the tokenizer
+# Generate the tokenizer and parser
 (cd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp)
+(cd src/lpython/parser && bison -Wall -d -r all parser.yy)  # -d also emits the parser.tab.hh header; -r all writes a full report
+# Character tokens are not used in parser.yy: abort if the file contains a single quote.
+grep -n "'" src/lpython/parser/parser.yy && echo "Single quote not allowed" && exit 1
+echo "OK"
diff --git a/ci/build.xsh b/ci/build.xsh
@@ -33,8 +33,9 @@ python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
 # Generate a Python AST from Python.asdl (Python)
 python grammar/asdl_py.py
 
-# Generate the tokenizer
+# Generate the tokenizer and parser
 pushd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp && popd
+pushd src/lpython/parser && bison -Wall -d -r all parser.yy && popd
 
 $lpython_version=$(cat version).strip()
 $dest="lpython-" + $lpython_version

diff --git a/src/lpython/parser/parser.tab.hh b/src/lpython/parser/parser.tab.hh
diff --git a/src/lpython/parser/parser.yy b/src/lpython/parser/parser.yy
@@ -0,0 +1,216 @@
+%require "3.0"
+%define api.pure
+%define api.value.type {LFortran::YYSTYPE}
+//%param {LFortran::Parser &p}
+%locations
+%expect    0   // shift/reduce conflicts
+
+// Uncomment this to get verbose error messages
+//%define parse.error verbose
+
+/*
+// Uncomment this to enable parser tracing. Then in the main code, set
+// extern int yydebug;
+// yydebug=1;
+%define parse.trace
+%printer { fprintf(yyo, "%s", $$.str().c_str()); } <string>
+%printer { fprintf(yyo, "%d", $$); } <n>
+%printer { std::cerr << "AST TYPE: " << $$->type; } <ast>
+*/
+
+
+%code requires // *.h
+{
+//#include <lpython/parser/parser.h>
+}
+
+%code // *.cpp
+{
+
+#include <lpython/parser/parser.h>
+#include <lpython/parser/tokenizer.h>
+#include <lpython/parser/semantics.h>
+
+/*
+int yylex(LFortran::YYSTYPE *yylval, YYLTYPE *yyloc, LFortran::Parser &p)
+{
+    return p.m_tokenizer.lex(p.m_a, *yylval, *yyloc, p.diag);
+} // yylex
+
+void yyerror(YYLTYPE *yyloc, LFortran::Parser &p, const std::string &msg)
+{
+    p.handle_yyerror(*yyloc, msg);
+}
+*/
+
+#define YYLLOC_DEFAULT(Current, Rhs, N)                                 \
+    do                                                                  \
+      if (N) /* non-empty rule: first RHS start .. last RHS end */      \
+        {                                                               \
+          (Current).first   = YYRHSLOC (Rhs, 1).first;                  \
+          (Current).last    = YYRHSLOC (Rhs, N).last;                   \
+        }                                                               \
+      else /* empty rule: collapse to end of preceding symbol */        \
+        {                                                               \
+          (Current).first   = (Current).last   =                        \
+            YYRHSLOC (Rhs, 0).last;                                     \
+        }                                                               \
+    while (0)
+
+} // code
+
+
+// -----------------------------------------------------------------------------
+// List of tokens
+// All tokens that we use (including "+" and such) are declared here first
+// using the %token line. Each token will end up a member of the "enum
+// yytokentype" in parser.tab.hh. Tokens can have a string equivalent (such as
+// "+" for TK_PLUS) that is used later in the file to simplify reading it, but
+// it is equivalent to TK_PLUS. Bison also allows so called "character token
+// type" which are specified using single quotes (and that bypass the %token
+// definitions), and those are not used here, and we test that the whole file
+// does not contain any single quotes to ensure that.
+//
+// If this list is updated, also update token2text() in parser.cpp.
+
+// Terminal tokens
+%token END_OF_FILE 0  // token number 0 is what Bison treats as end of input
+%token TK_NEWLINE
+%token TK_INDENT
+%token TK_DEDENT
+%token <string> TK_NAME
+%token <int_suffix> TK_INTEGER
+%token <string> TK_REAL
+%token <int_suffix> TK_IMAG_NUM
+%token TK_PLUS "+"
+%token TK_MINUS "-"
+%token TK_STAR "*"
+%token TK_SLASH "/"
+%token TK_COLON ":"
+%token TK_SEMICOLON ";"
+%token TK_COMMA ","
+%token TK_EQUAL "="
+%token TK_LPAREN "("
+%token TK_RPAREN ")"
+%token TK_LBRACKET "["
+%token TK_RBRACKET "]"
+%token TK_LBRACE "{"
+%token TK_RBRACE "}"
+%token TK_PERCENT "%"
+%token TK_VBAR "|"
+%token TK_AMPERSAND "&"
+%token TK_DOT "."
+%token TK_TILDE "~"
+%token TK_CARET "^"
+%token TK_AT "@"
+%token <string> TK_STRING
+%token <string> TK_COMMENT
+%token <string> TK_EOLCOMMENT
+%token TK_POW "**"
+%token TK_FLOOR_DIV "//"
+%token TK_RIGHTSHIFT ">>"
+%token TK_LEFTSHIFT "<<"
+%token TK_PLUS_EQUAL "+="
+%token TK_MIN_EQUAL "-="
+%token TK_STAR_EQUAL "*="
+%token TK_SLASH_EQUAL "/="
+%token TK_PERCENT_EQUAL "%="
+%token TK_AMPER_EQUAL "&="
+%token TK_VBAR_EQUAL "|="
+%token TK_CARET_EQUAL "^="
+%token TK_ATEQUAL "@="
+%token TK_RARROW "->"
+%token TK_COLONEQUAL ":="
+%token TK_ELLIPSIS "..."
+%token TK_LEFTSHIFT_EQUAL "<<="
+%token TK_RIGHTSHIFT_EQUAL ">>="
+%token TK_POW_EQUAL "**="
+%token TK_DOUBLESLASH_EQUAL "//="
+%token TK_EQ "=="
+%token TK_NE "!="
+%token TK_LT "<"
+%token TK_LE "<="
+%token TK_GT ">"
+%token TK_GE ">="
+%token TK_NOT "not"
+%token TK_AND "and"
+%token TK_OR "or"
+%token TK_TRUE "True"
+%token TK_FALSE "False"
+
+
+// Terminal tokens: Keywords
+%token KW_AS
+%token KW_ASSERT
+%token KW_ASYNC
+%token KW_AWAIT
+%token KW_BREAK
+%token KW_CLASS
+%token KW_CONTINUE
+%token KW_DEF
+%token KW_DEL
+%token KW_ELIF
+%token KW_ELSE
+%token KW_EXCEPT
+%token KW_FINALLY
+%token KW_FOR
+%token KW_FROM
+%token KW_GLOBAL
+%token KW_IF
+%token KW_IMPORT
+%token KW_IN
+%token KW_IS
+%token KW_LAMBDA
+%token KW_NONE
+%token KW_NONLOCAL
+%token KW_PASS
+%token KW_RAISE
+%token KW_RETURN
+%token KW_TRY
+%token KW_WHILE
+%token KW_WITH
+%token KW_YIELD
+
+// Nonterminal symbols
+
+%type <ast> expr
+
+// Precedence, lowest first: operators declared on a later line bind tighter.
+// Commented-out lines are for operators that have no grammar rule yet.
+//%left "or"
+//%left "and"
+//%precedence "not"
+//%left "==" "!=" "<" "<=" ">" ">="
+%left "-" "+"
+%left "*" "/"
+//%precedence UMINUS
+%right "**"  // right-associative: a ** b ** c groups as a ** (b ** c)
+
+%start expr
+
+%%
+
+// The order of rules does not matter in Bison (unlike in ANTLR). The
+// precedence is specified not by the order but by the %left and %right
+// directives (and, in GLR parsers only, by %dprec).
+
+// ----------------------------------------------------------------------------
+// Top level rules to be used for parsing.
+
+// Higher %dprec means higher precedence
+
+expr
+// ### primary: names, integer literals and parenthesized expressions
+    : TK_NAME { $$ = SYMBOL($1, @$); }
+    | TK_INTEGER { $$ = INTEGER($1, @$); }
+    | "(" expr ")" { $$ = $2; }  // parentheses only group: the inner value is passed through
+
+
+// ### binary arithmetic: ambiguity is resolved by the %left/%right declarations
+    | expr "+" expr { $$ = ADD($1, $3, @$); }
+    | expr "-" expr { $$ = SUB($1, $3, @$); }
+    | expr "*" expr { $$ = MUL($1, $3, @$); }
+    | expr "/" expr { $$ = DIV($1, $3, @$); }
+    | expr "**" expr { $$ = POW($1, $3, @$); }  // groups to the right, see %right "**"
+
+    ;