From 7bb7ec12cb30e702d4541a4046636d6fd867677f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= <ondrej@certik.us>
Date: Thu, 7 Apr 2022 11:35:00 -0600
Subject: [PATCH] Add an initial Bison parser

---
 build0.sh                        |   6 +-
 ci/build.xsh                     |   3 +-
 src/lpython/parser/parser.tab.hh | 106 ---------------
 src/lpython/parser/parser.yy     | 216 +++++++++++++++++++++++++++++++
 4 files changed, 223 insertions(+), 108 deletions(-)
 delete mode 100644 src/lpython/parser/parser.tab.hh
 create mode 100644 src/lpython/parser/parser.yy

diff --git a/build0.sh b/build0.sh
index 2730c54671..aa7e560697 100755
--- a/build0.sh
+++ b/build0.sh
@@ -13,5 +13,9 @@ python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
 # Generate a Fortran ASR from ASR.asdl (C++)
 python grammar/asdl_cpp.py grammar/ASR.asdl src/libasr/asr.h
 
-# Generate the tokenizer
+# Generate the tokenizer and parser
 (cd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp)
+(cd src/lpython/parser && bison -Wall -d -r all parser.yy)
+# Character tokens (single quotes) are not allowed in parser.yy; fail if any are found
+grep -n "'" src/lpython/parser/parser.yy && echo "Single quote not allowed" && exit 1
+echo "OK"
diff --git a/ci/build.xsh b/ci/build.xsh
index e574948ada..ce00b9df7d 100755
--- a/ci/build.xsh
+++ b/ci/build.xsh
@@ -33,8 +33,9 @@ python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
 # Generate a Python AST from Python.asdl (Python)
 python grammar/asdl_py.py
 
-# Generate the tokenizer
+# Generate the tokenizer and parser
 pushd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp && popd
+pushd src/lpython/parser && bison -Wall -d -r all parser.yy && popd
 
 $lpython_version=$(cat version).strip()
 $dest="lpython-" + $lpython_version
diff --git a/src/lpython/parser/parser.tab.hh b/src/lpython/parser/parser.tab.hh
deleted file mode 100644
index 373410a0f5..0000000000
--- a/src/lpython/parser/parser.tab.hh
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef LPYTHON_PARSER_PARSER_TAB_H
-#define LPYTHON_PARSER_PARSER_TAB_H
-
-/*
- * This file contains manual list of tokens. These will be later
- * autogenerated from the parser.
- */
-
-  enum yytokentype
-  {
-    END_OF_FILE = 0,
-    TK_NEWLINE = 258,
-    TK_INDENT = 259,
-    TK_DEDENT = 260,
-    TK_NAME = 261,
-    TK_INTEGER = 262,
-    TK_REAL = 263,
-    TK_IMAG_NUM = 264,
-    TK_PLUS = 265,
-    TK_MINUS = 266,
-    TK_STAR = 267,
-    TK_SLASH = 268,
-    TK_COLON = 269,
-    TK_SEMICOLON = 270,
-    TK_COMMA = 271,
-    TK_EQUAL = 272,
-    TK_LPAREN = 273,
-    TK_RPAREN = 274,
-    TK_LBRACKET = 275,
-    TK_RBRACKET = 276,
-    TK_LBRACE = 277,
-    TK_RBRACE = 278,
-    TK_PERCENT = 279,
-    TK_VBAR = 280,
-    TK_AMPERSAND = 281,
-    TK_DOT = 282,
-    TK_TILDE = 283,
-    TK_CARET = 284,
-    TK_AT = 285,
-    TK_STRING = 286,
-    TK_COMMENT = 287,
-    TK_EOLCOMMENT = 288,
-    TK_POW = 289,
-    TK_FLOOR_DIV = 290,
-    TK_RIGHTSHIFT = 291,
-    TK_LEFTSHIFT = 292,
-    TK_PLUS_EQUAL = 293,
-    TK_MIN_EQUAL = 294,
-    TK_STAR_EQUAL = 295,
-    TK_SLASH_EQUAL = 296,
-    TK_PERCENT_EQUAL = 297,
-    TK_AMPER_EQUAL = 298,
-    TK_VBAR_EQUAL = 299,
-    TK_CARET_EQUAL = 300,
-    TK_ATEQUAL = 301,
-    TK_RARROW = 302,
-    TK_COLONEQUAL = 303,
-    TK_ELLIPSIS = 304,
-    TK_LEFTSHIFT_EQUAL = 305,
-    TK_RIGHTSHIFT_EQUAL = 306,
-    TK_POW_EQUAL = 307,
-    TK_DOUBLESLASH_EQUAL = 308,
-    TK_EQ = 309,
-    TK_NE = 310,
-    TK_LT = 311,
-    TK_LE = 312,
-    TK_GT = 313,
-    TK_GE = 314,
-    TK_NOT = 315,
-    TK_AND = 316,
-    TK_OR = 317,
-    TK_TRUE = 318,
-    TK_FALSE = 319,
-    KW_AS = 320,
-    KW_ASSERT = 321,
-    KW_ASYNC = 322,
-    KW_AWAIT = 323,
-    KW_BREAK = 324,
-    KW_CLASS = 325,
-    KW_CONTINUE = 326,
-    KW_DEF = 327,
-    KW_DEL = 328,
-    KW_ELIF = 329,
-    KW_ELSE = 330,
-    KW_EXCEPT = 331,
-    KW_FINALLY = 332,
-    KW_FOR = 333,
-    KW_FROM = 334,
-    KW_GLOBAL = 335,
-    KW_IF = 336,
-    KW_IMPORT = 337,
-    KW_IN = 338,
-    KW_IS = 339,
-    KW_LAMBDA = 340,
-    KW_NONE = 341,
-    KW_NONLOCAL = 342,
-    KW_PASS = 343,
-    KW_RAISE = 344,
-    KW_RETURN = 345,
-    KW_TRY = 346,
-    KW_WHILE = 347,
-    KW_WITH = 348,
-    KW_YIELD = 349
-  };
-
-#endif // LPYTHON_PARSER_PARSER_TAB_H
diff --git a/src/lpython/parser/parser.yy b/src/lpython/parser/parser.yy
new file mode 100644
index 0000000000..34f4dfddb5
--- /dev/null
+++ b/src/lpython/parser/parser.yy
@@ -0,0 +1,216 @@
+%require "3.0"
+%define api.pure
+%define api.value.type {LFortran::YYSTYPE}
+//%param {LFortran::Parser &p}
+%locations
+%expect    0   // shift/reduce conflicts
+
+// Uncomment this to get verbose error messages
+//%define parse.error verbose
+
+/*
+// Uncomment this to enable parser tracing. Then in the main code, set
+// extern int yydebug;
+// yydebug=1;
+%define parse.trace
+%printer { fprintf(yyo, "%s", $$.str().c_str()); } <string>
+%printer { fprintf(yyo, "%d", $$); } <n>
+%printer { std::cerr << "AST TYPE: " << $$->type; } <ast>
+*/
+
+
+%code requires // *.h
+{
+//#include <lpython/parser/parser.h>
+}
+
+%code // *.cpp
+{
+
+#include <lpython/parser/parser.h>
+#include <lpython/parser/tokenizer.h>
+#include <lpython/parser/semantics.h>
+
+/*
+int yylex(LFortran::YYSTYPE *yylval, YYLTYPE *yyloc, LFortran::Parser &p)
+{
+    return p.m_tokenizer.lex(p.m_a, *yylval, *yyloc, p.diag);
+} // yylex
+
+void yyerror(YYLTYPE *yyloc, LFortran::Parser &p, const std::string &msg)
+{
+    p.handle_yyerror(*yyloc, msg);
+}
+*/
+
+#define YYLLOC_DEFAULT(Current, Rhs, N)                                 \
+    do                                                                  \
+      if (N)                                                            \
+        {                                                               \
+          (Current).first   = YYRHSLOC (Rhs, 1).first;                  \
+          (Current).last    = YYRHSLOC (Rhs, N).last;                   \
+        }                                                               \
+      else                                                              \
+        {                                                               \
+          (Current).first   = (Current).last   =                        \
+            YYRHSLOC (Rhs, 0).last;                                     \
+        }                                                               \
+    while (0)
+
+} // code
+
+
+// -----------------------------------------------------------------------------
+// List of tokens
+// All tokens that we use (including "+" and such) are declared here first
+// using the %token line. Each token will end up a member of the "enum
+// yytokentype" in parser.tab.hh. Tokens can have a string equivalent (such as
+// "+" for TK_PLUS) that is used later in the file to simplify reading it, but
+// it is equivalent to TK_PLUS. Bison also allows so-called "character token
+// types", which are specified using single quotes (and which bypass the
+// %token definitions). Those are not used here, and build0.sh checks that this
+// file does not contain any single quotes to ensure that.
+//
+// If this list is updated, update also token2text() in parser.cpp.
+
+// Terminal tokens
+%token END_OF_FILE 0
+%token TK_NEWLINE
+%token TK_INDENT
+%token TK_DEDENT
+%token <string> TK_NAME
+%token <int_suffix> TK_INTEGER
+%token <string> TK_REAL
+%token <int_suffix> TK_IMAG_NUM
+%token TK_PLUS "+"
+%token TK_MINUS "-"
+%token TK_STAR "*"
+%token TK_SLASH "/"
+%token TK_COLON ":"
+%token TK_SEMICOLON ";"
+%token TK_COMMA ","
+%token TK_EQUAL "="
+%token TK_LPAREN "("
+%token TK_RPAREN ")"
+%token TK_LBRACKET "["
+%token TK_RBRACKET "]"
+%token TK_LBRACE "{"
+%token TK_RBRACE "}"
+%token TK_PERCENT "%"
+%token TK_VBAR "|"
+%token TK_AMPERSAND "&"
+%token TK_DOT "."
+%token TK_TILDE "~"
+%token TK_CARET "^"
+%token TK_AT "@"
+%token <string> TK_STRING
+%token <string> TK_COMMENT
+%token <string> TK_EOLCOMMENT
+%token TK_POW "**"
+%token TK_FLOOR_DIV "//"
+%token TK_RIGHTSHIFT ">>"
+%token TK_LEFTSHIFT "<<"
+%token TK_PLUS_EQUAL "+="
+%token TK_MIN_EQUAL "-="
+%token TK_STAR_EQUAL "*="
+%token TK_SLASH_EQUAL "/="
+%token TK_PERCENT_EQUAL "%="
+%token TK_AMPER_EQUAL "&="
+%token TK_VBAR_EQUAL "|="
+%token TK_CARET_EQUAL "^="
+%token TK_ATEQUAL "@="
+%token TK_RARROW "->"
+%token TK_COLONEQUAL ":="
+%token TK_ELLIPSIS "..."
+%token TK_LEFTSHIFT_EQUAL "<<="
+%token TK_RIGHTSHIFT_EQUAL ">>="
+%token TK_POW_EQUAL "**="
+%token TK_DOUBLESLASH_EQUAL "//="
+%token TK_EQ "=="
+%token TK_NE "!="
+%token TK_LT "<"
+%token TK_LE "<="
+%token TK_GT ">"
+%token TK_GE ">="
+%token TK_NOT "not"
+%token TK_AND "and"
+%token TK_OR "or"
+%token TK_TRUE "True"
+%token TK_FALSE "False"
+
+
+// Terminal tokens: Keywords
+%token KW_AS
+%token KW_ASSERT
+%token KW_ASYNC
+%token KW_AWAIT
+%token KW_BREAK
+%token KW_CLASS
+%token KW_CONTINUE
+%token KW_DEF
+%token KW_DEL
+%token KW_ELIF
+%token KW_ELSE
+%token KW_EXCEPT
+%token KW_FINALLY
+%token KW_FOR
+%token KW_FROM
+%token KW_GLOBAL
+%token KW_IF
+%token KW_IMPORT
+%token KW_IN
+%token KW_IS
+%token KW_LAMBDA
+%token KW_NONE
+%token KW_NONLOCAL
+%token KW_PASS
+%token KW_RAISE
+%token KW_RETURN
+%token KW_TRY
+%token KW_WHILE
+%token KW_WITH
+%token KW_YIELD
+
+// Nonterminal symbols
+
+%type <ast> expr
+
+// Precedence
+
+//%left "or"
+//%left "and"
+//%precedence "not"
+//%left "==" "!=" "<" "<=" ">" ">="
+%left "-" "+"
+%left "*" "/"
+//%precedence UMINUS
+%right "**"
+
+%start expr
+
+%%
+
+// The order of rules does not matter in Bison (unlike in ANTLR). The
+// precedence is specified not by the order but by %left and %right directives
+// as well as with %dprec (which Bison honors only in GLR parsers).
+
+// ----------------------------------------------------------------------------
+// Top level rules to be used for parsing.
+
+// Higher %dprec means higher precedence
+
+expr
+// ### primary: names, integer literals and parenthesized sub-expressions
+    : TK_NAME { $$ = SYMBOL($1, @$); }
+    | TK_INTEGER { $$ = INTEGER($1, @$); }
+    | "(" expr ")" { $$ = $2; }
+
+// ### binary operators: these rules are ambiguous on their own; the
+// %left/%right declarations in the Precedence section resolve the conflicts
+    | expr "+" expr { $$ = ADD($1, $3, @$); }
+    | expr "-" expr { $$ = SUB($1, $3, @$); }
+    | expr "*" expr { $$ = MUL($1, $3, @$); }
+    | expr "/" expr { $$ = DIV($1, $3, @$); }
+    | expr "**" expr { $$ = POW($1, $3, @$); }
+
+    ;