From 7bb7ec12cb30e702d4541a4046636d6fd867677f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?=
Date: Thu, 7 Apr 2022 11:35:00 -0600
Subject: [PATCH] Add an initial Bison parser

---
Reviewer notes (this area is ignored when the patch is applied):
  * build0.sh and ci/build.xsh now run bison on parser.yy right after re2c.
  * build0.sh additionally fails if parser.yy contains a single quote.
  * The manually maintained token list parser.tab.hh is deleted.
  * parser.yy so far parses names, integers, parentheses and + - * / **.

 build0.sh                        |   6 +-
 ci/build.xsh                     |   3 +-
 src/lpython/parser/parser.tab.hh | 106 ---------------
 src/lpython/parser/parser.yy     | 216 +++++++++++++++++++++++++++++++
 4 files changed, 223 insertions(+), 108 deletions(-)
 delete mode 100644 src/lpython/parser/parser.tab.hh
 create mode 100644 src/lpython/parser/parser.yy

diff --git a/build0.sh b/build0.sh
index 2730c54671..aa7e560697 100755
--- a/build0.sh
+++ b/build0.sh
@@ -13,5 +13,9 @@ python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
 # Generate a Fortran ASR from ASR.asdl (C++)
 python grammar/asdl_cpp.py grammar/ASR.asdl src/libasr/asr.h
 
-# Generate the tokenizer
+# Generate the tokenizer and parser
 (cd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp)
+(cd src/lpython/parser && bison -Wall -d -r all parser.yy)
+
+grep -n "'" src/lpython/parser/parser.yy && echo "Single quote not allowed" && exit 1
+echo "OK"
diff --git a/ci/build.xsh b/ci/build.xsh
index e574948ada..ce00b9df7d 100755
--- a/ci/build.xsh
+++ b/ci/build.xsh
@@ -33,8 +33,9 @@ python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
 # Generate a Python AST from Python.asdl (Python)
 python grammar/asdl_py.py
 
-# Generate the tokenizer
+# Generate the tokenizer and parser
 pushd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp && popd
+pushd src/lpython/parser && bison -Wall -d -r all parser.yy && popd
 
 $lpython_version=$(cat version).strip()
 $dest="lpython-" + $lpython_version
diff --git a/src/lpython/parser/parser.tab.hh b/src/lpython/parser/parser.tab.hh
deleted file mode 100644
index 373410a0f5..0000000000
--- a/src/lpython/parser/parser.tab.hh
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef LPYTHON_PARSER_PARSER_TAB_H
-#define LPYTHON_PARSER_PARSER_TAB_H
-
-/*
- * This file contains manual list of tokens. These will be later
- * autogenerated from the parser.
- */
-
-  enum yytokentype
-  {
-    END_OF_FILE = 0,
-    TK_NEWLINE = 258,
-    TK_INDENT = 259,
-    TK_DEDENT = 260,
-    TK_NAME = 261,
-    TK_INTEGER = 262,
-    TK_REAL = 263,
-    TK_IMAG_NUM = 264,
-    TK_PLUS = 265,
-    TK_MINUS = 266,
-    TK_STAR = 267,
-    TK_SLASH = 268,
-    TK_COLON = 269,
-    TK_SEMICOLON = 270,
-    TK_COMMA = 271,
-    TK_EQUAL = 272,
-    TK_LPAREN = 273,
-    TK_RPAREN = 274,
-    TK_LBRACKET = 275,
-    TK_RBRACKET = 276,
-    TK_LBRACE = 277,
-    TK_RBRACE = 278,
-    TK_PERCENT = 279,
-    TK_VBAR = 280,
-    TK_AMPERSAND = 281,
-    TK_DOT = 282,
-    TK_TILDE = 283,
-    TK_CARET = 284,
-    TK_AT = 285,
-    TK_STRING = 286,
-    TK_COMMENT = 287,
-    TK_EOLCOMMENT = 288,
-    TK_POW = 289,
-    TK_FLOOR_DIV = 290,
-    TK_RIGHTSHIFT = 291,
-    TK_LEFTSHIFT = 292,
-    TK_PLUS_EQUAL = 293,
-    TK_MIN_EQUAL = 294,
-    TK_STAR_EQUAL = 295,
-    TK_SLASH_EQUAL = 296,
-    TK_PERCENT_EQUAL = 297,
-    TK_AMPER_EQUAL = 298,
-    TK_VBAR_EQUAL = 299,
-    TK_CARET_EQUAL = 300,
-    TK_ATEQUAL = 301,
-    TK_RARROW = 302,
-    TK_COLONEQUAL = 303,
-    TK_ELLIPSIS = 304,
-    TK_LEFTSHIFT_EQUAL = 305,
-    TK_RIGHTSHIFT_EQUAL = 306,
-    TK_POW_EQUAL = 307,
-    TK_DOUBLESLASH_EQUAL = 308,
-    TK_EQ = 309,
-    TK_NE = 310,
-    TK_LT = 311,
-    TK_LE = 312,
-    TK_GT = 313,
-    TK_GE = 314,
-    TK_NOT = 315,
-    TK_AND = 316,
-    TK_OR = 317,
-    TK_TRUE = 318,
-    TK_FALSE = 319,
-    KW_AS = 320,
-    KW_ASSERT = 321,
-    KW_ASYNC = 322,
-    KW_AWAIT = 323,
-    KW_BREAK = 324,
-    KW_CLASS = 325,
-    KW_CONTINUE = 326,
-    KW_DEF = 327,
-    KW_DEL = 328,
-    KW_ELIF = 329,
-    KW_ELSE = 330,
-    KW_EXCEPT = 331,
-    KW_FINALLY = 332,
-    KW_FOR = 333,
-    KW_FROM = 334,
-    KW_GLOBAL = 335,
-    KW_IF = 336,
-    KW_IMPORT = 337,
-    KW_IN = 338,
-    KW_IS = 339,
-    KW_LAMBDA = 340,
-    KW_NONE = 341,
-    KW_NONLOCAL = 342,
-    KW_PASS = 343,
-    KW_RAISE = 344,
-    KW_RETURN = 345,
-    KW_TRY = 346,
-    KW_WHILE = 347,
-    KW_WITH = 348,
-    KW_YIELD = 349
-  };
-
-#endif // LPYTHON_PARSER_PARSER_TAB_H
diff --git a/src/lpython/parser/parser.yy b/src/lpython/parser/parser.yy
new file mode 100644
index 0000000000..34f4dfddb5
--- /dev/null
+++ b/src/lpython/parser/parser.yy
@@ -0,0 +1,216 @@
+%require "3.0"
+%define api.pure
+%define api.value.type {LFortran::YYSTYPE}
+//%param {LFortran::Parser &p}
+%locations
+%expect 0 // shift/reduce conflicts
+
+// Uncomment the next line to get verbose error messages
+//%define parse.error verbose
+
+/*
+// Uncomment this block to enable parser tracing. Then in the main code, set
+//     extern int yydebug;
+//     yydebug=1;
+%define parse.trace
+%printer { fprintf(yyo, "%s", $$.str().c_str()); }
+%printer { fprintf(yyo, "%d", $$); }
+%printer { std::cerr << "AST TYPE: " << $$->type; }
+*/
+
+
+%code requires // *.h
+{
+//#include
+}
+
+%code // *.cpp
+{
+
+#include
+#include
+#include
+
+/*
+int yylex(LFortran::YYSTYPE *yylval, YYLTYPE *yyloc, LFortran::Parser &p)
+{
+    return p.m_tokenizer.lex(p.m_a, *yylval, *yyloc, p.diag);
+} // yylex
+
+void yyerror(YYLTYPE *yyloc, LFortran::Parser &p, const std::string &msg)
+{
+    p.handle_yyerror(*yyloc, msg);
+}
+*/
+// A rule spans its first..last RHS symbol; an empty rule reuses slot 0's end.
+#define YYLLOC_DEFAULT(Current, Rhs, N)                             \
+    do                                                              \
+        if (N)                                                      \
+        {                                                           \
+            (Current).first = YYRHSLOC (Rhs, 1).first;              \
+            (Current).last = YYRHSLOC (Rhs, N).last;                \
+        }                                                           \
+        else                                                        \
+        {                                                           \
+            (Current).first = (Current).last =                      \
+                YYRHSLOC (Rhs, 0).last;                             \
+        }                                                           \
+    while (0)
+
+} // code
+
+
+// -----------------------------------------------------------------------------
+// List of tokens
+// All tokens that we use (including "+" and such) are declared here first
+// using the %token line. Each token will end up a member of the "enum
+// yytokentype" in parser.tab.hh. Tokens can have a string equivalent (such as
+// "+" for TK_PLUS) that is used later in the file to simplify reading it, but
+// it is equivalent to TK_PLUS. Bison also allows so-called "character token
+// types" which are specified using single quotes (and that bypass the %token
+// definitions); those are not used here, and we test that the whole file
+// does not contain any single quotes to ensure that.
+//
+// If this list is updated, also update token2text() in parser.cpp.
+
+// Terminal tokens
+%token END_OF_FILE 0
+%token TK_NEWLINE
+%token TK_INDENT
+%token TK_DEDENT
+%token TK_NAME
+%token TK_INTEGER
+%token TK_REAL
+%token TK_IMAG_NUM
+%token TK_PLUS          "+"
+%token TK_MINUS         "-"
+%token TK_STAR          "*"
+%token TK_SLASH         "/"
+%token TK_COLON         ":"
+%token TK_SEMICOLON     ";"
+%token TK_COMMA         ","
+%token TK_EQUAL         "="
+%token TK_LPAREN        "("
+%token TK_RPAREN        ")"
+%token TK_LBRACKET      "["
+%token TK_RBRACKET      "]"
+%token TK_LBRACE        "{"
+%token TK_RBRACE        "}"
+%token TK_PERCENT       "%"
+%token TK_VBAR          "|"
+%token TK_AMPERSAND     "&"
+%token TK_DOT           "."
+%token TK_TILDE         "~"
+%token TK_CARET         "^"
+%token TK_AT            "@"
+%token TK_STRING
+%token TK_COMMENT
+%token TK_EOLCOMMENT
+%token TK_POW           "**"
+%token TK_FLOOR_DIV     "//"
+%token TK_RIGHTSHIFT    ">>"
+%token TK_LEFTSHIFT     "<<"
+%token TK_PLUS_EQUAL    "+="
+%token TK_MIN_EQUAL     "-="
+%token TK_STAR_EQUAL    "*="
+%token TK_SLASH_EQUAL   "/="
+%token TK_PERCENT_EQUAL "%="
+%token TK_AMPER_EQUAL   "&="
+%token TK_VBAR_EQUAL    "|="
+%token TK_CARET_EQUAL   "^="
+%token TK_ATEQUAL       "@="
+%token TK_RARROW        "->"
+%token TK_COLONEQUAL    ":="
+%token TK_ELLIPSIS      "..."
+%token TK_LEFTSHIFT_EQUAL   "<<="
+%token TK_RIGHTSHIFT_EQUAL  ">>="
+%token TK_POW_EQUAL         "**="
+%token TK_DOUBLESLASH_EQUAL "//="
+%token TK_EQ                "=="
+%token TK_NE                "!="
+%token TK_LT                "<"
+%token TK_LE                "<="
+%token TK_GT                ">"
+%token TK_GE                ">="
+%token TK_NOT               "not"
+%token TK_AND               "and"
+%token TK_OR                "or"
+%token TK_TRUE              "True"
+%token TK_FALSE             "False"
+
+
+// Terminal tokens: Keywords
+%token KW_AS
+%token KW_ASSERT
+%token KW_ASYNC
+%token KW_AWAIT
+%token KW_BREAK
+%token KW_CLASS
+%token KW_CONTINUE
+%token KW_DEF
+%token KW_DEL
+%token KW_ELIF
+%token KW_ELSE
+%token KW_EXCEPT
+%token KW_FINALLY
+%token KW_FOR
+%token KW_FROM
+%token KW_GLOBAL
+%token KW_IF
+%token KW_IMPORT
+%token KW_IN
+%token KW_IS
+%token KW_LAMBDA
+%token KW_NONE
+%token KW_NONLOCAL
+%token KW_PASS
+%token KW_RAISE
+%token KW_RETURN
+%token KW_TRY
+%token KW_WHILE
+%token KW_WITH
+%token KW_YIELD
+
+// Nonterminal symbols
+
+%type expr
+
+// Precedence (lowest first; each later line binds tighter than the one before)
+
+//%left "or"
+//%left "and"
+//%precedence "not"
+//%left "==" "!=" "<" "<=" ">" ">="
+%left "-" "+"
+%left "*" "/"
+//%precedence UMINUS
+%right "**"
+
+%start expr
+
+%%
+
+// The order of rules does not matter in Bison (unlike in ANTLR). The
+// precedence is specified not by the order but by %left and %right directives
+// as well as with %dprec.
+
+// ----------------------------------------------------------------------------
+// Top level rules to be used for parsing.
+
+// Higher %dprec means higher precedence
+
+expr
+// ### primary
+    : TK_NAME          { $$ = SYMBOL($1, @$); }
+    | TK_INTEGER       { $$ = INTEGER($1, @$); }
+    | "(" expr ")"     { $$ = $2; }
+
+
+// ### level-2
+    | expr "+" expr    { $$ = ADD($1, $3, @$); }
+    | expr "-" expr    { $$ = SUB($1, $3, @$); }
+    | expr "*" expr    { $$ = MUL($1, $3, @$); }
+    | expr "/" expr    { $$ = DIV($1, $3, @$); }
+    | expr "**" expr   { $$ = POW($1, $3, @$); }
+
+    ;