diff --git a/CMakeLists.txt b/CMakeLists.txt index beaeb8576..c4283104f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,25 +82,41 @@ ac_check_headers("unistd.h") # windows POSIX-like API ac_check_headers("io.h") +# supported languages +set(re2c_langs "c" "d" "go" "haskell" "java" "js" "ocaml" "python" "rust" "v" "zig") + # docs (manpages and help) -set(re2c_manpage_source "${CMAKE_CURRENT_BINARY_DIR}/doc/manpage.rst") -set(re2c_help_source "${CMAKE_CURRENT_BINARY_DIR}/doc/help.rst") -set(re2c_manpage_bootstrap_c "${CMAKE_CURRENT_SOURCE_DIR}/bootstrap/doc/re2c.1") -set(re2c_manpage_bootstrap_go "${CMAKE_CURRENT_SOURCE_DIR}/bootstrap/doc/re2go.1") -set(re2c_manpage_bootstrap_rust "${CMAKE_CURRENT_SOURCE_DIR}/bootstrap/doc/re2rust.1") -set(re2c_help_bootstrap "${CMAKE_CURRENT_SOURCE_DIR}/bootstrap/src/msg/help.cc") -set(re2c_manpage_c "${CMAKE_CURRENT_BINARY_DIR}/doc/re2c.1") -set(re2c_manpage_go "${CMAKE_CURRENT_BINARY_DIR}/doc/re2go.1") -set(re2c_manpage_rust "${CMAKE_CURRENT_BINARY_DIR}/doc/re2rust.1") -set(re2c_help "${CMAKE_CURRENT_BINARY_DIR}/src/msg/help.cc") -set(re2c_rst2man "${CMAKE_CURRENT_SOURCE_DIR}/build/rst2man.py") -set(re2c_rst2txt "${CMAKE_CURRENT_SOURCE_DIR}/build/rst2txt.py") -set(re2c_splitman "${CMAKE_CURRENT_SOURCE_DIR}/build/split_man.py") +set(re2c_manpage_source "${CMAKE_CURRENT_BINARY_DIR}/doc/manpage.rst") +set(re2c_help_source "${CMAKE_CURRENT_BINARY_DIR}/doc/help.rst") +set(re2c_help_bootstrap "${CMAKE_CURRENT_SOURCE_DIR}/bootstrap/src/msg/help.cc") +set(re2c_manpage_c "${CMAKE_CURRENT_BINARY_DIR}/doc/re2c.1") +set(re2c_manpage_d "${CMAKE_CURRENT_BINARY_DIR}/doc/re2d.1") +set(re2c_manpage_go "${CMAKE_CURRENT_BINARY_DIR}/doc/re2go.1") +set(re2c_manpage_haskell "${CMAKE_CURRENT_BINARY_DIR}/doc/re2hs.1") +set(re2c_manpage_java "${CMAKE_CURRENT_BINARY_DIR}/doc/re2java.1") +set(re2c_manpage_js "${CMAKE_CURRENT_BINARY_DIR}/doc/re2js.1") +set(re2c_manpage_ocaml "${CMAKE_CURRENT_BINARY_DIR}/doc/re2ocaml.1") +set(re2c_manpage_python 
"${CMAKE_CURRENT_BINARY_DIR}/doc/re2py.1") +set(re2c_manpage_rust "${CMAKE_CURRENT_BINARY_DIR}/doc/re2rust.1") +set(re2c_manpage_v "${CMAKE_CURRENT_BINARY_DIR}/doc/re2v.1") +set(re2c_manpage_zig "${CMAKE_CURRENT_BINARY_DIR}/doc/re2zig.1") +set(re2c_help "${CMAKE_CURRENT_BINARY_DIR}/src/msg/help.cc") +set(re2c_rst2man "${CMAKE_CURRENT_SOURCE_DIR}/build/rst2man.py") +set(re2c_rst2txt "${CMAKE_CURRENT_SOURCE_DIR}/build/rst2txt.py") +set(re2c_splitman "${CMAKE_CURRENT_SOURCE_DIR}/build/split_man.py") set(re2c_docs "${re2c_help}" "${re2c_manpage_c}" + "$<$<BOOL:${RE2C_BUILD_RE2D}>:${re2c_manpage_d}>" "$<$<BOOL:${RE2C_BUILD_RE2GO}>:${re2c_manpage_go}>" + "$<$<BOOL:${RE2C_BUILD_RE2HS}>:${re2c_manpage_haskell}>" + "$<$<BOOL:${RE2C_BUILD_RE2JAVA}>:${re2c_manpage_java}>" + "$<$<BOOL:${RE2C_BUILD_RE2JS}>:${re2c_manpage_js}>" + "$<$<BOOL:${RE2C_BUILD_RE2OCAML}>:${re2c_manpage_ocaml}>" + "$<$<BOOL:${RE2C_BUILD_RE2PY}>:${re2c_manpage_python}>" "$<$<BOOL:${RE2C_BUILD_RE2RUST}>:${re2c_manpage_rust}>" + "$<$<BOOL:${RE2C_BUILD_RE2V}>:${re2c_manpage_v}>" + "$<$<BOOL:${RE2C_BUILD_RE2ZIG}>:${re2c_manpage_zig}>" ) # syntax files @@ -233,6 +249,28 @@ re2c_bootstrap_lexer("src/parse/conf_lexer.re" "src/parse/conf_lexer.cc") re2c_bootstrap_parser("src/parse/conf_parser.ypp" "src/parse/conf_parser.cc" "src/parse/conf_parser.h") +# docs +file(GLOB_RECURSE re2c_docs_sources CONFIGURE_DEPENDS + "examples/*" + "doc/manual/*" + "${re2c_manpage_source}" +) + +re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_c}") +re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_d}") +re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_go}") +re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_haskell}") +re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_java}") +re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_js}") +re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_ocaml}") +re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_python}") 
+re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_rust}") +re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_v}") +re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_zig}") + +re2c_gen_help("${re2c_help_source}" "${re2c_help}" "${re2c_help_bootstrap}") 
+add_custom_target(docs DEPENDS "${re2c_docs}") + re2c_bootstrap_syntax("include/syntax/c" "src/default_syntax_c.cc") re2c_bootstrap_syntax("include/syntax/d" "src/default_syntax_d.cc") re2c_bootstrap_syntax("include/syntax/go" "src/default_syntax_go.cc") @@ -338,123 +376,6 @@ if (RE2C_BUILD_RE2ZIG) ) endif() -# docs -set(re2c_docs_sources - "${re2c_manpage_source}" - "doc/manual/api/api1.rst_" - "doc/manual/api/api2_c.rst_" - "doc/manual/api/api2_go.rst_" - "doc/manual/api/api2_rust.rst_" - "doc/manual/api/api3.rst_" - "doc/manual/conditions/blocks.rst_" - "doc/manual/conditions/conditions.rst_" - "doc/manual/configurations/configurations.rst_" - "doc/manual/directives/directives.rst_" - "doc/manual/dot/dot.rst_" - "doc/manual/encodings/encodings.rst_" - "doc/manual/eof/01_sentinel.rst_" - "doc/manual/eof/02_bounds_checking.rst_" - "doc/manual/eof/03_eof_rule.rst_" - "doc/manual/eof/04_fake_sentinel.rst_" - "doc/manual/eof/eof.rst_" - "doc/manual/fill/01_fill.rst_" - "doc/manual/fill/02_fill.rst_" - "doc/manual/fill/fill.rst_" - "doc/manual/headers/headers.rst_" - "doc/manual/includes/includes.rst_" - "doc/manual/options/debug.rst_" - "doc/manual/options/internal.rst_" - "doc/manual/options/options.rst_" - "doc/manual/regexps/regular_expressions.rst_" - "doc/manual/reuse/reuse.rst_" - "doc/manual/skeleton/skeleton.rst_" - "doc/manual/state/state.rst_" - "doc/manual/submatch/submatch_example_mtags.rst_" - "doc/manual/submatch/submatch_example_captures.rst_" - "doc/manual/submatch/submatch_example_stags_fill.rst_" - "doc/manual/submatch/submatch_example_stags.rst_" - "doc/manual/submatch/submatch.rst_" - "doc/manual/synopsis.rst_" - "doc/manual/syntax/intro.rst_" - "doc/manual/syntax/syntax.rst_" - "doc/manual/warnings/warnings_general.rst_" - "doc/manual/warnings/warnings_list.rst_" - "examples/c/01_basic.re" - "examples/c/01_basic.c" - "examples/c/conditions/parse_u32_blocks.re" - "examples/c/conditions/parse_u32_conditions.re" - 
"examples/c/encodings/unicode_identifier.re" - "examples/c/eof/01_sentinel.re" - "examples/c/eof/02_bounds_checking.re" - "examples/c/eof/03_eof_rule.re" - "examples/c/eof/04_fake_sentinel.re" - "examples/c/fill/01_fill.re" - "examples/c/fill/02_fill.re" - "examples/c/headers/header.re" - "examples/c/headers/lexer/state.h" - "examples/c/includes/include.re" - "examples/c/includes/definitions.h" - "examples/c/reuse/reuse.re" - "examples/c/reuse/usedir.re" - "examples/c/state/push.re" - "examples/c/submatch/01_stags_fill.re" - "examples/c/submatch/01_stags.re" - "examples/c/submatch/02_mtags.re" - "examples/c/submatch/03_captures.re" - "examples/c/submatch/04_posix_captures.re" - "examples/go/01_basic.re" - "examples/go/01_basic.go" - "examples/go/conditions/parse_u32_blocks.re" - "examples/go/conditions/parse_u32_conditions.re" - "examples/go/encodings/unicode_identifier.re" - "examples/go/eof/01_sentinel.re" - "examples/go/eof/02_bounds_checking.re" - "examples/go/eof/03_eof_rule.re" - "examples/go/eof/04_fake_sentinel.re" - "examples/go/fill/01_fill.re" - "examples/go/fill/02_fill.re" - "examples/go/headers/header.re" - "examples/go/headers/lexer/state.go" - "examples/go/includes/include.re" - "examples/go/includes/definitions.go" - "examples/go/reuse/reuse.re" - "examples/go/reuse/usedir.re" - "examples/go/state/push.re" - "examples/go/submatch/01_stags_fill.re" - "examples/go/submatch/01_stags.re" - "examples/go/submatch/02_mtags.re" - "examples/go/submatch/03_captures.re" - "examples/go/submatch/04_posix_captures.re" - "examples/rust/01_basic.re" - "examples/rust/01_basic.rs" - "examples/rust/conditions/parse_u32_blocks.re" - "examples/rust/conditions/parse_u32_conditions.re" - "examples/rust/encodings/unicode_identifier.re" - "examples/rust/eof/01_sentinel.re" - "examples/rust/eof/02_bounds_checking.re" - "examples/rust/eof/03_eof_rule.re" - "examples/rust/eof/04_fake_sentinel.re" - "examples/rust/fill/01_fill.re" - "examples/rust/fill/02_fill.re" - 
"examples/rust/headers/header.re" - "examples/rust/headers/lexer/state.rs" - "examples/rust/includes/include.re" - "examples/rust/includes/definitions.rs" - "examples/rust/reuse/reuse.re" - "examples/rust/reuse/usedir.re" - "examples/rust/state/push.re" - "examples/rust/submatch/01_stags_fill.re" - "examples/rust/submatch/01_stags.re" - "examples/rust/submatch/02_mtags.re" - "examples/rust/submatch/03_captures.re" - "examples/rust/submatch/04_posix_captures.re" -) -re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_c}" "${re2c_manpage_bootstrap_c}" "c") -re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_go}" "${re2c_manpage_bootstrap_go}" "go") -re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_rust}" "${re2c_manpage_bootstrap_rust}" "rust") -re2c_gen_help("${re2c_help_source}" "${re2c_help}" "${re2c_help_bootstrap}") -add_custom_target(docs DEPENDS "${re2c_docs}") - # install targets are enabled only if re2c is the root project if(RE2C_IS_ROOT_PROJECT) # install @@ -462,6 +383,7 @@ if(RE2C_IS_ROOT_PROJECT) install(FILES "${re2c_manpage_c}" DESTINATION "share/man/man1") if(RE2C_BUILD_RE2D) install(TARGETS re2d RUNTIME DESTINATION bin) + install(FILES "${re2c_manpage_d}" DESTINATION "share/man/man1") endif() if(RE2C_BUILD_RE2GO) install(TARGETS re2go RUNTIME DESTINATION bin) @@ -469,12 +391,23 @@ if(RE2C_IS_ROOT_PROJECT) endif() if(RE2C_BUILD_RE2HS) install(TARGETS re2hs RUNTIME DESTINATION bin) + install(FILES "${re2c_manpage_haskell}" DESTINATION "share/man/man1") + endif() + if(RE2C_BUILD_RE2JAVA) + install(TARGETS re2java RUNTIME DESTINATION bin) + install(FILES "${re2c_manpage_java}" DESTINATION "share/man/man1") + endif() + if(RE2C_BUILD_RE2JS) + install(TARGETS re2js RUNTIME DESTINATION bin) + install(FILES "${re2c_manpage_js}" DESTINATION "share/man/man1") endif() if(RE2C_BUILD_RE2OCAML) install(TARGETS re2ocaml RUNTIME DESTINATION bin) + install(FILES "${re2c_manpage_ocaml}" DESTINATION "share/man/man1") endif() if(RE2C_BUILD_RE2PY) 
install(TARGETS re2py RUNTIME DESTINATION bin) + install(FILES "${re2c_manpage_python}" DESTINATION "share/man/man1") endif() if(RE2C_BUILD_RE2RUST) install(TARGETS re2rust RUNTIME DESTINATION bin) @@ -482,9 +415,11 @@ if(RE2C_IS_ROOT_PROJECT) endif() if(RE2C_BUILD_RE2V) install(TARGETS re2v RUNTIME DESTINATION bin) + install(FILES "${re2c_manpage_v}" DESTINATION "share/man/man1") endif() if(RE2C_BUILD_RE2ZIG) install(TARGETS re2zig RUNTIME DESTINATION bin) + install(FILES "${re2c_manpage_zig}" DESTINATION "share/man/man1") endif() install(FILES include/syntax/c diff --git a/Makefile.am b/Makefile.am index d7d5215ff..b76179714 100644 --- a/Makefile.am +++ b/Makefile.am @@ -164,14 +164,19 @@ nodist_re2c_SOURCES = $(re2c_GEN) BUILT_SOURCES = $(re2c_GEN_SRC) # bootstrap sources -re2c_BOOT_DOC_C = bootstrap/doc/re2c.1 -re2c_BOOT_DOC_GO = bootstrap/doc/re2go.1 -re2c_BOOT_DOC_RUST = bootstrap/doc/re2rust.1 re2c_BOOT_HELP = bootstrap/src/msg/help.cc re2c_BOOT = \ - $(re2c_BOOT_DOC_C) \ - $(re2c_BOOT_DOC_GO) \ - $(re2c_BOOT_DOC_RUST) \ + bootstrap/doc/re2c.1 \ + bootstrap/doc/re2d.1 \ + bootstrap/doc/re2go.1 \ + bootstrap/doc/re2hs.1 \ + bootstrap/doc/re2java.1 \ + bootstrap/doc/re2js.1 \ + bootstrap/doc/re2ocaml.1 \ + bootstrap/doc/re2py.1 \ + bootstrap/doc/re2rust.1 \ + bootstrap/doc/re2v.1 \ + bootstrap/doc/re2zig.1 \ $(re2c_BOOT_HELP) \ bootstrap/src/msg/ver_to_vernum.cc \ bootstrap/src/options/parse_opts.cc \ @@ -207,6 +212,8 @@ re2c_CUSTOM = \ # docs re2c_SRC_DOC = doc/manpage.rst +# To build the list of examples for all backends, run: +# find examples -regex '.*\.re\|.*\(state\|01_basic\|definitions\)\..*' -printf '\t%p \\\n' | sort re2c_SRC_DOC_EXT = \ doc/manual/api/api1.rst_ \ doc/manual/api/api2_c.rst_ \ @@ -246,8 +253,8 @@ re2c_SRC_DOC_EXT = \ doc/manual/syntax/syntax.rst_ \ doc/manual/warnings/warnings_general.rst_ \ doc/manual/warnings/warnings_list.rst_ \ - examples/c/01_basic.re \ examples/c/01_basic.c \ + examples/c/01_basic.re \ 
examples/c/conditions/parse_u32_blocks.re \ examples/c/conditions/parse_u32_conditions.re \ examples/c/encodings/unicode_identifier.re \ @@ -255,12 +262,16 @@ re2c_SRC_DOC_EXT = \ examples/c/eof/02_bounds_checking.re \ examples/c/eof/03_eof_rule.re \ examples/c/eof/04_fake_sentinel.re \ + examples/c/eof/05_fake_sentinel_eof_rule.re \ examples/c/fill/01_fill.re \ examples/c/fill/02_fill.re \ + examples/c/generic_api/ifstream.re \ examples/c/headers/header.re \ examples/c/headers/lexer/state.h \ - examples/c/includes/include.re \ examples/c/includes/definitions.h \ + examples/c/includes/include.re \ + examples/c/real_world/cxx98.re \ + examples/c/reuse/braille.re \ examples/c/reuse/reuse.re \ examples/c/reuse/usedir.re \ examples/c/state/push.re \ @@ -269,8 +280,36 @@ re2c_SRC_DOC_EXT = \ examples/c/submatch/02_mtags.re \ examples/c/submatch/03_captures.re \ examples/c/submatch/04_posix_captures.re \ - examples/go/01_basic.re \ + examples/c/submatch/http_rfc7230.re \ + examples/c/submatch/parse_etc_passwd.re \ + examples/c/submatch/parse_options.re \ + examples/c/submatch/parse_records.re \ + examples/c/submatch/uri_rfc3986.re \ + examples/d/01_basic.d \ + examples/d/01_basic.re \ + examples/d/conditions/parse_u32_blocks.re \ + examples/d/conditions/parse_u32_conditions.re \ + examples/d/encodings/unicode_identifier.re \ + examples/d/eof/01_sentinel.re \ + examples/d/eof/02_bounds_checking.re \ + examples/d/eof/03_eof_rule.re \ + examples/d/eof/04_fake_sentinel.re \ + examples/d/fill/01_fill.re \ + examples/d/fill/02_fill.re \ + examples/d/headers/header.re \ + examples/d/headers/lexer/state.d \ + examples/d/includes/definitions.d \ + examples/d/includes/include.re \ + examples/d/reuse/reuse.re \ + examples/d/reuse/usedir.re \ + examples/d/state/push.re \ + examples/d/submatch/01_stags_fill.re \ + examples/d/submatch/01_stags.re \ + examples/d/submatch/02_mtags.re \ + examples/d/submatch/03_captures.re \ + examples/d/submatch/04_posix_captures.re \ 
examples/go/01_basic.go \ + examples/go/01_basic.re \ examples/go/conditions/parse_u32_blocks.re \ examples/go/conditions/parse_u32_conditions.re \ examples/go/encodings/unicode_identifier.re \ @@ -282,8 +321,8 @@ re2c_SRC_DOC_EXT = \ examples/go/fill/02_fill.re \ examples/go/headers/header.re \ examples/go/headers/lexer/state.go \ - examples/go/includes/include.re \ examples/go/includes/definitions.go \ + examples/go/includes/include.re \ examples/go/reuse/reuse.re \ examples/go/reuse/usedir.re \ examples/go/state/push.re \ @@ -292,6 +331,120 @@ re2c_SRC_DOC_EXT = \ examples/go/submatch/02_mtags.re \ examples/go/submatch/03_captures.re \ examples/go/submatch/04_posix_captures.re \ + examples/haskell/01_basic.hs \ + examples/haskell/01_basic.re \ + examples/haskell/conditions/parse_u32_blocks.re \ + examples/haskell/conditions/parse_u32_conditions.re \ + examples/haskell/encodings/unicode_identifier.re \ + examples/haskell/eof/01_sentinel.re \ + examples/haskell/eof/02_bounds_checking.re \ + examples/haskell/eof/03_eof_rule.re \ + examples/haskell/eof/04_fake_sentinel.re \ + examples/haskell/fill/01_fill.re \ + examples/haskell/fill/02_fill.re \ + examples/haskell/headers/header.re \ + examples/haskell/headers/lexer/state.hs \ + examples/haskell/includes/definitions.hs \ + examples/haskell/includes/include.re \ + examples/haskell/reuse/reuse.re \ + examples/haskell/reuse/usedir.re \ + examples/haskell/state/push.re \ + examples/haskell/submatch/01_stags_fill.re \ + examples/haskell/submatch/01_stags.re \ + examples/haskell/submatch/02_mtags.re \ + examples/haskell/submatch/03_captures.re \ + examples/java/01_basic.java \ + examples/java/01_basic.re \ + examples/java/conditions/parse_u32_blocks.re \ + examples/java/conditions/parse_u32_conditions.re \ + examples/java/encodings/unicode_identifier.re \ + examples/java/eof/01_sentinel.re \ + examples/java/eof/02_bounds_checking.re \ + examples/java/eof/03_eof_rule.re \ + examples/java/eof/04_fake_sentinel.re \ + 
examples/java/fill/01_fill.re \ + examples/java/fill/02_fill.re \ + examples/java/headers/header.re \ + examples/java/headers/lexer/state.java \ + examples/java/includes/definitions.java \ + examples/java/includes/include.re \ + examples/java/reuse/reuse.re \ + examples/java/reuse/usedir.re \ + examples/java/state/push.re \ + examples/java/submatch/01_stags_fill.re \ + examples/java/submatch/01_stags.re \ + examples/java/submatch/02_mtags.re \ + examples/java/submatch/03_captures.re \ + examples/java/submatch/04_posix_captures.re \ + examples/js/01_basic.js \ + examples/js/01_basic.re \ + examples/js/conditions/parse_u32_blocks.re \ + examples/js/conditions/parse_u32_conditions.re \ + examples/js/encodings/unicode_identifier.re \ + examples/js/eof/01_sentinel.re \ + examples/js/eof/02_bounds_checking.re \ + examples/js/eof/03_eof_rule.re \ + examples/js/eof/04_fake_sentinel.re \ + examples/js/fill/01_fill.re \ + examples/js/fill/02_fill.re \ + examples/js/headers/header.re \ + examples/js/headers/lexer/state.js \ + examples/js/includes/definitions.js \ + examples/js/includes/include.re \ + examples/js/reuse/reuse.re \ + examples/js/reuse/usedir.re \ + examples/js/state/push.re \ + examples/js/submatch/01_stags_fill.re \ + examples/js/submatch/01_stags.re \ + examples/js/submatch/02_mtags.re \ + examples/js/submatch/03_captures.re \ + examples/js/submatch/04_posix_captures.re \ + examples/ocaml/01_basic.ml \ + examples/ocaml/01_basic.re \ + examples/ocaml/conditions/parse_u32_blocks.re \ + examples/ocaml/conditions/parse_u32_conditions.re \ + examples/ocaml/encodings/unicode_identifier.re \ + examples/ocaml/eof/01_sentinel.re \ + examples/ocaml/eof/02_bounds_checking.re \ + examples/ocaml/eof/03_eof_rule.re \ + examples/ocaml/eof/04_fake_sentinel.re \ + examples/ocaml/fill/01_fill.re \ + examples/ocaml/fill/02_fill.re \ + examples/ocaml/headers/header.re \ + examples/ocaml/headers/lexer/state.ml \ + examples/ocaml/includes/definitions.ml \ + 
examples/ocaml/includes/include.re \ + examples/ocaml/reuse/reuse.re \ + examples/ocaml/reuse/usedir.re \ + examples/ocaml/state/push.re \ + examples/ocaml/submatch/01_stags_fill.re \ + examples/ocaml/submatch/01_stags.re \ + examples/ocaml/submatch/02_mtags.re \ + examples/ocaml/submatch/03_captures.re \ + examples/ocaml/submatch/04_posix_captures.re \ + examples/python/01_basic.py \ + examples/python/01_basic.re \ + examples/python/conditions/parse_u32_blocks.re \ + examples/python/conditions/parse_u32_conditions.re \ + examples/python/encodings/unicode_identifier.re \ + examples/python/eof/01_sentinel.re \ + examples/python/eof/02_bounds_checking.re \ + examples/python/eof/03_eof_rule.re \ + examples/python/eof/04_fake_sentinel.re \ + examples/python/fill/01_fill.re \ + examples/python/fill/02_fill.re \ + examples/python/headers/header.re \ + examples/python/headers/lexer/state.py \ + examples/python/includes/definitions.py \ + examples/python/includes/include.re \ + examples/python/reuse/reuse.re \ + examples/python/reuse/usedir.re \ + examples/python/state/push.re \ + examples/python/submatch/01_stags_fill.re \ + examples/python/submatch/01_stags.re \ + examples/python/submatch/02_mtags.re \ + examples/python/submatch/03_captures.re \ + examples/python/submatch/04_posix_captures.re \ examples/rust/01_basic.re \ examples/rust/01_basic.rs \ examples/rust/conditions/parse_u32_blocks.re \ @@ -305,8 +458,9 @@ re2c_SRC_DOC_EXT = \ examples/rust/fill/02_fill.re \ examples/rust/headers/header.re \ examples/rust/headers/lexer/state.rs \ - examples/rust/includes/include.re \ examples/rust/includes/definitions.rs \ + examples/rust/includes/include.re \ + examples/rust/real_world/c.re \ examples/rust/reuse/reuse.re \ examples/rust/reuse/usedir.re \ examples/rust/state/push.re \ @@ -314,19 +468,84 @@ re2c_SRC_DOC_EXT = \ examples/rust/submatch/01_stags.re \ examples/rust/submatch/02_mtags.re \ examples/rust/submatch/03_captures.re \ - 
examples/rust/submatch/04_posix_captures.re - -DOC_C = doc/re2c.1 -DOCS = $(DOC_C) - + examples/rust/submatch/04_posix_captures.re \ + examples/v/01_basic.re \ + examples/v/01_basic.v \ + examples/v/conditions/parse_u32_blocks.re \ + examples/v/conditions/parse_u32_conditions.re \ + examples/v/encodings/unicode_identifier.re \ + examples/v/eof/01_sentinel.re \ + examples/v/eof/02_bounds_checking.re \ + examples/v/eof/03_eof_rule.re \ + examples/v/eof/04_fake_sentinel.re \ + examples/v/fill/01_fill.re \ + examples/v/fill/02_fill.re \ + examples/v/headers/header.re \ + examples/v/headers/lexer/state.v \ + examples/v/includes/definitions.v \ + examples/v/includes/include.re \ + examples/v/reuse/reuse.re \ + examples/v/reuse/usedir.re \ + examples/v/state/push.re \ + examples/v/submatch/01_stags_fill.re \ + examples/v/submatch/01_stags.re \ + examples/v/submatch/02_mtags.re \ + examples/v/submatch/03_captures.re \ + examples/v/submatch/04_posix_captures.re \ + examples/zig/01_basic.re \ + examples/zig/01_basic.zig \ + examples/zig/conditions/parse_u32_blocks.re \ + examples/zig/conditions/parse_u32_conditions.re \ + examples/zig/encodings/unicode_identifier.re \ + examples/zig/eof/01_sentinel.re \ + examples/zig/eof/02_bounds_checking.re \ + examples/zig/eof/03_eof_rule.re \ + examples/zig/eof/04_fake_sentinel.re \ + examples/zig/fill/01_fill.re \ + examples/zig/fill/02_fill.re \ + examples/zig/headers/header.re \ + examples/zig/headers/lexer/state.zig \ + examples/zig/includes/definitions.zig \ + examples/zig/includes/include.re \ + examples/zig/reuse/reuse.re \ + examples/zig/reuse/usedir.re \ + examples/zig/state/push.re \ + examples/zig/submatch/01_stags_fill.re \ + examples/zig/submatch/01_stags.re \ + examples/zig/submatch/02_mtags.re \ + examples/zig/submatch/03_captures.re \ + examples/zig/submatch/04_posix_captures.re + +DOCS = doc/re2c.1 +if WITH_DLANG +DOCS += doc/re2d.1 +endif if WITH_GOLANG -DOC_GO = doc/re2go.1 -DOCS += $(DOC_GO) +DOCS += doc/re2go.1 
+endif +if WITH_HASKELL +DOCS += doc/re2hs.1 +endif +if WITH_JAVA +DOCS += doc/re2java.1 +endif +if WITH_JS +DOCS += doc/re2js.1 +endif +if WITH_OCAML +DOCS += doc/re2ocaml.1 +endif +if WITH_PYTHON +DOCS += doc/re2py.1 endif - if WITH_RUST -DOC_RUST = doc/re2rust.1 -DOCS += $(DOC_RUST) +DOCS += doc/re2rust.1 +endif +if WITH_VLANG +DOCS += doc/re2v.1 +endif +if WITH_ZIG +DOCS += doc/re2zig.1 endif man_MANS = $(DOCS) @@ -438,27 +657,13 @@ docs: $(DOCS) $(re2c_GEN_HELP) RST2TXT = $(top_srcdir)/build/rst2txt.py RST2MAN = $(top_srcdir)/build/rst2man.py SPLITMAN = $(top_srcdir)/build/split_man.py -# generate manpage for C -$(DOC_C): $(re2c_SRC_DOC) $(re2c_SRC_DOC_EXT) $(SPLITMAN) $(RST2MAN) +# generate manpage +doc/re2%.1: $(re2c_SRC_DOC) $(re2c_SRC_DOC_EXT) $(SPLITMAN) $(RST2MAN) $(AM_V_at)$(MKDIR_P) $(@D) - $(AM_V_GEN)$(PYTHON) $(SPLITMAN) $(top_builddir)/$(re2c_SRC_DOC) $(top_builddir)/$(re2c_SRC_DOC).c c \ - && $(PYTHON) $(RST2MAN) --tab-width=4 $(top_builddir)/$(re2c_SRC_DOC).c > $@ \ - && cp $@ $(top_srcdir)/$(re2c_BOOT_DOC_C) \ - && rm $(top_builddir)/$(re2c_SRC_DOC).c -# generate manpage for Go -$(DOC_GO): $(re2c_SRC_DOC) $(re2c_SRC_DOC_EXT) $(SPLITMAN) $(RST2MAN) - $(AM_V_at)$(MKDIR_P) $(@D) - $(AM_V_GEN)$(PYTHON) $(SPLITMAN) $(top_builddir)/$(re2c_SRC_DOC) $(top_builddir)/$(re2c_SRC_DOC).go go \ - && $(PYTHON) $(RST2MAN) --tab-width=4 $(top_builddir)/$(re2c_SRC_DOC).go > $@ \ - && cp $@ $(top_srcdir)/$(re2c_BOOT_DOC_GO) \ - && rm $(top_builddir)/$(re2c_SRC_DOC).go -# generate manpage for Rust -$(DOC_RUST): $(re2c_SRC_DOC) $(re2c_SRC_DOC_EXT) $(SPLITMAN) $(RST2MAN) - $(AM_V_at)$(MKDIR_P) $(@D) - $(AM_V_GEN)$(PYTHON) $(SPLITMAN) $(top_builddir)/$(re2c_SRC_DOC) $(top_builddir)/$(re2c_SRC_DOC).rust rust \ - && $(PYTHON) $(RST2MAN) --tab-width=4 $(top_builddir)/$(re2c_SRC_DOC).rust > $@ \ - && cp $@ $(top_srcdir)/$(re2c_BOOT_DOC_RUST) \ - && rm $(top_builddir)/$(re2c_SRC_DOC).rust + $(AM_V_GEN)$(PYTHON) $(SPLITMAN) $(top_builddir)/$(re2c_SRC_DOC) 
$(top_builddir)/$@.rst \ + && $(PYTHON) $(RST2MAN) --tab-width=4 $(top_builddir)/$@.rst > $@ \ + && cp $@ $(top_srcdir)/bootstrap/$@ \ + && rm $(top_builddir)/$@.rst # generate help $(re2c_GEN_HELP): $(re2c_CUSTOM_HELP) $(re2c_SRC_DOC_EXT) $(RST2TXT) $(AM_V_at)$(MKDIR_P) $(@D) @@ -467,18 +672,10 @@ $(re2c_GEN_HELP): $(re2c_CUSTOM_HELP) $(re2c_SRC_DOC_EXT) $(RST2TXT) else docs: $(DOCS) $(re2c_GEN_HELP) $(AM_V_at)echo "Reconfigure with --enable-docs to rebuild docs" -# copy bootstrap manpage for C -$(DOC_C): $(re2c_BOOT_DOC_C) - $(AM_V_at)$(MKDIR_P) $(@D) - $(AM_V_GEN)cp $(top_srcdir)/$(re2c_BOOT_DOC_C) $@ -# copy bootstrap manpage for Go -$(DOC_GO): $(re2c_BOOT_DOC_GO) - $(AM_V_at)$(MKDIR_P) $(@D) - $(AM_V_GEN)cp $(top_srcdir)/$(re2c_BOOT_DOC_GO) $@ -# copy bootstrap manpage for Rust -$(DOC_RUST): $(re2c_BOOT_DOC_RUST) +# copy bootstrap manpage +doc/re2%.1: bootstrap/doc/re2%.1 $(AM_V_at)$(MKDIR_P) $(@D) - $(AM_V_GEN)cp $(top_srcdir)/$(re2c_BOOT_DOC_RUST) $@ + $(AM_V_GEN)cp $(top_srcdir)/bootstrap/$@ $@ # copy bootstrap help $(re2c_GEN_HELP): $(re2c_BOOT_HELP) $(AM_V_at)$(MKDIR_P) $(@D) diff --git a/bootstrap/doc/re2c.1 b/bootstrap/doc/re2c.1 index b765b8ac8..bc8abfe92 100644 --- a/bootstrap/doc/re2c.1 +++ b/bootstrap/doc/re2c.1 @@ -242,8 +242,8 @@ should be defined as pointers of type \fBYYCTYPE*\fP\&. .B \fBRecord API\fP (\fIadded in version 4.0\fP) Record API is useful in cases when lexer state must be stored in a struct. -It is enabled with option \fB\-\-api record\fP or configuration -\fBre2c:api = record\fP\&. This API consists of a variable \fByyrecord\fP (the +It is enabled with \fB\-\-api record\fP option or \fBre2c:api = record\fP +configuration. 
This API consists of a variable \fByyrecord\fP (the name can be overridden with \fBre2c:variable:yyrecord\fP) that should be defined as a struct with fields \fByycursor\fP, \fByymarker\fP, \fByyctxmarker\fP, \fByylimit\fP (only the fields used by the generated code need to be defined, @@ -255,8 +255,8 @@ and their names can be configured). .TP .B \fBGeneric API\fP (\fIadded in version 0.14\fP) -This is the most flexible API provided by re2c. It is enabled with -\fB\-\-api generic\fP option or \fBre2c:api = generic\fP configuration. +This is the most flexible API. It is enabled with \fB\-\-api generic\fP option +or \fBre2c:api = generic\fP configuration. This API contains primitives for generic operations: \fBYYPEEK\fP, \fBYYSKIP\fP, @@ -2795,53 +2795,64 @@ int main() { .SH SUBMATCH EXTRACTION .sp re2c has two options for submatch extraction. -.sp -The first option is \fB\-T \-\-tags\fP\&. With this option one can use standalone tags -of the form \fB@stag\fP and \fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary -used\-defined names. Tags can be used anywhere inside of a regular expression; -semantically they are just position markers. Tags of the form \fB@stag\fP are -called s\-tags: they denote a single submatch value (the last input position -where this tag matched). Tags of the form \fB#mtag\fP are called m\-tags: they -denote multiple submatch values (the whole history of repetitions of this tag). -All tags should be defined by the user as variables with the corresponding -names. With standalone tags re2c uses leftmost greedy disambiguation: submatch -positions correspond to the leftmost matching path through the regular -expression. -.sp -The second option is \fB\-P \-\-posix\-captures\fP: it enables POSIX\-compliant -capturing groups. In this mode parentheses in regular expressions denote the -beginning and the end of capturing groups; the whole regular expression is group -number zero. 
The number of groups for the matching rule is stored in a variable -\fByynmatch\fP, and submatch results are stored in \fByypmatch\fP array. Both -\fByynmatch\fP and \fByypmatch\fP should be defined by the user, and \fByypmatch\fP -size must be at least \fB[yynmatch * 2]\fP\&. re2c provides a directive -\fB/*!maxnmatch:re2c*/\fP that defines \fBYYMAXNMATCH\fP: a constant equal to the -maximal value of \fByynmatch\fP among all rules. Note that re2c implements -POSIX\-compliant disambiguation: each subexpression matches as long as possible, -and subexpressions that start earlier in regular expression have priority over -those starting later. Capturing groups are translated into s\-tags under the -hood, therefore we use the word \(dqtag\(dq to describe them as well. -.sp -With both \fB\-P \-\-posix\-captures\fP and \fBT \-\-tags\fP options re2c uses efficient -submatch extraction algorithm described in the -\fI\%Tagged Deterministic Finite Automata with Lookahead\fP -paper. The overhead on submatch extraction in the generated lexer grows with the -number of tags \-\-\- if this number is moderate, the overhead is barely -noticeable. In the lexer tags are implemented using a number of tag variables -generated by re2c. There is no one\-to\-one correspondence between tag variables -and tags: a single variable may be reused for different tags, and one tag may -require multiple variables to hold all its ambiguous values. Eventually -ambiguity is resolved, and only one final variable per tag survives. When a rule -matches, all its tags are set to the values of the corresponding tag variables. -The exact number of tag variables is unknown to the user; this number is -determined by re2c. However, tag variables should be defined by the user as a -part of the lexer state and updated by \fBYYFILL\fP, therefore re2c provides -directives \fB/*!stags:re2c*/\fP and \fB/*!mtags:re2c*/\fP that can be used to -declare, initialize and manipulate tag variables. 
These directives have two -optional configurations: \fBformat = \(dq@@\(dq;\fP (specifies the template where \fB@@\fP -is substituted with the name of each tag variable), and \fBseparator = \(dq\(dq;\fP -(specifies the piece of code used to join the generated pieces for different -tag variables). +.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration. Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string. +\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars:re2c\fP and \fBmvars:re2c\fP directives. +If there is more than one way tags can be matched against the input, +ambiguity is resolved using leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration. There are two +flavours for different disambiguation policies, \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy. In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted. 
+The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&. re2c provides a directive \fBmaxnmatch:re2c\fP +that defines \fBYYMAXNMATCH\fP, a constant equal to the maximum value of +\fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration. The only difference with \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2c generates variables \fByytl<k>\fP and +\fByytr<k>\fP for \fIk\fP\-th capturing group (the user should declare these with +\fBsvars:re2c\fP directive). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars = 1\fP for POSIX longest\-match policy. +.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there\(aqre no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags:re2c\fP or \fBmtags:re2c\fP directives. 
+If the lexer state is stored, tag variables should be part of it. They also +need to be updated by \fBYYFILL\fP\&. .sp S\-tags support the following operations: .INDENT 0.0 @@ -3069,7 +3080,7 @@ int main() { .UNINDENT .UNINDENT .sp -Here is an example of using POSIX capturing groups to parse semantic versions. +Here is an example of using capturing groups to parse semantic versions. .INDENT 0.0 .INDENT 3.5 .sp @@ -3079,9 +3090,6 @@ Here is an example of using POSIX capturing groups to parse semantic versions. #include #include -// Maximum number of capturing groups among all rules. -/*!maxnmatch:re2c*/ - struct SemVer { int major, minor, patch; }; static int s2n(const char *s, const char *e) { // pre\-parsed string to number @@ -3093,9 +3101,8 @@ static int s2n(const char *s, const char *e) { // pre\-parsed string to number static bool lex(const char *str, SemVer &ver) { const char *YYCURSOR = str, *YYMARKER; - // Allocate memory for capturing parentheses (twice the number of groups). - const char *yypmatch[YYMAXNMATCH * 2]; - size_t yynmatch; + // Final tag variables available in semantic action. + /*!svars:re2c format = \(aqconst char *@@;\en\(aq; */ // Intermediate tag variables used by the lexer (must be autogenerated). /*!stags:re2c format = \(aqconst char *@@;\en\(aq; */ @@ -3103,18 +3110,15 @@ static bool lex(const char *str, SemVer &ver) { /*!re2c re2c:yyfill:enable = 0; re2c:define:YYCTYPE = char; - re2c:posix\-captures = 1; + re2c:captvars = 1; num = [0\-9]+; (num) \(dq.\(dq (num) (\(dq.\(dq num)? [\ex00] { - // \(gayynmatch\(ga is the number of capturing groups - assert(yynmatch == 4); - // Even \(gayypmatch\(ga values are for opening parentheses, odd values - // are for closing parentheses, the first group is the whole match. - ver.major = s2n(yypmatch[2], yypmatch[3]); - ver.minor = s2n(yypmatch[4], yypmatch[5]); - ver.patch = yypmatch[6] ? 
s2n(yypmatch[6] + 1, yypmatch[7]) : 0; + (void) yytl0; (void) yytr0; // some variables are unused + ver.major = s2n(yytl1, yytr1); + ver.minor = s2n(yytl2, yytr2); + ver.patch = yytl3 ? s2n(yytl3 + 1, yytr3) : 0; return true; } * { return false; } diff --git a/bootstrap/doc/re2d.1 b/bootstrap/doc/re2d.1 new file mode 100644 index 000000000..55caf63f5 --- /dev/null +++ b/bootstrap/doc/re2d.1 @@ -0,0 +1,3449 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "RE2C" 1 "" "" +.SH NAME +re2c \- generate fast lexical analyzers for C/C++, Go and Rust +.SH SYNOPSIS +.sp +Note: This manual is for D, but it refers to re2c as the general program. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +re2c [ OPTIONS ] [ WARNINGS ] INPUT +re2go [ OPTIONS ] [ WARNINGS ] INPUT +re2rust [ OPTIONS ] [ WARNINGS ] INPUT +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Input can be either a file or \fB\-\fP for stdin. +.SH INTRODUCTION +.sp +re2c works as a preprocessor. It reads the input file (which is usually a +program in the target language, but can be anything) and looks for blocks of +code enclosed in special\-form comments. The text outside of these blocks is +copied verbatim into the output file. The contents of the blocks are processed +by re2c. 
It translates them to code in the target language and outputs the +generated code in place of the block. +.sp +Here is an example of a small program that checks if a given string contains a +decimal number: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT \-i +module main; + +private bool lex(const(char)* yycursor) { + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + + number = [1\-9][0\-9]*; + number { return true; } + * { return false; } + */ +} + +void main() { + assert(lex(\(dq1234\(dq)); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +In the output everything between \fB/*!re2c\fP and \fB*/\fP has been replaced with +the generated code: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +/* Generated by re2d */ +// re2d $INPUT \-o $OUTPUT \-i +module main; + +private bool lex(const(char)* yycursor) { + +{ + char yych; + yych = *yycursor; + switch (yych) { + case \(aq1\(aq: .. case \(aq9\(aq: goto yy2; + default: goto yy1; + } +yy1: + ++yycursor; + { return false; } +yy2: + yych = *++yycursor; + switch (yych) { + case \(aq0\(aq: .. case \(aq9\(aq: goto yy2; + default: goto yy3; + } +yy3: + { return true; } +} + +} + +void main() { + assert(lex(\(dq1234\(dq)); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SYNTAX +.sp +A re2c program consists of a sequence of \fIblocks\fP intermixed with code in the +target language. There are three main kinds of blocks: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A \fIglobal block\fP contains definitions, configurations, directives and rules. +re2c compiles regular expressions associated with each rule into a +deterministic finite automaton, encodes it in the form of conditional jumps +in the target language and replaces the block with the generated code. Names +and configurations defined in a global block are added to the global scope +and become visible to subsequent blocks. At the start of the program the +global scope is initialized with command\-line \fI\%options\fP\&. 
+The \fB:\fP part is optional: if specified, the name can be used to +refer to the block in another part of the program. +.TP +.B \fB/*!local:re2c[:] ... */\fP +A \fIlocal block\fP is like a global block, but the names and configurations in +it have local scope (they do not affect other blocks). +.TP +.B \fB/*!rules:re2c[:] ... */\fP +A \fIrules block\fP is like a local block, but it does not generate any code and +is meant to be reused in other blocks. This is a way of sharing code +(more details in the \fI\%reusable blocks\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.sp +There are also many auxiliary blocks; see section \fI\%blocks and directives\fP for a +full list of them. A block may contain the following kinds of statements: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB = ;\fP +A \fIdefinition\fP binds a name to a regular expression. Names may contain +alphanumeric characters and underscore. The \fI\%regular expressions\fP section +gives an overview of re2c syntax for regular expressions. Once defined, the +name can be used in other regular expressions and in rules. Recursion in +named definitions is not allowed, and each name should be defined before it +is used. A block inherits named definitions from the global scope. +Redefining a name that exists in the current scope is an error. +.TP +.B \fB = ;\fP +A \fIconfiguration\fP allows one to change re2c behavior and customize the +generated code. For a full list of configurations supported by re2c see the +\fI\%configurations\fP section. Depending on a particular configuration, the +value can be a keyword, a nonnegative integer number or a one\-line string +which should be enclosed in double or single quotes unless it consists of +alphanumeric characters. A block inherits configurations from the global +scope and may redefine them or add new ones. Configurations defined inside +of a block affect the whole block, even if they appear at the end of it. 
+.TP +.B \fB { }\fP +A \fIrule\fP binds a regular expression to a semantic action (a block of code in +the target language). If the regular expression matches, the associated +semantic action is executed. If multiple rules match, the longest match +takes precedence. If multiple rules match the same string, the earliest one +takes precedence. There are two special rules: the default rule \fB*\fP and +the end of input rule \fB$\fP\&. The default rule should always be defined, it +has the lowest priority regardless of its place in the block, and it matches +any code unit (not necessarily a valid character, see the +\fI\%encoding support\fP section). The end of input rule should be defined if the +corresponding method for \fI\%handling the end of input\fP is used. If +\fI\%start conditions\fP are used, rules have more complex syntax. +.TP +.B \fB!;\fP +A \fIdirective\fP is one of the special predefined statements. Each directive +has a unique purpose. For example, the \fB!use\fP directive merges a rules +block into the current one (see the \fI\%reusable blocks\fP section), and the +\fB!include\fP directive allows one to include an outer file (see the +\fI\%include files\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.SH PROGRAM INTERFACE (API) +.sp +The generated code interfaces with the outer program with the help of +\fIprimitives\fP, collectively referred to as the \fIAPI\fP\&. +Which primitives should be defined for a particular program depends on multiple +factors, including the complexity of regular expressions, input representation, +buffering and the use of various features. All the necessary primitives should +be defined by the user in the form of macros, functions, variables or any other +suitable form that makes the generated code syntactically and semantically +correct. re2c does not (and cannot) check the definitions, so if anything is +missing or defined incorrectly, the generated program may have compile\-time or +run\-time errors. 
+This manual provides examples of API definitions in the most common cases. +.sp +re2d has three API flavors that define the core set of primitives used by a +program: +.INDENT 0.0 +.TP +.B \fBSimple API\fP +This is the default API for D backend. It consists of primitives +\fBYYCURSOR\fP, \fBYYMARKER\fP, \fBYYCTXMARKER\fP and \fBYYLIMIT\fP, which +should be defined as pointers of type \fBYYCTYPE*\fP\&. +.nf + +.fi +.sp +.TP +.B \fBRecord API\fP +Record API is useful in cases when lexer state must be stored in a struct. +It is enabled with \fB\-\-api record\fP option or \fBre2c:api = record\fP +configuration. This API consists of a variable \fByyrecord\fP (the +name can be overridden with \fBre2c:variable:yyrecord\fP) that should be +defined as a struct with fields \fByycursor\fP, \fByymarker\fP, \fByyctxmarker\fP, +\fByylimit\fP (only the fields used by the generated code need to be defined, +and their names can be configured). +.nf + +.fi +.sp +.TP +.B \fBGeneric API\fP +This is the most flexible API. It is enabled with \fB\-\-api generic\fP option +or \fBre2c:api = generic\fP configuration. +It contains primitives for generic operations: +\fBYYPEEK\fP, +\fBYYSKIP\fP, +\fBYYBACKUP\fP, +\fBYYBACKUPCTX\fP, +\fBYYSTAGP\fP, +\fBYYSTAGN\fP, +\fBYYMTAGP\fP, +\fBYYMTAGN\fP, +\fBYYRESTORE\fP, +\fBYYRESTORECTX\fP, +\fBYYRESTORETAG\fP, +\fBYYSHIFT\fP, +\fBYYSHIFTSTAG\fP, +\fBYYSHIFTMTAG\fP, +\fBYYLESSTHAN\fP\&. +.UNINDENT +.sp +Here is a full list of API primitives that may be used by the generated code in +order to interface with the outer program. +.INDENT 0.0 +.TP +.B \fBYYCTYPE\fP +The type of the input characters (code units). +For ASCII, EBCDIC and UTF\-8 encodings it should be 1\-byte unsigned integer. +For UTF\-16 or UCS\-2 it should be 2\-byte unsigned integer. For UTF\-32 it +should be 4\-byte unsigned integer. +.TP +.B \fBYYCURSOR\fP +A pointer\-like l\-value that stores the current input position (usually a +pointer of type \fBYYCTYPE*\fP). 
Initially \fBYYCURSOR\fP should point to the +first input character. It is advanced by the generated code. +When a rule matches, \fBYYCURSOR\fP points to the position after the +last matched character. It is used only in C pointer API. +.TP +.B \fBYYLIMIT\fP +A pointer\-like r\-value that stores the end of input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYLIMIT\fP should point to the +position after the last available input character. It is not changed by the +generated code. The lexer compares \fBYYCURSOR\fP to \fBYYLIMIT\fP +in order to determine if there are enough input characters left. +\fBYYLIMIT\fP is used only in C pointer API. +.TP +.B \fBYYMARKER\fP +A pointer\-like l\-value (usually a pointer of type \fBYYCTYPE*\fP) +that stores the position of the latest matched rule. It is used to +restore the \fBYYCURSOR\fP position if the longer match fails and +the lexer needs to rollback. Initialization is not +needed. \fBYYMARKER\fP is used only in C pointer API. +.TP +.B \fBYYCTXMARKER\fP +A pointer\-like l\-value that stores the position of the trailing context +(usually a pointer of type \fBYYCTYPE*\fP). No initialization is needed. +It is used only in C pointer API, and only with the lookahead operator +\fB/\fP\&. +.TP +.B \fBYYFILL\fP +A generic API primitive with one argument \fBlen\fP\&. +\fBYYFILL\fP should provide at least \fBlen\fP more input characters or fail. +If \fBre2c:eof\fP is used, then \fBlen\fP is always \fB1\fP and \fBYYFILL\fP should +always return to the calling function; zero return value indicates success. +If \fBre2c:eof\fP is not used, then \fBYYFILL\fP return value is ignored and it +should not return on failure. The maximum value of \fBlen\fP is \fBYYMAXFILL\fP\&. +The definition of \fBYYFILL\fP can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYFILL:naked\fP). 
+.TP +.B \fBYYMAXFILL\fP +An integral constant equal to the maximum value of the argument to +\fBYYFILL\fP\&. It can be generated with \fB/*!max:re2c*/\fP directive. +.TP +.B \fBYYLESSTHAN\fP +A generic API primitive with one argument \fBlen\fP\&. +It should be defined as an r\-value of boolean type that equals \fBtrue\fP if +and only if there are less than \fBlen\fP input characters left. +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYPEEK\fP +A generic API primitive with no arguments. +It should be defined as an r\-value of type \fBYYCTYPE\fP that is equal to the +character at the current input position. The definition can be either +function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP). +.TP +.B \fBYYSKIP\fP +A generic API primitive with no arguments. +\fBYYSKIP\fP should advance the current input position by one +character. The definition can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUP\fP +A generic API primitive with no arguments. +\fBYYBACKUP\fP should save the current input position, which is +later restored with \fBYYRESTORE\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORE\fP +A generic API primitive with no arguments. +\fBYYRESTORE\fP should restore the current input position to the +value saved by \fBYYBACKUP\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUPCTX\fP +A generic API primitive with zero arguments. +\fBYYBACKUPCTX\fP should save the current input position as the +position of the trailing context, which is later restored by +\fBYYRESTORECTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). 
+.TP +.B \fBYYRESTORECTX\fP +A generic API primitive with no arguments. +\fBYYRESTORECTX\fP should restore the trailing context position +saved with \fBYYBACKUPCTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORETAG\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYRESTORETAG\fP should restore the trailing context position +to the value of \fBtag\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSTAGP\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGP\fP should set \fBtag\fP to the current input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSTAGN\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGN\fP should set \fBtag\fP to a value that represents a non\-existent +input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGP\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGP\fP should append the current position to the submatch history of +\fBtag\fP (see the submatch extraction section for details). +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGN\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGN\fP should append a value that represents a non\-existent input +position to the submatch history of \fBtag\fP (see the submatch +extraction section for details).
+The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFT\fP +A generic API primitive with one argument \fBshift\fP\&. +\fBYYSHIFT\fP should shift the current input position by +\fBshift\fP characters (the shift value may be negative). The definition +can be either function\-like or free\-form depending on the API style +(see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTSTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTSTAG\fP should shift \fBtag\fP by \fBshift\fP characters +(the shift value may be negative). +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTMTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTMTAG\fP should shift the latest value in the history +of \fBtag\fP by \fBshift\fP characters (the shift value may be negative). +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMAXNMATCH\fP +An integral constant equal to the maximal number of POSIX capturing groups +in a rule. It is generated with \fB/*!maxnmatch:re2c*/\fP directive. +.TP +.B \fBYYCONDTYPE\fP +The type of the condition enum. +It should be generated either with the \fB/*!types:re2c*/\fP +directive or the \fB\-t\fP \fB\-\-type\-header\fP option. +.TP +.B \fBYYGETCONDITION\fP +An API primitive with zero arguments. +It should be defined as an r\-value of type \fBYYCONDTYPE\fP that is equal to +the current condition identifier. The definition can be either function\-like +or free\-form depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYGETCONDITION:naked\fP). +.TP +.B \fBYYSETCONDITION\fP +An API primitive with one argument \fBcond\fP\&. +The meaning of \fBYYSETCONDITION\fP is to set the current condition +identifier to \fBcond\fP\&. 
+The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETCONDITION@cond\fP). +.TP +.B \fBYYGETSTATE\fP +An API primitive with zero arguments. +It should be defined as an r\-value of integer type that is equal to the +current lexer state. Should be initialized to \fB\-1\fP\&. The definition can be +either function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP and \fBre2c:define:YYGETSTATE:naked\fP). +.TP +.B \fBYYSETSTATE\fP +An API primitive with one argument \fBstate\fP\&. +The meaning of \fBYYSETSTATE\fP is to set the current lexer state to +\fBstate\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETSTATE@state\fP). +.TP +.B \fBYYDEBUG\fP +A debug API primitive with two arguments. It can be used to debug the +generated code (with \fB\-d\fP \fB\-\-debug\-output\fP option). \fBYYDEBUG\fP should +return no value and accept two arguments: \fBstate\fP (either a DFA state +index or \fB\-1\fP) and \fBsymbol\fP (the current input symbol). +.TP +.B \fByych\fP +An l\-value of type \fBYYCTYPE\fP that stores the current input character. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByyaccept\fP +An l\-value of unsigned integral type that stores the number of the latest +matched rule. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByynmatch\fP +An l\-value of unsigned integral type that stores the number of POSIX +capturing groups in the matched rule. +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.TP +.B \fByypmatch\fP +An array of l\-values that are used to hold the tag values corresponding +to the capturing parentheses in the matching rule. Array length must be +at least \fByynmatch * 2\fP (usually \fBYYMAXNMATCH * 2\fP is a good choice). 
+Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.UNINDENT +.SH OPTIONS +.sp +Some of the options have corresponding \fI\%configurations\fP, +others are global and cannot be changed after re2c starts reading the input file. +Debug options generally require building re2c in debug configuration. +Internal options are useful for experimenting with the algorithms used in re2c. +.INDENT 0.0 +.TP +.B \fB\-? \-\-help \-h\fP +Show help message. +.TP +.B \fB\-\-api \-\-input <default | custom>\fP +Specify the API used by the generated code to interface with user\-defined +code: \fBdefault\fP is the API based on pointer arithmetic (the default for +C), and \fBcustom\fP is the generic API (the default for Go and Rust). +.TP +.B \fB\-\-bit\-vectors \-b\fP +Optimize conditional jumps using bit masks. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-case\-insensitive\fP +Treat single\-quoted and double\-quoted strings as case\-insensitive. +.TP +.B \fB\-\-case\-inverted\fP +Invert the meaning of single\-quoted and double\-quoted strings: +treat single\-quoted strings as case\-sensitive and double\-quoted strings +as case\-insensitive. +.TP +.B \fB\-\-case\-ranges\fP +Collapse consecutive cases in a switch statement into a range of the form +\fBlow ... high\fP\&. This syntax is a C/C++ language extension that is +supported by compilers like GCC, Clang and Tcc. The main advantage over +using single cases is smaller generated code and faster generation time, +although for some compilers like Tcc it also results in smaller binary size. +This option is supported only for C. +.TP +.B \fB\-\-computed\-gotos \-g\fP +Optimize conditional jumps using non\-standard \(dqcomputed goto\(dq extension +(which must be supported by the compiler). re2c generates jump tables +only in complex cases with a lot of conditional branches. Complexity +threshold can be configured with \fBcgoto:threshold\fP configuration. This +option implies \fB\-\-bit\-vectors\fP\&. It is supported only for C.
+.TP +.B \fB\-\-conditions \-\-start\-conditions \-c\fP +Enable support of Flex\-like \(dqconditions\(dq: multiple interrelated lexers +within one block. This is an alternative to manually specifying different +re2c blocks connected with \fBgoto\fP or function calls. +.TP +.B \fB\-\-depfile FILE\fP +Write dependency information to \fBFILE\fP in the form of a Makefile rule +\fB : [include\-file ...]\fP\&. This allows one to +track build dependencies in the presence of \fBinclude:re2c\fP directives, +so that updating include files triggers regeneration of the output file. +This option depends on the \fB\-\-output\fP option. +.TP +.B \fB\-\-ebcdic \-\-ecb \-e\fP +Generate a lexer that reads input in EBCDIC encoding. re2c assumes that the +character range is 0 \-\- 0xFF and character size is 1 byte. +.TP +.B \fB\-\-empty\-class \fP +Define the way re2c treats empty character classes. With \fBmatch\-empty\fP +(the default) empty class matches empty input (which is illogical, but +backwards\-compatible). With \fBmatch\-none\fP empty class always fails to match. +With \fBerror\fP empty class raises a compilation error. +.TP +.B \fB\-\-encoding\-policy \fP +Define the way re2c treats Unicode surrogates. +With \fBfail\fP re2c aborts with an error when a surrogate is encountered. +With \fBsubstitute\fP re2c silently replaces surrogates with the error code +point 0xFFFD. With \fBignore\fP (the default) re2c treats surrogates as +normal code points. The Unicode standard says that standalone surrogates +are invalid, but real\-world libraries and programs behave in different ways. +.TP +.B \fB\-\-flex\-syntax \-F\fP +Partial support for Flex syntax: in this mode named definitions don\(aqt need +the equal sign and the terminating semicolon, and when used they must be +surrounded with curly braces. Names without curly braces are treated as +double\-quoted strings. +.TP +.B \fB\-\-header \-\-type\-header \-t HEADER\fP +Generate a \fBHEADER\fP file. 
The contents of the file can be specified with +directives \fBheader:re2c:on\fP and \fBheader:re2c:off\fP\&. +If conditions are used the header will have a condition enum automatically +appended to it (unless there is an explicit \fBconditions:re2c\fP directive). +.TP +.B \fB\-I PATH\fP +Add \fBPATH\fP to the list of locations which are used when searching for +include files. This option is useful in combination with \fBinclude:re2c\fP +directive. re2c looks for \fBFILE\fP in the directory of the parent file and +in the include locations specified with \fB\-I\fP option. +.TP +.B \fB\-\-input\-encoding \fP +Specify the way re2c parses regular expressions. +With \fBascii\fP (the default) re2c handles input as ASCII\-encoded: any +sequence of code units is a sequence of standalone 1\-byte characters. +With \fButf8\fP re2c handles input as UTF8\-encoded and recognizes multibyte +characters. +.TP +.B \fB\-\-invert\-captures\fP +Invert the meaning of capturing and non\-capturing groups. By default +\fB(...)\fP is capturing and \fB(! ...)\fP is non\-capturing. With this option +\fB(! ...)\fP is capturing and \fB(...)\fP is non\-capturing. +.TP +.B \fB\-\-lang \fP +Specify the output language. Supported languages are C, Go and Rust. +The default is C for re2c, Go for re2go and Rust for re2rust. +.TP +.B \fB\-\-leftmost\-captures\fP +Enable submatch extraction with leftmost greedy capturing groups. +.TP +.B \fB\-\-location\-format \fP +Specify location format in messages. +With \fBgnu\fP locations are printed as \(aqfilename:line:column: ...\(aq. +With \fBmsvc\fP locations are printed as \(aqfilename(line,column) ...\(aq. +The default is \fBgnu\fP\&. +.TP +.B \fB\-\-loop\-switch\fP +Encode DFA in a form of a loop over a switch statement. Individual states +are switch cases. The current state is stored in a variable \fByystate\fP\&. +Transitions between states update \fByystate\fP to the case label of the +destination state and \fBcontinue\fP to the head of the loop. 
This option is +always enabled for Rust, as it has no \fBgoto\fP statement and cannot use the +goto/label approach which is the default for C and Go backends. +.TP +.B \fB\-\-nested\-ifs \-s\fP +Use nested \fBif\fP statements instead of \fBswitch\fP statements in conditional +jumps. This usually results in more efficient code with non\-optimizing +compilers. +.TP +.B \fB\-\-no\-debug\-info \-i\fP +Do not output line directives. This may be useful when the generated code is +stored in a version control system (to avoid huge autogenerated diffs on +small changes). This option is on by default for Rust, as it does not have +line directives. +.TP +.B \fB\-\-no\-generation\-date\fP +Suppress date output in the generated file. +.TP +.B \fB\-\-no\-version\fP +Suppress version output in the generated file. +.TP +.B \fB\-\-no\-unsafe\fP +Do not generate \fBunsafe\fP wrapper over \fBYYPEEK\fP (this option is specific +to Rust). For performance reasons \fBYYPEEK\fP should avoid bounds\-checking, +as the lexer already performs end\-of\-input checks in a more efficient way. +The user may choose to provide a safe \fBYYPEEK\fP definition, or a definition +that is unsafe only in release builds, in which case the \fB\-\-no\-unsafe\fP +option helps to avoid warnings about redundant \fBunsafe\fP blocks. +.TP +.B \fB\-\-output \-o OUTPUT\fP +Specify the \fBOUTPUT\fP file. +.TP +.B \fB\-\-posix\-captures \-P\fP +Enable submatch extraction with POSIX\-style capturing groups. +.TP +.B \fB\-\-reusable \-r\fP +Deprecated since version 2.2 (reusable blocks are allowed by default now). +.TP +.B \fB\-\-skeleton \-S\fP +Ignore user\-defined interface code and generate a self\-contained \(dqskeleton\(dq +program. Additionally, generate input files with strings derived from the +regular grammar and compressed match results that are used to verify +\(dqskeleton\(dq behavior on all inputs. This option is useful for finding bugs +in optimizations and code generation. 
This option is supported only for C. +.TP +.B \fB\-\-storable\-state \-f\fP +Generate a lexer which can store its inner state. +This is useful in push\-model lexers which are stopped by an outer program +when there is not enough input, and then resumed when more input becomes +available. In this mode users should additionally define \fBYYGETSTATE\fP +and \fBYYSETSTATE\fP primitives, and variables \fByych\fP, \fByyaccept\fP and +\fBstate\fP should be part of the stored lexer state. +.TP +.B \fB\-\-tags \-T\fP +Enable submatch extraction with tags. +.TP +.B \fB\-\-ucs2 \-\-wide\-chars \-w\fP +Generate a lexer that reads UCS2\-encoded input. re2c assumes that the +character range is 0 \-\- 0xFFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf8 \-\-utf\-8 \-8\fP +Generate a lexer that reads input in UTF\-8 encoding. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 1 byte. +.TP +.B \fB\-\-utf16 \-\-utf\-16 \-x\fP +Generate a lexer that reads UTF16\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf32 \-\-unicode \-u\fP +Generate a lexer that reads UTF32\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 4 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-verbose\fP +Output a short message in case of success. +.TP +.B \fB\-\-vernum \-V\fP +Show version information in \fBMMmmpp\fP format (major, minor, patch). +.TP +.B \fB\-\-version \-v\fP +Show version information. +.TP +.B \fB\-\-single\-pass \-1\fP +Deprecated. Does nothing (single pass is the default now). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-debug\-output \-d\fP +Emit \fBYYDEBUG\fP invocations in the generated code. This is useful to trace +lexer execution. +.TP +.B \fB\-\-dump\-adfa\fP +Debug option: output DFA after tunneling (in .dot format). 
+.TP +.B \fB\-\-dump\-cfg\fP +Debug option: output control flow graph of tag variables (in .dot format). +.TP +.B \fB\-\-dump\-closure\-stats\fP +Debug option: output statistics on the number of states in closure. +.TP +.B \fB\-\-dump\-dfa\-det\fP +Debug option: output DFA immediately after determinization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-min\fP +Debug option: output DFA after minimization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tagopt\fP +Debug option: output DFA after tag optimizations (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tree\fP +Debug option: output DFA under construction with states represented as tag +history trees (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-raw\fP +Debug option: output DFA under construction with expanded state\-sets +(in .dot format). +.TP +.B \fB\-\-dump\-interf\fP +Debug option: output interference table produced by liveness analysis of tag +variables. +.TP +.B \fB\-\-dump\-nfa\fP +Debug option: output NFA (in .dot format). +.TP +.B \fB\-\-emit\-dot \-D\fP +Instead of normal output generate lexer graph in .dot format. +The output can be converted to an image with the help of Graphviz +(e.g. something like \fBdot \-Tpng \-odfa.png dfa.dot\fP). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-dfa\-minimization \fP +Internal option: DFA minimization algorithm used by re2c. The \fBmoore\fP +option is the Moore algorithm (it is the default). The \fBtable\fP option is +the \(dqtable filling\(dq algorithm. Both algorithms should produce the same DFA +up to states relabeling; table filling is simpler and much slower and serves +as a reference implementation. +.TP +.B \fB\-\-eager\-skip\fP +Internal option: make the generated lexer advance the input position +eagerly \-\- immediately after reading the input symbol. This changes the +default behavior when the input position is advanced lazily \-\- after +transition to the next state. +.TP +.B \fB\-\-no\-lookahead\fP +Internal option, deprecated. 
+It used to enable TDFA(0) algorithm. Unlike TDFA(1), TDFA(0) algorithm does +not use one\-symbol lookahead. It applies register operations to the incoming +transitions rather than the outgoing ones. Benchmarks showed that TDFA(0) +algorithm is less efficient than TDFA(1). +.TP +.B \fB\-\-no\-optimize\-tags\fP +Internal option: suppress optimization of tag variables (useful for +debugging). +.TP +.B \fB\-\-posix\-closure <gor1 | gtop>\fP +Internal option: specify shortest\-path algorithm used for the construction of +epsilon\-closure with POSIX disambiguation semantics: \fBgor1\fP (the default) +stands for Goldberg\-Radzik algorithm, and \fBgtop\fP stands for \(dqglobal +topological order\(dq algorithm. +.TP +.B \fB\-\-posix\-prectable <complex | naive>\fP +Internal option: specify the algorithm used to compute POSIX precedence +table. The \fBcomplex\fP algorithm computes precedence table in one traversal +of tag history tree and has quadratic complexity in the number of TNFA +states; it is the default. The \fBnaive\fP algorithm has worst\-case cubic +complexity in the number of TNFA states, but it is much simpler than +\fBcomplex\fP and may be slightly faster in non\-pathological cases. +.TP +.B \fB\-\-stadfa\fP +Internal option, deprecated. +It used to enable staDFA algorithm, which differs from TDFA in that register +operations are placed in states rather than on transitions. Benchmarks +showed that staDFA algorithm is less efficient than TDFA. +.TP +.B \fB\-\-fixed\-tags <none | toplevel | all>\fP +Internal option: +specify whether the fixed\-tag optimization should be applied to all tags +(\fBall\fP), none of them (\fBnone\fP), or only those in toplevel concatenation +(\fBtoplevel\fP). The default is \fBall\fP\&. +\(dqFixed\(dq tags are those that are located within a fixed distance to some +other tag (called \(dqbase\(dq). In such cases only the base tag needs to be +tracked, and the value of the fixed tag can be computed as the value of the +base tag plus a static offset. 
For tags that are under alternative or +repetition it is also necessary to check if the base tag has a no\-match +value (in that case fixed tag should also be set to no\-match, disregarding +the offset). For tags in top\-level concatenation the check is not needed, +because they always match. +.UNINDENT +.SH WARNINGS +.sp +Warnings can be individually enabled, disabled and turned into an error. +.INDENT 0.0 +.TP +.B \fB\-W\fP +Turn on all warnings. +.TP +.B \fB\-Werror\fP +Turn warnings into errors. Note that this option alone +doesn\(aqt turn on any warnings; it only affects those warnings that have +been turned on so far or will be turned on later. +.TP +.B \fB\-W<warning>\fP +Turn on \fBwarning\fP\&. +.TP +.B \fB\-Wno\-<warning>\fP +Turn off \fBwarning\fP\&. +.TP +.B \fB\-Werror\-<warning>\fP +Turn on \fBwarning\fP and treat it as an error (this implies \fB\-W<warning>\fP). +.TP +.B \fB\-Wno\-error\-<warning>\fP +Don\(aqt treat this particular \fBwarning\fP as an error. This doesn\(aqt turn off +the warning itself. +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-Wcondition\-order\fP +Warn if the generated program makes implicit assumptions about condition +numbering. One should use either the \fB\-\-header\fP option or the +\fBconditions:re2c\fP directive to generate a mapping of condition names to +numbers and then use the autogenerated condition names. +.TP +.B \fB\-Wempty\-character\-class\fP +Warn if a regular expression contains an empty character class. Trying to +match an empty character class makes no sense: it should always fail. +However, for backwards compatibility reasons re2c permits empty character +classes and treats them as empty strings. Use the \fB\-\-empty\-class\fP option +to change the default behavior. +.TP +.B \fB\-Wmatch\-empty\-string\fP +Warn if a rule is nullable (matches an empty string). +If the lexer runs in a loop and the empty match is unintentional, the lexer +may unexpectedly hang in an infinite loop. 
+.TP +.B \fB\-Wswapped\-range\fP +Warn if the lower bound of a range is greater than its upper bound. The +default behavior is to silently swap the range bounds. +.TP +.B \fB\-Wundefined\-control\-flow\fP +Warn if some input strings cause undefined control flow in the lexer (the +faulty patterns are reported). This is a dangerous and common mistake. It +can be easily fixed by adding the default rule \fB*\fP which has the lowest +priority, matches any code unit, and always consumes a single code unit. +.TP +.B \fB\-Wunreachable\-rules\fP +Warn about rules that are shadowed by other rules and will never match. +.TP +.B \fB\-Wuseless\-escape\fP +Warn if a symbol is escaped when it shouldn\(aqt be. +By default, re2c silently ignores such escapes, but this may as well +indicate a typo or an error in the escape sequence. +.TP +.B \fB\-Wnondeterministic\-tags\fP +Warn if a tag has \fBn\fP\-th degree of nondeterminism, where \fBn\fP is greater +than 1. +.TP +.B \fB\-Wsentinel\-in\-midrule\fP +Warn if the sentinel symbol occurs in the middle of a rule \-\-\- this may +cause reads past the end of buffer, crashes or memory corruption in the +generated lexer. This warning is only applicable if the sentinel method of +checking for the end of input is used. +It is set to an error if \fBre2c:sentinel\fP configuration is used. +.UNINDENT +.SH BLOCKS AND DIRECTIVES +.sp +Below is the list of re2c directives (syntactic constructs that mark the +beginning and end of the code that should be processed by re2c). Named blocks +were added in re2c version 2.2. They are exactly the same as unnamed blocks, +except that the name can be used to reference a block in other parts of the +program. More information on each directive can be found in the related +sections. +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A global re2c block with an optional name. The block may contain named +definitions, configurations and rules in any order. 
Named definitions and +configurations are defined in the global scope, so they are inherited by +subsequent blocks. The code for a global block is generated at the point +where the block is specified. +.TP +.B \fB/*!local:re2c[:<name>] ... */\fP +A local re2c block with an optional name. Unlike global blocks, definitions +and configurations inside of a local block are not added into the global +scope. In all other respects local blocks are the same as global blocks. +.TP +.B \fB/*!rules:re2c[:<name>] ... */\fP +A reusable block with an optional name. Rules blocks have the same structure +as local or global blocks, but they do not produce any code and they can be +reused multiple times in other blocks with the help of a \fB!use:<name>;\fP +directive or a \fB/*!use:re2c[:<name>] ... */\fP block. A rules block on its +own does not add any definitions into the global scope. The code for it is +generated at the point of use. Prior to re2c version 2.2 rules blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB/*!use:re2c[:<name>] ... */\fP +A use block that references a previously defined rules block. If the name is +specified, re2c looks for a rules block with this name. Otherwise the most +recent rules block is used (either a named or an unnamed one). A use block +can add definitions, configurations and rules of its own, which are added to +those of the referenced rules block. Prior to re2c version 2.2 use blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB!use:<name>;\fP +An in\-block use directive that merges a previously defined rules block with +the specified name into the current block. Named definitions, configurations +and rules of the referenced block are added to the current ones. Conflicts +between overlapping rules and configurations are resolved in the usual way: +the first rule takes priority, and the latest configuration overrides the +preceding ones. One exception is the special rules \fB*\fP, \fB$\fP and \fB<!>\fP +for which a block\-local definition always takes priority. 
A use directive +can be placed anywhere inside of a block, and multiple use directives are +allowed. +.TP +.B \fB/*!max:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXFILL\fP definition. +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXFILL\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXFILL \fP), or a global variable for Go +(\fBvar YYMAXFILL int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXFILL\fP\&. +.TP +.B \fB/*!maxnmatch:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXNMATCH\fP definition (it requires +\fB\-P \-\-posix\-captures\fP option). +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXNMATCH\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXNMATCH \fP), or a global variable for Go +(\fBvar YYMAXNMATCH int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXNMATCH\fP\&. +.TP +.B \fB/*!stags:re2c[:[:...]] ... */\fP, \fB/*!mtags:re2c[:[:...]] ... */\fP +Directives that specify a template piece of code that is expanded for each +s\-tag/m\-tag variable generated by re2c. +An optional list of block names specifies which blocks should be included +when computing the set of tag variables (if the list is empty, all blocks +are included). +There are two optional configurations: \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{tag}\fP (or +\fB@@\fP for short) is replaced with the name of each tag variable. 
+Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different tag variables. +.TP +.B \fB/*!getstate:re2c[:<name1>[:<name2>...]] ... */\fP +A directive that generates conditional dispatch on the lexer state (it +requires \fB\-\-storable\-state\fP option). +An optional list of block names specifies which blocks should be included in +the state dispatch. The default transition goes to the start label of the +first block on the list. If the list is empty, all blocks are included, and +the default transition goes to the first block in the file that has a start +label. +This directive is incompatible with the \fB\-\-loop\-switch\fP option and Rust, +as it requires cross\-block transitions that are unsupported without the +\fBgoto\fP statement. +.TP +.B \fB/*!conditions:re2c[:<name1>[:<name2>...]] ... */\fP, \fB/*!types:re2c... */\fP +A directive that generates condition enumeration (it requires +\fB\-\-conditions\fP option). +An optional list of block names specifies which blocks should be included +when computing the set of conditions (if the list is empty, all blocks are +included). +By default the generated code is an enumeration \fBYYCONDTYPE\fP\&. It can be +customized with optional configurations \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{cond}\fP (or +\fB@@\fP for short) is replaced with the name of each condition, and +\fB@@{num}\fP is replaced with a numeric index of that condition. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different conditions. +.TP +.B \fB/*!include:re2c <file> */\fP +This directive allows one to include \fB<file>\fP, which must be a double\-quoted +file path. The contents of the file are literally substituted in place of +the directive, in the same way as \fB#include\fP works in C/C++. 
This +directive can be used together with the \fB\-\-depfile\fP option to generate +build system dependencies on the included files. +.TP +.B \fB!include <file>;\fP +This directive is the same as \fB/*!include:re2c <file> */\fP, except that it +should be used inside of a re2c block. +.TP +.B \fB/*!header:re2c:on*/\fP +This directive marks the start of header file. Everything after it and up to +the following \fB/*!header:re2c:off*/\fP directive is processed by re2c and +written to the header file specified with \fB\-t \-\-type\-header\fP option. +.TP +.B \fB/*!header:re2c:off*/\fP +This directive marks the end of header file started with +\fB/*!header:re2c:on*/\fP\&. +.TP +.B \fB/*!ignore:re2c ... */\fP +A block whose contents are ignored and removed from the output file. +.TP +.B \fB%{ ... %}\fP +A global re2c block in the \fB\-\-flex\-support\fP mode. This is deprecated and +exists for backward compatibility. +.UNINDENT +.SH CONFIGURATIONS +.INDENT 0.0 +.TP +.B \fBre2c:api\fP, \fBre2c:flags:input\fP +Same as the \fB\-\-api\fP option. +.TP +.B \fBre2c:api:sigil\fP +Specify the marker (\(dqsigil\(dq) that is used for argument placeholders in the +API primitives. The default is \fB@@\fP\&. A placeholder starts with sigil +followed by the argument name in curly braces. For example, if sigil is set +to \fB$\fP, then placeholders will have the form \fB${name}\fP\&. Single\-argument +APIs may use shorthand notation without the name in braces. This option can +be overridden by options for individual API primitives, e.g. +\fBre2c:define:YYFILL@len\fP for \fBYYFILL\fP\&. +.TP +.B \fBre2c:api:style\fP +Specify API style. Possible values are \fBfunctions\fP (the default for C) and +\fBfree\-form\fP (the default for Go and Rust). +In \fBfunctions\fP style API primitives are generated with an argument list in +parentheses following the name of the primitive. 
The arguments are provided +only for autogenerated parameters (such as the number of characters passed +to \fBYYFILL\fP), but not for the general lexer context, so the primitives +behave more like macros in C/C++ or closures in Go and Rust. +In free\-form style API primitives do not have a fixed form: they should be +defined as strings containing free\-form pieces of code with interpolated +variables of the form \fB@@{var}\fP or \fB@@\fP (they correspond to arguments in +function\-like style). +This configuration may be overridden for individual API primitives, see for +example \fBre2c:define:YYFILL:naked\fP configuration for \fBYYFILL\fP\&. +.TP +.B \fBre2c:bit\-vectors\fP, \fBre2c:flags:bit\-vectors\fP, \fBre2c:flags:b\fP +Same as the \fB\-\-bit\-vectors\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-insensitive\fP, \fBre2c:flags:case\-insensitive\fP +Same as the \fB\-\-case\-insensitive\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:case\-inverted\fP, \fBre2c:flags:case\-inverted\fP +Same as the \fB\-\-case\-inverted\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-ranges\fP, \fBre2c:flags:case\-ranges\fP +Same as the \fB\-\-case\-ranges\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos\fP, \fBre2c:flags:computed\-gotos\fP, \fBre2c:flags:g\fP +Same as the \fB\-\-computed\-gotos\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos:threshold\fP, \fBre2c:cgoto:threshold\fP +If computed \fBgoto\fP is used, this configuration specifies the complexity +threshold that triggers the generation of jump tables instead of nested +\fBif\fP statements and bitmaps. The default value is \fB9\fP\&. +.TP +.B \fBre2c:cond:goto\fP +Specifies a piece of code used for the autogenerated shortcut rules \fB:=>\fP +in conditions. The default is \fBgoto @@;\fP\&. 
+The \fB@@\fP placeholder is substituted with condition name (see +configurations \fBre2c:api:sigil\fP and \fBre2c:cond:goto@cond\fP). +.TP +.B \fBre2c:cond:goto@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:goto\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:divider\fP +Defines the divider for condition blocks. +The default value is \fB/* *********************************** */\fP\&. +Placeholders are substituted with condition name (see \fBre2c:api:sigil\fP and +\fBre2c:cond:divider@cond\fP). +.TP +.B \fBre2c:cond:divider@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:divider\fP +definition. The default is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:prefix\fP, \fBre2c:condprefix\fP +Specifies the prefix used for condition labels. +The default is \fByyc_\fP\&. +.TP +.B \fBre2c:cond:enumprefix\fP, \fBre2c:condenumprefix\fP +Specifies the prefix used for condition identifiers. +The default is \fByyc\fP\&. +.TP +.B \fBre2c:debug\-output\fP, \fBre2c:flags:debug\-output\fP, \fBre2c:flags:d\fP +Same as the \fB\-\-debug\-output\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:define:YYBACKUP\fP +Defines generic API primitive \fBYYBACKUP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYBACKUPCTX\fP +Defines generic API primitive \fBYYBACKUPCTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYCONDTYPE\fP +Defines \fBYYCONDTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTYPE\fP +Defines \fBYYCTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTXMARKER\fP +Defines API primitive \fBYYCTXMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCURSOR\fP +Defines API primitive \fBYYCURSOR\fP (see the API primitives section). 
+.TP +.B \fBre2c:define:YYDEBUG\fP +Defines API primitive \fBYYDEBUG\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL\fP +Defines API primitive \fBYYFILL\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL@len\fP +Specifies the sigil used for argument substitution in \fBYYFILL\fP +definition. Defaults to \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYFILL:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for \fBYYFILL\fP\&. +Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETCONDITION\fP +Defines API primitive \fBYYGETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETSTATE\fP +Defines API primitive \fBYYGETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYLESSTHAN\fP +Defines generic API primitive \fBYYLESSTHAN\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYLIMIT\fP +Defines API primitive \fBYYLIMIT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMARKER\fP +Defines API primitive \fBYYMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGN\fP +Defines generic API primitive \fBYYMTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGP\fP +Defines generic API primitive \fBYYMTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYPEEK\fP +Defines generic API primitive \fBYYPEEK\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYRESTORE\fP +Defines generic API primitive \fBYYRESTORE\fP (see the API primitives +section). 
+.TP +.B \fBre2c:define:YYRESTORECTX\fP +Defines generic API primitive \fBYYRESTORECTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORETAG\fP +Defines generic API primitive \fBYYRESTORETAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSETCONDITION\fP +Defines API primitive \fBYYSETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETCONDITION@cond\fP +Specifies the sigil used for argument substitution in \fBYYSETCONDITION\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYSETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSETSTATE\fP +Defines API primitive \fBYYSETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETSTATE@state\fP +Specifies the sigil used for argument substitution in \fBYYSETSTATE\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYSETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSKIP\fP +Defines generic API primitive \fBYYSKIP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFT\fP +Defines generic API primitive \fBYYSHIFT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFTMTAG\fP +Defines generic API primitive \fBYYSHIFTMTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSHIFTSTAG\fP +Defines generic API primitive \fBYYSHIFTSTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSTAGN\fP +Defines generic API primitive \fBYYSTAGN\fP (see the API primitives section). 
+.TP +.B \fBre2c:define:YYSTAGP\fP +Defines generic API primitive \fBYYSTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:empty\-class\fP, \fBre2c:flags:empty\-class\fP +Same as the \fB\-\-empty\-class\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:encoding:ebcdic\fP, \fBre2c:flags:ecb\fP, \fBre2c:flags:e\fP +Same as the \fB\-\-ebcdic\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:ucs2\fP, \fBre2c:flags:wide\-chars\fP, \fBre2c:flags:w\fP +Same as the \fB\-\-ucs2\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf8\fP, \fBre2c:flags:utf\-8\fP, \fBre2c:flags:8\fP +Same as the \fB\-\-utf8\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf16\fP, \fBre2c:flags:utf\-16\fP, \fBre2c:flags:x\fP +Same as the \fB\-\-utf16\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf32\fP, \fBre2c:flags:unicode\fP, \fBre2c:flags:u\fP +Same as the \fB\-\-utf32\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding\-policy\fP, \fBre2c:flags:encoding\-policy\fP +Same as the \fB\-\-encoding\-policy\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:eof\fP +Specifies the sentinel symbol used with the end\-of\-input rule \fB$\fP\&. The +default value is \fB\-1\fP (\fB$\fP rule is not used). Other possible values +include all valid code units. Only decimal numbers are recognized. +.TP +.B \fBre2c:header\fP, \fBre2c:flags:type\-header\fP, \fBre2c:flags:t\fP +Specifies the name of the generated header file relative to the directory of +the output file. Same as the \fB\-\-header\fP option except that the file path +is relative. +.TP +.B \fBre2c:indent:string\fP +Specifies the string used for indentation. The default is a single tab +character \fB\(dq\et\(dq\fP\&. Indent string should contain whitespace characters only. 
+To disable indentation entirely, set this configuration to an empty string. +.TP +.B \fBre2c:indent:top\fP +Specifies the minimum amount of indentation to use. The default value is +zero. The value should be a non\-negative integer number. +.TP +.B \fBre2c:invert\-captures\fP +Same as the \fB\-\-invert\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:label:prefix\fP, \fBre2c:labelprefix\fP +Specifies the prefix used for DFA state labels. The default is \fByy\fP\&. +.TP +.B \fBre2c:label:start\fP, \fBre2c:startlabel\fP +Controls the generation of a block start label. The default value is zero, +which means that the start label is generated only if it is used. An integer +value greater than zero forces the generation of start label even if it is +unused by the lexer. A string value also forces start label generation and +sets the label name to the specified string. This configuration applies only +to the current block (it is reset to default for the next block). +.TP +.B \fBre2c:label:yyFillLabel\fP +Specifies the prefix of \fBYYFILL\fP labels used with \fBre2c:eof\fP and in +storable state mode. +.TP +.B \fBre2c:label:yyloop\fP +Specifies the name of the label marking the start of the lexer loop with +\fB\-\-loop\-switch\fP option. The default is \fByyloop\fP\&. +.TP +.B \fBre2c:label:yyNext\fP +Specifies the name of the optional label that follows \fBYYGETSTATE\fP switch +in storable state mode (enabled with \fBre2c:state:nextlabel\fP). The default +is \fByyNext\fP\&. +.TP +.B \fBre2c:leftmost\-captures\fP +Same as the \fB\-\-leftmost\-captures\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:lookahead\fP, \fBre2c:flags:lookahead\fP +Deprecated (see the deprecated \fB\-\-no\-lookahead\fP option). +.TP +.B \fBre2c:nested\-ifs\fP, \fBre2c:flags:nested\-ifs\fP, \fBre2c:flags:s\fP +Same as the \fB\-\-nested\-ifs\fP option, but can be configured on per\-block +basis. 
+.TP +.B \fBre2c:posix\-captures\fP, \fBre2c:flags:posix\-captures\fP, \fBre2c:flags:P\fP +Same as the \fB\-\-posix\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:tags\fP, \fBre2c:flags:tags\fP, \fBre2c:flags:T\fP +Same as the \fB\-\-tags\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:tags:expression\fP +Specifies the expression used for tag variables. +By default re2c generates expressions of the form \fByyt<N>\fP\&. This might +be inconvenient, for example if tag variables are defined as fields in a +struct. All occurrences of \fB@@{tag}\fP or \fB@@\fP are replaced with the +actual tag name. For example, \fBre2c:tags:expression = \(dqs.@@\(dq;\fP results +in expressions of the form \fBs.yyt<N>\fP in the generated code. +See also \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:tags:prefix\fP +Specifies the prefix for tag variable names. The default is \fByyt\fP\&. +.TP +.B \fBre2c:sentinel\fP +Specifies the sentinel symbol used for the end\-of\-input checks (when bounds +checks are disabled with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP is not +set). This configuration does not affect code generation: its purpose is to +verify that the sentinel is not allowed in the middle of a rule, and ensure +that the lexer won\(aqt read past the end of buffer. The default value is +\fB\-1\fP (in that case re2c assumes that the sentinel is zero, which is the +most common case). Only decimal numbers are recognized. +.TP +.B \fBre2c:state:abort\fP +If set to a positive integer value, changes the default case in +\fBYYGETSTATE\fP switch: by default it aborts the program, and an explicit +\fB\-1\fP case contains transition to the start of the block. +.TP +.B \fBre2c:state:nextlabel\fP +Controls if the \fBYYGETSTATE\fP switch is followed by an \fByyNext\fP label +(the default value is zero, which corresponds to no label). 
+Alternatively one can use \fBre2c:label:start\fP to generate a specific start +label, or an explicit \fBgetstate:re2c\fP directive to generate the +\fBYYGETSTATE\fP switch separately from the lexer block. +.TP +.B \fBre2c:unsafe\fP, \fBre2c:flags:unsafe\fP +Same as the \fB\-\-no\-unsafe\fP option, but can be configured on per\-block +basis. +If set to zero, it suppresses the generation of \fBunsafe\fP wrappers around +\fBYYPEEK\fP\&. The default is non\-zero (wrappers are generated). +This configuration is specific to Rust. +.TP +.B \fBre2c:variable:yyaccept\fP +Specifies the name of the \fByyaccept\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yybm\fP +Specifies the name of the \fByybm\fP variable (used for bitmaps). +.TP +.B \fBre2c:variable:yybm:hex\fP, \fBre2c:yybm:hex\fP +If set to non\-zero, bitmaps for the \fB\-\-bit\-vectors\fP option are generated +in hexadecimal format. The default is zero (bitmaps are in decimal format). +.TP +.B \fBre2c:variable:yych\fP +Specifies the name of the \fByych\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yych:emit\fP, \fBre2c:yych:emit\fP +If set to zero, \fByych\fP definition is not generated. +The default is non\-zero. +.TP +.B \fBre2c:variable:yych:conversion\fP, \fBre2c:yych:conversion\fP +If set to non\-zero, re2c automatically generates a conversion to \fBYYCTYPE\fP +every time \fByych\fP is read. The default is zero (no conversion). +.TP +.B \fBre2c:variable:yyctable\fP +Specifies the name of the \fByyctable\fP variable (the jump table generated +for \fBYYGETCONDITION\fP switch with \fB\-\-computed\-gotos\fP option). +.TP +.B \fBre2c:variable:yytarget\fP +Specifies the name of the \fByytarget\fP variable. +.TP +.B \fBre2c:variable:yystable\fP +Deprecated. +.TP +.B \fBre2c:variable:yystate\fP +Specifies the name of the \fByystate\fP variable (used with the +\fB\-\-loop\-switch\fP option to store the current DFA state). 
+.TP +.B \fBre2c:yyfill:check\fP +If set to zero, suppresses the generation of pre\-\fBYYFILL\fP check for the +number of input characters (the \fBYYLESSTHAN\fP definition in generic API and +the \fBYYLIMIT\fP\-based comparison in C pointer API). The default is non\-zero +(generate the check). +.TP +.B \fBre2c:yyfill:enable\fP +If set to zero, suppresses the generation of \fBYYFILL\fP (together +with the check). This should be used when the whole input fits into one piece +of memory (there is no need for buffering) and the end\-of\-input checks do not +rely on the \fBYYFILL\fP checks (e.g. if a sentinel character is used). +Use warnings (\fB\-W\fP option) and \fBre2c:sentinel\fP configuration to verify +that the generated lexer cannot read past the end of input. +The default is non\-zero (\fBYYFILL\fP is enabled). +.TP +.B \fBre2c:yyfill:parameter\fP +If set to zero, suppresses the generation of parameter passed to \fBYYFILL\fP\&. +The parameter is the minimum number of characters that must be supplied. +Defaults to non\-zero (the parameter is generated). +This configuration can be overridden with \fBre2c:define:YYFILL:naked\fP or +\fBre2c:api:style\fP\&. 
+.UNINDENT +.SH REGULAR EXPRESSIONS +.sp +re2c uses the following syntax for regular expressions: +.INDENT 0.0 +.IP \(bu 2 +\fB\(dqfoo\(dq\fP case\-sensitive string literal +.IP \(bu 2 +\fB\(aqfoo\(aq\fP case\-insensitive string literal +.IP \(bu 2 +\fB[a\-xyz]\fP, \fB[^a\-xyz]\fP character class (possibly negated) +.IP \(bu 2 +\fB\&.\fP any character except newline +.IP \(bu 2 +\fBR \e S\fP difference of character classes \fBR\fP and \fBS\fP +.IP \(bu 2 +\fBR*\fP zero or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR+\fP one or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR?\fP optional \fBR\fP +.IP \(bu 2 +\fBR{n}\fP repetition of \fBR\fP exactly \fBn\fP times +.IP \(bu 2 +\fBR{n,}\fP repetition of \fBR\fP at least \fBn\fP times +.IP \(bu 2 +\fBR{n,m}\fP repetition of \fBR\fP from \fBn\fP to \fBm\fP times +.IP \(bu 2 +\fB(R)\fP just \fBR\fP; parentheses are used to override precedence. +If submatch extraction is enabled, \fB(R)\fP is a capturing or a +non\-capturing group depending on \fB\-\-invert\-captures\fP option. +.IP \(bu 2 +\fB(!R)\fP +If submatch extraction is enabled, \fB(!R)\fP is a non\-capturing or a +capturing group depending on \fB\-\-invert\-captures\fP option. 
+.IP \(bu 2 +\fBR S\fP concatenation: \fBR\fP followed by \fBS\fP +.IP \(bu 2 +\fBR | S\fP alternative: \fBR or S\fP +.IP \(bu 2 +\fBR / S\fP lookahead: \fBR\fP followed by \fBS\fP, but \fBS\fP is not consumed +.IP \(bu 2 +\fBname\fP the regular expression defined as \fBname\fP (or literal string +\fB\(dqname\(dq\fP in Flex compatibility mode) +.IP \(bu 2 +\fB{name}\fP the regular expression defined as \fBname\fP in Flex +compatibility mode +.IP \(bu 2 +\fB@stag\fP an \fIs\-tag\fP: saves the last input position at which \fB@stag\fP +matches in a variable named \fBstag\fP +.IP \(bu 2 +\fB#mtag\fP an \fIm\-tag\fP: saves all input positions at which \fB#mtag\fP matches +in a variable named \fBmtag\fP +.UNINDENT +.sp +Character classes and string literals may contain the following escape +sequences: \fB\ea\fP, \fB\eb\fP, \fB\ef\fP, \fB\en\fP, \fB\er\fP, \fB\et\fP, \fB\ev\fP, \fB\e\e\fP, +octal escapes \fB\eooo\fP and hexadecimal escapes \fB\exhh\fP, \fB\euhhhh\fP and +\fB\eUhhhhhhhh\fP\&. +.SH HANDLING THE END OF INPUT +.sp +One of the main problems for the lexer is to know when to stop. +There are a few terminating conditions: +.INDENT 0.0 +.IP \(bu 2 +the lexer may match some rule (including default rule \fB*\fP) and come to a +final state +.IP \(bu 2 +the lexer may fail to match any rule and come to a default state +.IP \(bu 2 +the lexer may reach the end of input +.UNINDENT +.sp +The first two conditions terminate the lexer in a \(dqnatural\(dq way: it comes to a +state with no outgoing transitions, and the matching automatically stops. The +third condition, end of input, is different: it may happen in any state, and the +lexer should be able to handle it. Checking for the end of input interrupts the +normal lexer workflow and adds conditional branches to the generated program, +therefore it is necessary to minimize the number of such checks. re2c supports a +few different methods for handling the end of input. 
Which one to use depends on +the complexity of regular expressions, the need for buffering, performance +considerations and other factors. Here is a list of methods: +.INDENT 0.0 +.IP \(bu 2 +\fBSentinel.\fP +This method eliminates the need for the end of input checks altogether. It is +simple and efficient, but limited to the case when there is a natural +\(dqsentinel\(dq character that can never occur in valid input. This character may +still occur in invalid input, but it should not be allowed by the regular +expressions, except perhaps as the last character of a rule. The sentinel is +appended at the end of input and serves as a stop signal: when the lexer reads +this character, it is either a syntax error or the end of input. In both +cases the lexer should stop. This method is used if \fBYYFILL\fP is disabled +with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP has the default value +\fB\-1\fP\&. +.nf + +.fi +.sp +.IP \(bu 2 +\fBSentinel with bounds checks.\fP +This method is generic: it allows to handle any input without restrictions on +the regular expressions. The idea is to reduce the number of end of input +checks by performing them only on certain characters. Similar to the +\(dqsentinel\(dq method, one of the characters is chosen as a \(dqsentinel\(dq and +appended at the end of input. However, there is no restriction on where the +sentinel may occur (in fact, any character can be chosen for a sentinel). +When the lexer reads this character, it additionally performs a bounds check. +If the current position is within bounds, the lexer resumes matching and +handles the sentinel as a regular character. Otherwise it invokes \fBYYFILL\fP +(unless it is disabled). If more input is supplied, the lexer will rematch the +last character and continue as if the sentinel wasn\(aqt there. Otherwise it must +be the real end of input, and the lexer stops. 
This method is used when +\fBre2c:eof\fP has non\-negative value (it should be set to the numeric value of +the sentinel). \fBYYFILL\fP is optional. +.nf + +.fi +.sp +.IP \(bu 2 +\fBBounds checks with padding.\fP +This method is generic, and it may be faster than the \(dqsentinel with bounds +checks\(dq method, but it is also more complex. The idea is to partition DFA +states into strongly connected components (SCCs) and generate a single check +per SCC for enough characters to cover the longest non\-looping path in this +SCC. This reduces the number of checks, but there is a problem with short +lexemes at the end of input, as the check requires enough characters to cover +the longest lexeme. This can be fixed by padding the input with a few fake +characters that do not form a valid lexeme suffix (so that the lexer cannot +match them). The length of padding should be \fBYYMAXFILL\fP, generated with +\fB/*!max:re2c*/\fP\&. If there is not enough input, the lexer invokes \fBYYFILL\fP +which should supply at least the required number of characters or not return. +This method is used if \fBYYFILL\fP is enabled and \fBre2c:eof\fP is \fB\-1\fP +(this is the default configuration). +.nf + +.fi +.sp +.IP \(bu 2 +\fBCustom checks.\fP +Generic API allows to override basic operations like reading a character, +which makes it possible to include the end\-of\-input checks as part of them. +This approach is error\-prone and should be used with caution. To use a custom +method, enable generic API with \fB\-\-api custom\fP or \fBre2c:api = custom;\fP and +disable default bounds checks with \fBre2c:yyfill:enable = 0;\fP or +\fBre2c:yyfill:check = 0;\fP\&. +.UNINDENT +.sp +The following subsections contain an example of each method. +.SS Sentinel +.sp +This example uses a sentinel character to handle the end of input. The program +counts space\-separated words in a null\-terminated string. 
The sentinel is null: +it is the last character of each input string, and it is not allowed in the +middle of a lexeme by any of the rules (in particular, it is not included in +character ranges where it is easy to overlook). If a null occurs in the middle +of a string, it is a syntax error and the lexer will match default rule \fB*\fP, +but it won\(aqt read past the end of input or crash (use +\fI\%\-Wsentinel\-in\-midrule\fP +warning and \fBre2c:sentinel\fP configuration to verify this). Configuration +\fBre2c:yyfill:enable = 0;\fP suppresses the generation of bounds checks and +\fBYYFILL\fP invocations. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT +module main; + +// Expect a null\-terminated string. +private int lex(const(char)* yycursor) { + uint count = 0; + + for (;;) { + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + + * { return \-1; } + [\ex00] { return count; } + [a\-z]+ { ++count; continue; } + [ ]+ { continue; } + */ + } + assert(0); // unreachable +} + +void main() { + assert(lex(\(dq\(dq) == 0); + assert(lex(\(dqone two three\(dq) == 3); + assert(lex(\(dqf0ur\(dq) == \-1); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Sentinel with bounds checks +.sp +This example uses sentinel with bounds checks to handle the end of input (this +method was added in version 1.2). The program counts space\-separated +single\-quoted strings. The sentinel character is null, which is specified with +\fBre2c:eof = 0;\fP configuration. As in the \fI\%sentinel\fP method, null is the last +character of each input string, but it is allowed in the middle of a rule (for +example, \fB\(aqaaa\e0aa\(aq\e0\fP is valid input, but \fB\(aqaaa\e0\fP is a syntax error). +Bounds checks are generated in each state that matches an input character, but +they are scoped to the branch that handles null. Bounds checks are of the form +\fBYYLIMIT <= YYCURSOR\fP or \fBYYLESSTHAN(1)\fP with generic API. 
If the check +condition is true, lexer has reached the end of input and should stop +(\fBYYFILL\fP is disabled with \fBre2c:yyfill:enable = 0;\fP as the input fits into +one buffer, see the \fI\%YYFILL with sentinel\fP section for an example that uses +\fBYYFILL\fP). Reaching the end of input opens three possibilities: if the lexer +is in the initial state it will match the end\-of\-input rule \fB$\fP, otherwise it +may fallback to a previously matched rule (including default rule \fB*\fP) or go +to a default state, causing +\fI\%\-Wundefined\-control\-flow\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT +module main; + +// Expect a null\-terminated string. +private int lex(immutable char[] s) { + const(char)* yycursor = s.ptr, yylimit = s.ptr + s.length, yymarker; + int count = 0; + + for (;;) { + /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1; } + $ { return count; } + str { ++count; continue; } + [ ]+ { continue; } + */ + } + assert(0); // unreachable +} + +void main() { + assert(lex(\(dq\(dq) == 0); + assert(lex(\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq) == 3); + assert(lex(\(dq\(aqunterminated\e\e\(aq\(dq) == \-1); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Bounds checks with padding +.sp +This example uses bounds checks with padding to handle the end of input (this +method is enabled by default). The program counts space\-separated single\-quoted +strings. There is a padding of \fBYYMAXFILL\fP null characters appended at the end +of input, where \fBYYMAXFILL\fP value is autogenerated with \fB/*!max:re2c*/\fP\&. It +is not necessary to use null for padding \-\-\- any characters can be used as long +as they do not form a valid lexeme suffix (in this example padding should not +contain single quotes, as they may be mistaken for a suffix of a single\-quoted +string). 
There is a \(dqstop\(dq rule that matches the first padding character (null) +and terminates the lexer (note that it checks if null is at the beginning of +padding, otherwise it is a syntax error). Bounds checks are generated only in +some states that are determined by the strongly connected components of the +underlying automaton. Checks have the form \fB(YYLIMIT \- YYCURSOR) < n\fP or +\fBYYLESSTHAN(n)\fP with generic API, where \fBn\fP is the minimum number of +characters that are needed for the lexer to proceed (it also means that the next +bounds check will occur in at most \fBn\fP characters). If the check condition is +true, the lexer has reached the end of input and will invoke \fBYYFILL(n)\fP that +should either supply at least \fBn\fP input characters or not return. In this +example \fBYYFILL\fP always fails and terminates the lexer with an error (which is +fine because the input fits into one buffer). See the \fI\%YYFILL with padding\fP +section for an example that refills the input buffer with \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT +module main; + +import core.stdc.stdlib; +import core.stdc.string; + +/*!max:re2c*/ + +private int lex(immutable char[] s) { + // Make a copy of the string with YYMAXFILL zeroes at the end. + char *buf = cast(char*) malloc(s.length + YYMaxFill); + memcpy(buf, cast(const(void*)) s, s.length); + memset(buf + s.length, 0, YYMaxFill); + + const(char)* yycursor = buf; + const(char)* yylimit = buf + s.length + YYMaxFill; + int count = 0; + +loop: + /*!re2c + re2c:define:YYCTYPE = char; + re2c:define:YYFILL = \(dqgoto fail;\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // Check that it is the sentinel, not some unexpected null. 
+ if (yycursor \- 1 == buf + s.length) goto exit; else goto fail; + } + str { ++count; goto loop; } + [ ]+ { goto loop; } + * { goto fail; } + */ +fail: + count = \-1; +exit: + free(buf); + return count; +} + +void main() { + assert(lex(\(dq\(dq) == 0); + assert(lex(\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq) == 3); + assert(lex(\(dq\(aqunterminated\e\e\(aq\(dq) == \-1); + assert(lex(\(dq\(aqunexpected \e0 null\e\e\(aq\(dq) == \-1); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Custom checks +.sp +This example uses a custom end\-of\-input handling method based on generic API. +The program counts space\-separated words. It is the same as the +\fI\%sentinel\fP example, except that the input is not null\-terminated. To cover up +for the absence of a sentinel character at the end of input, \fBYYPEEK\fP is +redefined to perform a bounds check before it reads the next input character. +This is inefficient because checks are done very often. If the check condition +fails, \fBYYPEEK\fP returns the real character, otherwise it returns a fake +sentinel character. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT +module main; + +import core.stdc.stdlib; +import core.stdc.string; + +private int lex(immutable char[] s) { + // For the sake of example create a string without terminating null. + char *buf = cast(char*) malloc(s.length); + memcpy(buf, cast(const(void*)) s, s.length); + + const(char) *cur = buf, lim = buf + s.length; + int count = 0; + + for (;;) { + /*!re2c + re2c:api = generic; + re2c:yyfill:enable = 0; + re2c:define:YYCTYPE = char; + re2c:define:YYPEEK = \(dqcur < lim ? 
*cur : 0\(dq; // fake null + re2c:define:YYSKIP = \(dq++cur;\(dq; + + * { count = \-1; break; } + [\ex00] { break;{} } + [a\-z]+ { ++count; continue;{} } + [ ]+ { continue; } + */ + } + + free(buf); + return count; +} + +void main() { + assert(lex(\(dq\(dq) == 0); + assert(lex(\(dqone two three \(dq) == 3); + assert(lex(\(dqf0ur\(dq) == \-1); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH BUFFER REFILLING +.sp +The need for buffering arises when the input cannot be mapped in memory all at +once: either it is too large, or it comes in a streaming fashion (like reading +from a socket). The usual technique in such cases is to allocate a fixed\-sized +memory buffer and process input in chunks that fit into the buffer. When the +current chunk is processed, it is moved out and new data is moved in. In +practice it is somewhat more complex, because lexer state consists not of a +single input position, but a set of interrelated positions: +.INDENT 0.0 +.IP \(bu 2 +cursor: the next input character to be read (\fBYYCURSOR\fP in C pointer API or +\fBYYSKIP\fP/\fBYYPEEK\fP in generic API) +.IP \(bu 2 +limit: the position after the last available input character (\fBYYLIMIT\fP in +C pointer API, implicitly handled by \fBYYLESSTHAN\fP in generic API) +.IP \(bu 2 +marker: the position of the most recent match, if any (\fBYYMARKER\fP in default +API or \fBYYBACKUP\fP/\fBYYRESTORE\fP in generic API) +.IP \(bu 2 +token: the start of the current lexeme (implicit in re2c API, as it is not +needed for the normal lexer operation and can be defined and updated by the +user) +.IP \(bu 2 +context marker: the position of the trailing context (\fBYYCTXMARKER\fP in +C pointer API or \fBYYBACKUPCTX\fP/\fBYYRESTORECTX\fP in generic API) +.IP \(bu 2 +tag variables: submatch positions (defined with \fB/*!stags:re2c*/\fP and +\fB/*!mtags:re2c*/\fP directives and +\fBYYSTAGP\fP/\fBYYSTAGN\fP/\fBYYMTAGP\fP/\fBYYMTAGN\fP in generic API) +.UNINDENT +.sp +Not all these are used in every case, but if used, 
they must be updated by +\fBYYFILL\fP\&. All active positions are contained in the segment between token and +cursor, therefore everything between buffer start and token can be discarded, +the segment from token and up to limit should be moved to the beginning of +buffer, and the free space at the end of buffer should be filled with new data. +In order to avoid frequent \fBYYFILL\fP calls it is best to fill in as many input +characters as possible (even though fewer characters might suffice to resume the +lexer). The details of \fBYYFILL\fP implementation are slightly different +depending on which EOF handling method is used: the case of EOF rule is somewhat +simpler than the case of bounds\-checking with padding. Also note that if +\fB\-f \-\-storable\-state\fP option is used, \fBYYFILL\fP has slightly different +semantics (described in the section about storable state). +.SS YYFILL with sentinel +.sp +If EOF rule is used, \fBYYFILL\fP is a function\-like primitive that accepts +no arguments and returns a value which is checked against zero. \fBYYFILL\fP +invocation is triggered by condition \fBYYLIMIT <= YYCURSOR\fP in C pointer API and +\fBYYLESSTHAN()\fP in generic API. A non\-zero return value means that \fBYYFILL\fP +has failed. A successful \fBYYFILL\fP call must supply at least one character and +adjust input positions accordingly. Limit must always be set to one after the +last input position in buffer, and the character at the limit position must be +the sentinel symbol specified by \fBre2c:eof\fP configuration. The pictures below +show the relative locations of input positions in buffer before and after +\fBYYFILL\fP call (sentinel symbol is marked with \fB#\fP, and the second picture +shows the case when there is not enough input to fill the whole buffer). 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-\-\-\-\-\-\-\-\-\-E\-> + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-\-\-\-\-\-\-\-\-\-E#\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-E (EOF) + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-E#........ + buffer, marker cursor limit + token +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses EOF rule. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT +module main; + +import core.stdc.string; +import core.stdc.stdio; + +enum BUFSIZE = 4095; + +struct Input { + FILE* file; + char[BUFSIZE + 1] buffer;// +1 for sentinel + char* yylimit, yycursor, yymarker, token; + bool eof; +}; + +private int fill(ref Input it) { + if (it.eof) return 1; + + const size_t shift = it.token \- it.buffer.ptr; + const size_t used = it.yylimit \- it.token; + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (shift < 1) return 2; + + // Shift buffer contents (discard everything up to the current token). + memmove(cast(void*)it.buffer.ptr, it.token, used); + it.yylimit \-= shift; + it.yycursor \-= shift; + it.yymarker \-= shift; + it.token \-= shift; + + // Fill free space at the end of buffer with new data from file. 
+ it.yylimit += fread(it.yylimit, 1, BUFSIZE \- used, it.file); + it.yylimit[0] = 0; + it.eof = it.yylimit < (it.buffer.ptr + BUFSIZE); + return 0; +} + +private int lex(ref Input yyrecord) { + int count = 0; + for (;;) { + yyrecord.token = yyrecord.yycursor; + /*!re2c + re2c:api = record; + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYFILL = \(dqfill(yyrecord) == 0\(dq; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1; } + $ { return count; } + str { ++count; continue; } + [ ]+ { continue; } + */ + } + assert(0); +} + +void main() { + const char[] fname = \(dqinput\(dq; + const char[] content = \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq; + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + FILE* f = fopen(fname.ptr, \(dqw\(dq); + for (int i = 0; i < BUFSIZE; ++i) { + fwrite(cast(const(void*)) content.ptr, 1, content.length \- 1, f); + } + fclose(f); + int count = 3 * BUFSIZE; // number of quoted strings written to file + + // Initialize lexer state: all pointers are at the end of buffer. + Input it; + it.file = fopen(fname.ptr, \(dqr\(dq); + it.yycursor = it.yymarker = it.token = it.yylimit = it.buffer.ptr + BUFSIZE; + it.eof = 0; + // Sentinel (at YYLIMIT pointer) is set to zero, which triggers YYFILL. + it.yylimit[0] = 0; + + // Run the lexer. + assert(lex(it) == count); + + // Cleanup: remove input file. + fclose(it.file); + remove(fname.ptr); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS YYFILL with padding +.sp +In the default case (when EOF rule is not used) \fBYYFILL\fP is a function\-like +primitive that accepts a single argument and does not return any value. +\fBYYFILL\fP invocation is triggered by condition \fB(YYLIMIT \- YYCURSOR) < n\fP in +C pointer API and \fBYYLESSTHAN(n)\fP in generic API. The argument passed to +\fBYYFILL\fP is the minimal number of characters that must be supplied. 
If it +fails to do so, \fBYYFILL\fP must not return to the lexer (for that reason it is +best implemented as a macro that returns from the calling function on failure). +In case of a successful \fBYYFILL\fP invocation the limit position must be set +either to one after the last input position in buffer, or to the end of +\fBYYMAXFILL\fP padding (in case \fBYYFILL\fP has successfully read at least \fBn\fP +characters, but not enough to fill the entire buffer). The pictures below show +the relative locations of input positions in buffer before and after \fBYYFILL\fP +invocation (\fBYYMAXFILL\fP padding on the second picture is marked with \fB#\fP +symbols). +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F (EOF) + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F############### + buffer, marker cursor limit + token <\- YYMAXFILL \-> +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses bounds\-checking with padding. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT +module main; + +import core.stdc.string; +import core.stdc.stdio; + +/*!max:re2c*/ +enum BufSize = (4096 \- YYMaxFill); + +struct Input { + FILE* file; + char[BufSize + YYMaxFill] buffer; + char* yylimit, yycursor, token; + bool eof; +}; + +private int fill(ref Input it, size_t need) { + if (it.eof) return 1; + + const size_t shift = it.token \- it.buffer.ptr; + const size_t used = it.yylimit \- it.token; + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (shift < need) return 2; + + // Shift buffer contents (discard everything up to the current token). + memmove(it.buffer.ptr, it.token, used); + it.yylimit \-= shift; + it.yycursor \-= shift; + it.token \-= shift; + + // Fill free space at the end of buffer with new data from file. + it.yylimit += fread(it.yylimit, 1, BufSize \- used, it.file); + + // If read less than expected, this is end of input => add zero padding + // so that the lexer can access characters at the end of buffer. + if (it.yylimit < it.buffer.ptr + BufSize) { + it.eof = true; + memset(it.yylimit, 0, YYMaxFill); + it.yylimit += YYMaxFill; + } + + return 0; +} + +private int lex(ref Input yyrecord) { + int count = 0; + for (;;) { + yyrecord.token = yyrecord.yycursor; + /*!re2c + re2c:api = record; + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYFILL = \(dqif (fill(yyrecord, @@) != 0) return \-1;\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // Check that it is the sentinel, not some unexpected null. + return yyrecord.token == yyrecord.yylimit \- YYMaxFill ? 
count : \-1; + } + str { ++count; continue; } + [ ]+ { continue; } + * { return \-1; } + */ + } + assert(0); +} + +void main() { + const char[] fname = \(dqinput\(dq; + const char[] content = \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq; + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + FILE* f = fopen(fname.ptr, \(dqw\(dq); + for (int i = 0; i < BufSize; ++i) { + fwrite(content.ptr, 1, content.length \- 1, f); + } + fclose(f); + int count = 3 * BufSize; // number of quoted strings written to file + + // Initialize lexer state: all pointers are at the end of buffer. + // This immediately triggers YYFILL, as the check \(gait.yycursor < it.yylimit\(ga fails. + Input it; + it.file = fopen(fname.ptr, \(dqr\(dq); + it.yycursor = it.token = it.yylimit = it.buffer.ptr + BufSize; + it.eof = 0; + + // Run the lexer. + assert(lex(it) == count); + + // Cleanup: remove input file. + fclose(it.file); + remove(fname.ptr); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH MULTIPLE BLOCKS +.sp +Sometimes it is necessary to have multiple interrelated lexers (for example, if +there is a high\-level state machine that transitions between lexer modes). This +can be implemented using multiple connected re2c blocks. Another option is to +use \fI\%start conditions\fP\&. +.sp +The implementation of connections between blocks depends on the target language. +In languages that have \fBgoto\fP statement (such as C/C++ and Go) one can have +all blocks in one function, each of them prefixed with a label. Transition from +one block to another is a simple \fBgoto\fP\&. +In languages that do not have \fBgoto\fP (such as Rust) it is necessary to use a +loop with a switch on a state variable, similar to the \fByystate\fP loop/switch +generated by re2c, or else wrap each block in a function and use function calls. +.sp +The example below uses multiple blocks to parse binary, octal, decimal and +hexadecimal numbers. 
Each base has its own block. The initial block determines +base and dispatches to other blocks. Common configurations are defined in a +separate block at the beginning of the program; they are inherited by the other +blocks. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT \-i +module main; + +enum ERROR = ulong.max; + +private void add(ulong BASE)(ref ulong u, int d) { + u = u * BASE + d; + if (u > uint.max) { u = ERROR; } +} + +private ulong parse_u32(const(char)* s) { + const(char)* yycursor = s, yymarker; + ulong u = 0; + + /*!re2c + re2c:yyfill:enable = 0; + re2c:define:YYCTYPE = char; + + end = \(dq\ex00\(dq; + + \(aq0b\(aq / [01] { goto bin; } + \(dq0\(dq { goto oct; } + \(dq\(dq / [1\-9] { goto dec; } + \(aq0x\(aq / [0\-9a\-fA\-F] { goto hex; } + * { return ERROR; } + */ +bin: + /*!re2c + end { return u; } + [01] { add!(2)(u, yycursor[\-1] \- \(aq0\(aq); goto bin; } + * { return ERROR; } + */ +oct: + /*!re2c + end { return u; } + [0\-7] { add!(8)(u, yycursor[\-1] \- \(aq0\(aq); goto oct; } + * { return ERROR; } + */ +dec: + /*!re2c + end { return u; } + [0\-9] { add!(10)(u, yycursor[\-1] \- \(aq0\(aq); goto dec; } + * { return ERROR; } + */ +hex: + /*!re2c + end { return u; } + [0\-9] { add!(16)(u, yycursor[\-1] \- \(aq0\(aq); goto hex; } + [a\-f] { add!(16)(u, yycursor[\-1] \- \(aqa\(aq + 10); goto hex; } + [A\-F] { add!(16)(u, yycursor[\-1] \- \(aqA\(aq + 10); goto hex; } + * { return ERROR; } + */ +} + +void main() { + assert(parse_u32(\(dq\(dq) == ERROR); + assert(parse_u32(\(dq1234567890\(dq) == 1234567890); + assert(parse_u32(\(dq0b1101\(dq) == 13); + assert(parse_u32(\(dq0x7Fe\(dq) == 2046); + assert(parse_u32(\(dq0644\(dq) == 420); + assert(parse_u32(\(dq9999999999\(dq) == ERROR); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH START CONDITIONS +.sp +Start conditions are enabled with \fB\-\-start\-conditions\fP option. They provide a +way to encode multiple interrelated automata within the same re2c block. 
+.sp +Each condition corresponds to a single automaton and has a unique name specified +by the user and a unique internal number defined by re2c. The numbers are used +to switch between conditions: the generated code uses \fBYYGETCONDITION\fP and +\fBYYSETCONDITION\fP primitives to get the current condition or set it to the +given number. Use \fB/*!conditions:re2c*/\fP directive or the \fB\-\-header\fP option +to generate numeric condition identifiers. Configuration +\fBre2c:cond:enumprefix\fP specifies the generated identifier prefix. +.sp +In condition mode every rule must be prefixed with a list of comma\-separated +condition names in angle brackets, or a wildcard \fB<*>\fP to denote all +conditions. The rule syntax is extended as follows: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB< cond\-list > regexp action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp => cond action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP, sets the current condition to \fBcond\fP and +executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp :=> cond\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and immediately transitions to \fBcond\fP (there is +no semantic action). +.TP +.B \fB<! cond\-list > action\fP +The \fBaction\fP is prepended to semantic actions of all rules for every +condition on the \fBcond\-list\fP\&. This may be used to deduplicate common +code. +.TP +.B \fB< > action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and executes the \fBaction\fP\&. +.TP +.B \fB< > => cond action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. 
It matches empty string, sets the current condition to +\fBcond\fP and executes the \fBaction\fP\&. +.TP +.B \fB< > :=> cond\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and immediately transitions to +\fBcond\fP\&. +.UNINDENT +.UNINDENT +.UNINDENT +.sp +The code re2c generates for conditions depends on whether re2c uses goto/label +approach or loop/switch approach to encode the automata. +.sp +In languages that have \fBgoto\fP statement (such as C/C++ and Go) conditions are +naturally implemented as blocks of code prefixed with labels of the form +\fByyc_<cond>\fP, where \fBcond\fP is a condition name (label prefix can be changed +with \fBre2c:cond:prefix\fP). Transitions between conditions are implemented using +\fBgoto\fP and condition labels. Before all conditions re2c generates an initial +switch on \fBYYGETCONDITION\fP that jumps to the start state of the current condition. +The shortcut rules \fB:=>\fP bypass the initial switch and jump directly to the +specified condition (\fBre2c:cond:goto\fP can be used to change the default +behavior). The rules with semantic actions do not automatically jump to the next +condition; this should be done by the user\-defined action code. +.sp +In languages that do not have \fBgoto\fP (such as Rust) re2c reuses the +\fByystate\fP variable to store condition numbers. Each condition gets a numeric +identifier equal to the number of its start state, and a switch between +conditions is no different than a switch between DFA states of a single +condition. There is no need for a separate initial condition switch. +(Since the same approach is used to implement storable states, +\fBYYGETCONDITION\fP/\fBYYSETCONDITION\fP are redundant if both storable states and +conditions are used). +.sp +The program below uses start conditions to parse binary, octal, decimal and +hexadecimal numbers. 
There is a single block where each base has its own +condition, and the initial condition is connected to all of them. User\-defined +variable \fBcond\fP stores the current condition number; it is initialized to the +number of the initial condition generated with \fB/*!conditions:re2c*/\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT \-ci +module main; + +enum ERROR = ulong.max; +/*!conditions:re2c*/ + +private void add(ulong BASE)(ref ulong u, int d) { + u = u * BASE + d; + if (u > uint.max) { u = ERROR; } +} + +private ulong parse_u32(const(char)* s) { + const(char)* yycursor = s, yymarker; + YYCond yycond = YYCond.yycinit; + ulong u = 0; + + /*!re2c + re2c:yyfill:enable = 0; + re2c:define:YYCTYPE = char; + + <*> * { return ERROR; } + <init> \(aq0b\(aq / [01] :=> bin + <init> \(dq0\(dq :=> oct + <init> \(dq\(dq / [1\-9] :=> dec + <init> \(aq0x\(aq / [0\-9a\-fA\-F] :=> hex + <bin, oct, dec, hex> \(dq\ex00\(dq { return u; } + <bin> [01] { add!(2)(u, yycursor[\-1] \- \(aq0\(aq); goto yyc_bin; } + <oct> [0\-7] { add!(8)(u, yycursor[\-1] \- \(aq0\(aq); goto yyc_oct; } + <dec> [0\-9] { add!(10)(u, yycursor[\-1] \- \(aq0\(aq); goto yyc_dec; } + <hex> [0\-9] { add!(16)(u, yycursor[\-1] \- \(aq0\(aq); goto yyc_hex; } + <hex> [a\-f] { add!(16)(u, yycursor[\-1] \- \(aqa\(aq + 10); goto yyc_hex; } + <hex> [A\-F] { add!(16)(u, yycursor[\-1] \- \(aqA\(aq + 10); goto yyc_hex; } + */ +} + + + +void main() { + assert(parse_u32(\(dq\(dq) == ERROR); + assert(parse_u32(\(dq1234567890\(dq) == 1234567890); + assert(parse_u32(\(dq0b1101\(dq) == 13); + assert(parse_u32(\(dq0x7Fe\(dq) == 2046); + assert(parse_u32(\(dq0644\(dq) == 420); + assert(parse_u32(\(dq9999999999\(dq) == ERROR); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH STORABLE STATE +.sp +With \fB\-\-storable\-state\fP option re2c generates a lexer that can store +its current state, return to the caller, and later resume operations exactly +where it left off. 
The default mode of operation in re2c is a \(dqpull\(dq model, +in which the lexer \(dqpulls\(dq more input whenever it needs it. This may be +unacceptable in cases when the input becomes available piece by piece (for +example, if the lexer is invoked by the parser, or if the lexer program +communicates via a socket protocol with some other program that must wait for a +reply from the lexer before it transmits the next message). Storable state +feature is intended exactly for such cases: it allows one to generate lexers that +work in a \(dqpush\(dq model. When the lexer needs more input, it stores its state and +returns to the caller. Later, when more input becomes available, the caller +resumes the lexer exactly where it stopped. There are a few changes necessary +compared to the \(dqpull\(dq model: +.INDENT 0.0 +.IP \(bu 2 +Define \fBYYGETSTATE()\fP and \fBYYSETSTATE(state)\fP primitives. +.IP \(bu 2 +Define \fByych\fP, \fByyaccept\fP (if used) and \fBstate\fP variables as a part of +persistent lexer state. The \fBstate\fP variable should be initialized to \fB\-1\fP\&. +.IP \(bu 2 +\fBYYFILL\fP should return to the outer program instead of trying to supply more +input. Return code should indicate that lexer needs more input. +.IP \(bu 2 +The outer program should recognize situations when lexer needs more input and +respond appropriately. +.IP \(bu 2 +Optionally use \fBgetstate:re2c\fP to generate \fBYYGETSTATE\fP switch detached +from the main lexer. This only works for languages that have \fBgoto\fP (not in +\fB\-\-loop\-switch\fP mode). +.IP \(bu 2 +Use \fBre2c:eof\fP and the \fI\%sentinel with bounds checks\fP method to handle the +end of input. Padding\-based method may not work because it is unclear when to +append padding: the current end of input may not be the ultimate end of input, +and appending padding too early may cut off a partially read greedy lexeme. 
+Furthermore, due to high\-level program logic getting more input may depend on +processing the lexeme at the end of buffer (which already is blocked due to +the end\-of\-input condition). +.UNINDENT +.sp +Here is an example of a \(dqpush\(dq model lexer that simulates reading packets from a +socket. The lexer loops until it encounters the end of input and returns to the +calling function. The calling function provides more input by \(dqsending\(dq the next +packet and resumes lexing. This process stops when all the packets have been +sent, or when there is an error. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT \-f +module main; + +import core.stdc.stdio; +import core.stdc.string; + +// Use a small buffer to cover the case when a lexeme doesn\(aqt fit. +// In real world use a larger buffer. +enum BUFSIZE = 10; + +struct State { + FILE* file; + char[BUFSIZE + 1] buffer; + char* yylimit, yycursor, yymarker, token; + int yystate; +}; + +enum Status {END, READY, WAITING, BAD_PACKET, BIG_PACKET}; + +private Status fill(ref State st) { + const size_t shift = st.token \- cast(char*)st.buffer; + const size_t used = st.yylimit \- st.token; + const size_t free = BUFSIZE \- used; + + // Error: no space. In real life can reallocate a larger buffer. + if (free < 1) return Status.BIG_PACKET; + + // Shift buffer contents (discard already processed data). + memmove(cast(void*)st.buffer, st.token, used); + st.yylimit \-= shift; + st.yycursor \-= shift; + st.yymarker \-= shift; + st.token \-= shift; + + // Fill free space at the end of buffer with new data. 
+ const size_t read = fread(st.yylimit, 1, free, st.file); + st.yylimit += read; + st.yylimit[0] = 0; // append sentinel symbol + + return Status.READY; +} + +private Status lex(ref State yyrecord, uint* recv) { + char yych; + /*!getstate:re2c*/ + + for (;;) { + yyrecord.token = yyrecord.yycursor; + /*!re2c + re2c:api = record; + re2c:define:YYCTYPE = char; + re2c:define:YYFILL = \(dqreturn Status.WAITING;\(dq; + re2c:eof = 0; + + packet = [a\-z]+[;]; + + * { return Status.BAD_PACKET; } + $ { return Status.END; } + packet { *recv = *recv + 1; continue; } + */ + } + assert(0); // unreachable +} + +private void test(string[] packets, Status expect) { + // Create a pipe (open the same file for reading and writing). + const(char*) fname = \(dqpipe\(dq; + FILE* fw = fopen(fname, \(dqw\(dq); + FILE* fr = fopen(fname, \(dqr\(dq); + setvbuf(fw, null, _IONBF, 0); + setvbuf(fr, null, _IONBF, 0); + + // Initialize lexer state: \(gastate\(ga value is \-1, all pointers are at the end + // of buffer. + State st; + st.file = fr; + st.yycursor = st.yymarker = st.token = st.yylimit = cast(char*)st.buffer + BUFSIZE; + // Sentinel (at YYLIMIT pointer) is set to zero, which triggers YYFILL. + st.yylimit[0] = 0; + st.yystate = \-1; + + // Main loop. The buffer contains incomplete data which appears packet by + // packet. When the lexer needs more input it saves its internal state and + // returns to the caller which should provide more input and resume lexing. 
+ Status status; + uint send = 0, recv = 0; + for (;;) { + status = lex(st, &recv); + if (status == Status.END) { + debug{printf(\(dqdone: got %u packets\en\(dq, recv);} + break; + } else if (status == Status.WAITING) { + debug{printf(\(dqwaiting...\en\(dq);} + if (send < packets.length) { + debug{printf(\(dqsent packet %u\en\(dq, send);} + fprintf(fw, \(dq%s\(dq, cast(char*)packets[send]); + ++send; + } + status = fill(st); + debug{printf(\(dqqueue: \(aq%s\(aq\en\(dq, cast(char*)st.buffer);} + if (status == Status.BIG_PACKET) { + debug{printf(\(dqerror: packet too big\en\(dq);} + break; + } + assert(status == Status.READY); + } else { + assert(status == Status.BAD_PACKET); + debug{printf(\(dqerror: ill\-formed packet\en\(dq);} + break; + } + } + + // Check results. + assert(status == expect); + if (status == Status.END) assert(recv == send); + + // Cleanup: remove input file. + fclose(fw); + fclose(fr); + remove(fname); +} + +void main() { + string[] packets1 = []; + string[] packets2 = [\(dqzero;\(dq, \(dqone;\(dq, \(dqtwo;\(dq, \(dqthree;\(dq, \(dqfour;\(dq]; + string[] packets3 = [\(dqzer0;\(dq]; + string[] packets4 = [\(dqlooooooooooong;\(dq]; + + test(packets1, Status.END); + test(packets2, Status.END); + test(packets3, Status.BAD_PACKET); + test(packets4, Status.BIG_PACKET); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH REUSABLE BLOCKS +.sp +Reusable blocks are re2c blocks that can be reused any number of times and +combined with other re2c blocks. They are defined with +\fB/*!rules:re2c[:<name>] ... */\fP (the \fB<name>\fP is optional). A rules block +can be used in two contexts: either in a use block, or in a use directive inside +of another block. The code for a rules block is generated at every point of use. +.sp +Use blocks are defined with \fB/*!use:re2c[:<name>] ... */\fP\&. The \fB<name>\fP +is optional; if not specified, the associated rules block is the most recent one +(whether named or unnamed). A use block can add named definitions, +configurations and rules of its own.
+An important use case for use blocks is a lexer that supports multiple input +encodings: the same rules block is reused multiple times with encoding\-specific +configurations (see the example below). +.sp +In\-block use directive \fB!use:<name>;\fP can be used from inside of a re2c +block. It merges the referenced block \fB<name>\fP into the current one. If some +of the merged rules and configurations overlap with the previously defined ones, +conflicts are resolved in the usual way: the earliest rule takes priority, and +latest configuration overrides preceding ones. One exception is the special +rules \fB*\fP, \fB$\fP and (in condition mode) \fB<!>\fP, for which a block\-local +definition overrides any inherited ones. Use directive allows one to combine +different re2c blocks together in one block (see the example below). +.sp +Named blocks and in\-block use directive were added in re2c version 2.2. +Since that version reusable blocks are allowed by default (no special option +is needed). Before version 2.2 reuse mode was enabled with \fB\-r \-\-reusable\fP +option. Before version 1.2 reusable blocks could not be mixed with normal +blocks. +.SS Example of a \fB!use\fP directive +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT +module main; + +// This example shows how to combine reusable re2c blocks: two blocks +// (\(aqcolors\(aq and \(aqfish\(aq) are merged into one. The \(aqsalmon\(aq rule occurs +// in both blocks; the \(aqfish\(aq block takes priority because it is used +// earlier. Default rule * occurs in all three blocks; the local (not +// inherited) definition takes priority.
+ +enum What { COLOR, FISH, DUNNO }; + +/*!rules:re2c:colors + * { assert(false); } + \(dqred\(dq | \(dqsalmon\(dq | \(dqmagenta\(dq { return What.COLOR; } +*/ + +/*!rules:re2c:fish + * { assert(false); } + \(dqhaddock\(dq | \(dqsalmon\(dq | \(dqeel\(dq { return What.FISH; } +*/ + +private What lex(const(char)* s) { + const(char)* yycursor = s, yymarker; + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:yyfill:enable = 0; + + !use:fish; + !use:colors; + * { return What.DUNNO; } // overrides inherited \(aq*\(aq rules + */ +} + +void main() { + assert(lex(\(dqsalmon\(dq) == What.FISH); + assert(lex(\(dqwhat?\(dq) == What.DUNNO); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Example of a \fB/*!use:re2c ... */\fP block +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT \-\-input\-encoding utf8 +module main; + +import std.stdint; + +// This example supports multiple input encodings: UTF\-8 and UTF\-32. +// Both lexers are generated from the same rules block, and the use +// blocks add only encoding\-specific configurations. +/*!rules:re2c + re2c:yyfill:enable = 0; + + \(dq∀x ∃y\(dq { return 0; } + * { return 1; } +*/ + +private int lex_utf8(const(uint8_t)* s) { + const(uint8_t)* yycursor = s, yymarker; + /*!use:re2c + re2c:define:YYCTYPE = uint8_t; + re2c:encoding:utf8 = 1; + */ +} + +private int lex_utf32(const(uint32_t)* s) { + const(uint32_t)* yycursor = s, yymarker; + /*!use:re2c + re2c:define:YYCTYPE = uint32_t; + re2c:encoding:utf32 = 1; + */ +} + +void main() { + immutable uint8_t[] s8 = // UTF\-8 + [ 0xe2, 0x88, 0x80, 0x78, 0x20, 0xe2, 0x88, 0x83, 0x79 ]; + + immutable uint32_t[] s32 = // UTF32 + [ 0x00002200, 0x00000078, 0x00000020, 0x00002203, 0x00000079 ]; + + assert(lex_utf8(cast(const(uint8_t)*)s8) == 0); + assert(lex_utf32(cast(const(uint32_t)*)s32) == 0); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SUBMATCH EXTRACTION +.sp +re2c has two options for submatch extraction. 
+.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration. Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string. +\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars:re2c\fP and \fBmvars:re2c\fP directives. +If there is more than one way tags can be matched against the input, +ambiguity is resolved using leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration. There are two +flavours for different disambiguation policies: \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy. In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted. +The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&.
re2c provides a directive \fBmaxnmatch:re2c\fP +that defines \fBYYMAXNMATCH\fP, a constant equal to the maximum value of +\fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration. The only difference from \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2c generates variables \fByytl<k>\fP and +\fByytr<k>\fP for the \fIk\fP\-th capturing group (the user should declare these with +\fBsvars:re2c\fP directive). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars\fP for POSIX longest\-match policy. +.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there are no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags:re2c\fP or \fBmtags:re2c\fP directives. +If the lexer state is stored, tag variables should be part of it. They also +need to be updated by \fBYYFILL\fP\&.
+.sp +S\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +save input position to an s\-tag: \fBt = YYCURSOR\fP with C pointer API or a +user\-defined operation \fBYYSTAGP(t)\fP with generic API +.IP \(bu 2 +save default value to an s\-tag: \fBt = NULL\fP with C pointer API or a +user\-defined operation \fBYYSTAGN(t)\fP with generic API +.IP \(bu 2 +copy one s\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +M\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +append input position to an m\-tag: a user\-defined operation \fBYYMTAGP(t)\fP +with both default and generic API +.IP \(bu 2 +append default value to an m\-tag: a user\-defined operation \fBYYMTAGN(t)\fP +with both default and generic API +.IP \(bu 2 +copy one m\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +S\-tags can be implemented as scalar values (pointers or offsets). M\-tags need a +more complex representation, as they need to store a sequence of tag values. The +most naive and inefficient representation of an m\-tag is a list (array, vector) +of tag values; a more efficient representation is to store all m\-tags in a +prefix\-tree represented as array of nodes \fB(v, p)\fP, where \fBv\fP is tag value +and \fBp\fP is a pointer to parent node. +.sp +Here is a simple example of using s\-tags to parse semantic versions consisting +of three numeric components: major, minor, patch (the latter is optional). +See below for a more complex example that uses \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT +module main; + +struct SemVer { + int major; + int minor; + int patch; +}; + +private int s2n(const(char)* s, const(char)* e) { // pre\-parsed string to number + int n = 0; + for (; s < e; ++s) n = n * 10 + (*s \- \(aq0\(aq); + return n; +} + +private bool lex(const(char)* str, ref SemVer ver) { + const(char)* yycursor = str, yymarker; + + // Final tag variables available in semantic action. 
+ /*!svars:re2c format = \(aqconst(char)* @@;\en\(aq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(aqconst(char)* @@;\en\(aq; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:tags = 1; + re2c:define:YYCTYPE = \(dqchar\(dq; + + num = [0\-9]+; + + @t1 num @t2 \(dq.\(dq @t3 num @t4 (\(dq.\(dq @t5 num)? [\ex00] { + ver.major = s2n(t1, t2); + ver.minor = s2n(t3, t4); + ver.patch = t5 != null ? s2n(t5, yycursor \- 1) : 0; + return true; + } + * { return false; } + */ +} + +void main() { + SemVer v; + assert(lex(\(dq23.34\(dq, v) && v.major == 23 && v.minor == 34 && v.patch == 0); + assert(lex(\(dq1.2.999\(dq, v) && v.major == 1 && v.minor == 2 && v.patch == 999); + assert(!lex(\(dq1.a\(dq, v)); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is a more complex example of using s\-tags with \fBYYFILL\fP to parse a file +with newline\-separated semantic versions. Tag variables are part of the lexer +state, and they are adjusted in \fBYYFILL\fP like other input positions. +Note that it is necessary for s\-tags because their values are invalidated after +shifting buffer contents. It may not be necessary in a custom implementation +where tag variables store offsets relative to the start of the input string +rather than the buffer, which may be the case with m\-tags. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT \-\-tags +module main; + +import core.stdc.string; +import core.stdc.stdio; +import std.stdio; + +enum BUFSIZE = 4095; + +struct Input { + FILE* file; + char[BUFSIZE + 1] buffer;// +1 for sentinel + char* yylimit, yycursor, yymarker, token; + // Intermediate tag variables must be part of the lexer state passed to YYFILL. + // They don\(aqt correspond to tags and should be autogenerated by re2c. 
+ /*!stags:re2c format = \(aqchar* @@;\(aq; */ + bool eof; +}; + +struct SemVer { + int major; + int minor; + int patch; +}; + +private int s2n(const(char)* s, const(char)* e) { // pre\-parsed string to number + int n = 0; + for (; s < e; ++s) n = n * 10 + (*s \- \(aq0\(aq); + return n; +} + +private int fill(ref Input it) { + if (it.eof) return 1; + + const size_t shift = it.token \- it.buffer.ptr; + const size_t used = it.yylimit \- it.token; + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (shift < 1) return 2; + + // Shift buffer contents (discard everything up to the current token). + memmove(cast(void*)it.buffer.ptr, it.token, used); + it.yylimit \-= shift; + it.yycursor \-= shift; + it.yymarker \-= shift; + it.token \-= shift; + // Tag variables need to be shifted like other input positions. The check + // for non\-null is only needed if some tags are nested inside of alternative + // or repetition, so that they can have null value. + /*!stags:re2c format = \(dqif (it.@@) it.@@ \-= shift;\en\(dq; */ + + // Fill free space at the end of buffer with new data from file. + it.yylimit += fread(it.yylimit, 1, BUFSIZE \- used, it.file); + it.yylimit[0] = 0; + it.eof = it.yylimit < (it.buffer.ptr + BUFSIZE); + return 0; +} + +private bool lex(ref Input yyrecord, ref SemVer[] vers) { + // Final variables available in semantic actions. + /*!svars:re2c format = \(aqchar* @@;\(aq; */ + for (;;) { + yyrecord.token = yyrecord.yycursor; + /*!re2c + re2c:api = record; + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYFILL = \(dqfill(yyrecord) == 0\(dq; + re2c:eof = 0; + + num = [0\-9]+; + + num @t1 \(dq.\(dq @t2 num @t3 (\(dq.\(dq @t4 num)? [\en] { + int major = s2n(yyrecord.token, t1); + int minor = s2n(t2, t3); + int patch = t4 != null ? 
s2n(t4, yyrecord.yycursor \- 1) : 0; + SemVer ver = SemVer(major, minor, patch); + vers ~= ver; + continue; + } + $ { return true; } + * { return false; } + */ + } + assert(0); +} + +void main() { + const char[] fname = \(dqinput\(dq; + const char[] content = \(dq1.22.333\en\(aq \(dq; + + SemVer[BUFSIZE] expect = SemVer(1, 22, 333); + SemVer[] actual; + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + FILE* f = fopen(fname.ptr, \(dqw\(dq); + for (int i = 0; i < BUFSIZE; ++i) { + fwrite(cast(const(void*)) content.ptr, 1, content.length \- 2, f); // skip null\-terminator + } + fclose(f); + + // Initialize lexer state: all pointers are at the end of buffer. + Input it; + it.file = fopen(fname.ptr, \(dqr\(dq); + it.yycursor = it.yymarker = it.token = it.yylimit = it.buffer.ptr + BUFSIZE; + it.eof = 0; + // Sentinel (at YYLIMIT pointer) is set to zero, which triggers YYFILL. + it.yylimit[0] = 0; + + // Run the lexer. + assert(lex(it, actual) && actual == expect); + + // Cleanup: remove input file. + fclose(it.file); + remove(fname.ptr); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using capturing groups to parse semantic versions. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT +module main; + +struct SemVer { int major, minor, patch; }; + +private int s2n(const(char)* s, const(char)* e) { // pre\-parsed string to number + int n = 0; + for (; s < e; ++s) n = n * 10 + (*s \- \(aq0\(aq); + return n; +} + +private bool lex(const(char)* str, ref SemVer ver) { + const(char)* yycursor = str, yymarker; + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(aqconst(char)* @@;\en\(aq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). 
+ /*!stags:re2c format = \(aqconst(char)* @@;\en\(aq; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:captvars = 1; + re2c:define:YYCTYPE = \(dqchar\(dq; + + num = [0\-9]+; + + (num) \(dq.\(dq (num) (\(dq.\(dq num)? [\ex00] { + ver.major = s2n(yytl1, yytr1); + ver.minor = s2n(yytl2, yytr2); + ver.patch = yytl3 ? s2n(yytl3 + 1, yytr3) : 0; + return true; + } + * { return false; } + */ +} + +void main() { + SemVer v; + assert(lex(\(dq23.34\(dq, v) && v.major == 23 && v.minor == 34 && v.patch == 0); + assert(lex(\(dq1.2.999\(dq, v) && v.major == 1 && v.minor == 2 && v.patch == 999); + assert(!lex(\(dq1.a\(dq, v)); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using m\-tags to parse a version with a variable number of +components. Tag variables are stored in a trie. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT +module main; + +enum MtagRoot = \-1; + +// An m\-tag tree is a way to store histories with an O(1) copy operation. +// Histories naturally form a tree, as they have common start and fork at some +// point. The tree is stored as an array of pairs (tag value, link to parent). +// An m\-tag is represented with a single link in the tree (array index). +struct Mtag { + const(char)* elem; // tag value + int pred; // index of the predecessor node or root +}; + +alias MtagTrie = Mtag[]; +alias Ver = int[]; + +private int s2n(const(char)* s, const(char)* e) { // pre\-parsed string to number + int n = 0; + for (; s < e; ++s) n = n * 10 + (*s \- \(aq0\(aq); + return n; +} + +// Append a single value to an m\-tag history. +private void add_mtag(ref MtagTrie trie, ref int mtag, const(char)* value) { + Mtag m = {value, mtag}; + mtag = cast(int)trie.length; + trie ~= [m]; +} + +// Recursively unwind tag histories and collect version components. +private void unfold(const ref MtagTrie trie, int x, int y, ref Ver ver) { + // Reached the root of the m\-tag tree, stop recursion. 
+ if (x == MtagRoot && y == MtagRoot) return; + + // Unwind history further. + unfold(trie, trie[x].pred, trie[y].pred, ver); + + // Get tag values. Tag histories must have equal length. + assert(x != MtagRoot && y != MtagRoot); + const(char)* ex = trie[x].elem, ey = trie[y].elem; + + if (ex != null && ey != null) { + // Both tags are valid pointers, extract component. + ver ~= [s2n(ex, ey)]; + } else { + // Both tags are null (this corresponds to zero repetitions). + assert(ex == null && ey == null); + } +} + +private bool parse(const(char)* str, ref Ver ver) { + const(char)* yycursor = str, yymarker; + MtagTrie mt; + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqconst(char)* @@;\(dq; */ + /*!mvars:re2c format = \(dqint @@;\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(dqconst(char)* @@ = null;\(dq; */ + /*!mtags:re2c format = \(dqint @@ = MtagRoot;\(dq; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:tags = 1; + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYMTAGP = \(dqadd_mtag(mt, @@, yycursor);\(dq; + re2c:define:YYMTAGN = \(dqadd_mtag(mt, @@, null);\(dq; + + num = [0\-9]+; + @t1 num @t2 (\(dq.\(dq #t3 num #t4)* [\ex00] { + ver = []; + ver ~= [s2n(t1, t2)]; + unfold(mt, t3, t4, ver); + return true; + } + * { return false; } + */ +} + +void main() { + Ver v; + assert(parse(\(dq1\(dq, v) && v == [1]); + assert(parse(\(dq1.2.3.4.5.6.7\(dq, v) && v == [1, 2, 3, 4, 5, 6, 7]); + assert(!parse(\(dq1.2.\(dq, v)); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH ENCODING SUPPORT +.sp +It is necessary to understand the difference between \fBcode points\fP and +\fBcode units\fP\&. A code point is a numeric identifier of a symbol. A code unit is +the smallest unit of storage in the encoded text. A single code point may be +represented with one or more code units. In a fixed\-length encoding all code +points are represented with the same number of code units. 
In a variable\-length +encoding code points may be represented with a different number of code units. +Note that the \(dqany\(dq rule \fB[^]\fP matches any code point, but not necessarily +any code unit (the only way to match any code unit regardless of the encoding +is the default rule \fB*\fP). +The generated lexer works with a stream of code units: \fByych\fP stores a code +unit, and \fBYYCTYPE\fP is the code unit type. Regular expressions, on the other +hand, are specified in terms of code points. When re2c compiles regular +expressions to automata it translates code points to code units. This is +generally not a simple mapping: in variable\-length encodings a single code point +range may get translated to a complex code unit graph. +The following encodings are supported: +.INDENT 0.0 +.IP \(bu 2 +\fBASCII\fP (enabled by default). It is a fixed\-length encoding with code space +\fB[0\-255]\fP and 1\-byte code points and code units. +.IP \(bu 2 +\fBEBCDIC\fP (enabled with \fB\-\-ebcdic\fP or \fBre2c:encoding:ebcdic\fP). It is a +fixed\-length encoding with code space \fB[0\-255]\fP and 1\-byte code points and +code units. +.IP \(bu 2 +\fBUCS2\fP (enabled with \fB\-\-ucs2\fP or \fBre2c:encoding:ucs2\fP). It is a +fixed\-length encoding with code space \fB[0\-0xFFFF]\fP and 2\-byte code points +and code units. +.IP \(bu 2 +\fBUTF8\fP (enabled with \fB\-\-utf8\fP or \fBre2c:encoding:utf8\fP). It is a +variable\-length Unicode encoding. Code unit size is 1 byte. Code points are +represented with 1 \-\- 4 code units. +.IP \(bu 2 +\fBUTF16\fP (enabled with \fB\-\-utf16\fP or \fBre2c:encoding:utf16\fP). It is a +variable\-length Unicode encoding. Code unit size is 2 bytes. Code points are +represented with 1 \-\- 2 code units. +.IP \(bu 2 +\fBUTF32\fP (enabled with \fB\-\-utf32\fP or \fBre2c:encoding:utf32\fP). It is a +fixed\-length Unicode encoding with code space \fB[0\-0x10FFFF]\fP and 4\-byte code +points and code units. 
+.UNINDENT +.sp +Include file \fBinclude/unicode_categories.re\fP provides re2c definitions for the +standard Unicode categories. +.sp +Option \fB\-\-input\-encoding\fP specifies source file encoding, which can be used to +enable Unicode literals in regular expressions. For example +\fB\-\-input\-encoding utf8\fP tells re2c that the source file is in UTF8 (it differs +from \fB\-\-utf8\fP which sets input text encoding). Option \fB\-\-encoding\-policy\fP +specifies the way re2c handles Unicode surrogates (code points in range +\fB[0xD800\-0xDFFF]\fP). +.sp +Below is an example of a lexer for UTF8 encoded Unicode identifiers. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT \-8 \-i +module main; + +/*!include:re2c \(dqunicode_categories.re\(dq */ + +private int lex(const(char)* s) { + const(char)* yycursor = s, yymarker; + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:yyfill:enable = 0; + + // Simplified \(dqUnicode Identifier and Pattern Syntax\(dq + // (see https://unicode.org/reports/tr31) + id_start = L | Nl | [$_]; + id_continue = id_start | Mn | Mc | Nd | Pc | [\eu200D\eu05F3]; + identifier = id_start id_continue*; + identifier { return 0; } + * { return 1; } + */ +} + +void main() { + assert(lex(\(dq_Ыдентификатор\(dq) == 0); + assert(lex(\(dq!!!\(dq)==1); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH INCLUDE FILES +.sp +re2c allows one to include other files using directive \fB/*!include:re2c FILE */\fP +or \fB!include FILE ;\fP, where \fBFILE\fP is a path to the file to be included. +The first form should be used outside of re2c blocks, and the second form allows +one to include a file in the middle of a re2c block. re2c looks for included +files in the directory of the including file and in include locations, which +can be specified with \fB\-I\fP option. +Include directives in re2c work in the same way as C/C++ \fB#include\fP: the contents +of \fBFILE\fP are copy\-pasted verbatim in place of the directive. 
Include files +may have further includes of their own. Use \fB\-\-depfile\fP option to track build +dependencies of the output file on include files. +re2c provides some predefined include files that can be found in the +\fBinclude/\fP subdirectory of the project. These files contain definitions that +can be useful to other projects (such as Unicode categories) and form something +like a standard library for re2c. +Below is an example of using include directive. +.SS Include file 1 (definitions.d) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +enum Result{ OK, FAIL }; + +/*!re2c + number = [1\-9][0\-9]*; +*/ + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Include file 2 (extra_rules.re.inc) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// floating\-point numbers +frac = [0\-9]* \(dq.\(dq [0\-9]+ | [0\-9]+ \(dq.\(dq; +exp = \(aqe\(aq [+\-]? [0\-9]+; +float = frac exp? | [0\-9]+ exp; + +float { return Result.OK; } + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT \-i + +/*!include:re2c \(dqdefinitions.d\(dq */ + +private Result lex(const(char)* s) { + const(char)* yycursor = s, yymarker; + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:yyfill:enable = 0; + + * { return Result.FAIL; } + number { return Result.OK; } + !include \(dqextra_rules.re.inc\(dq; + */ +} + +void main() { + assert(lex(\(dq123\(dq) == Result.OK); + assert(lex(\(dq123.4567\(dq) == Result.OK); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH HEADER FILES +.sp +re2c allows one to generate header file from the input \fB\&.re\fP file using option +\fB\-t\fP, \fB\-\-type\-header\fP or configuration \fBre2c:flags:type\-header\fP and +directives \fB/*!header:re2c:on*/\fP and \fB/*!header:re2c:off*/\fP\&. The first directive +marks the beginning of header file, and the second directive marks the end of +it. 
Everything between these directives is processed by re2c, and the generated +code is written to the file specified by the \fB\-t \-\-type\-header\fP option (or +\fBstdout\fP if this option was not used). Autogenerated header file may be needed +in cases when re2c is used to generate definitions of constants, variables and +structs that must be visible from other translation units. +.sp +Here is an example of generating a header file that contains the definition of the +lexer state with tag variables (the number of variables depends on the regular +grammar and is unknown to the programmer). +.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2d $INPUT \-o $OUTPUT \-i \-\-header lexer/state.d +module main; + +import core.stdc.stddef; +import lexer.state; // the module is generated by re2c + +/*!header:re2c:on*/ +module lexer.state; + +struct LexerState { + const(char)* str, yycursor; + /*!stags:re2c format = \(dqconst(char)* @@;\(dq; */ +}; +/*!header:re2c:off*/ + +private long lex(ref LexerState yyrecord) { + const(char)* t; + /*!re2c + re2c:api = record; + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:tags = 1; + re2c:yyfill:enable = 0; + re2c:header = \(dqlexer/state.d\(dq; + + [a]* @t [b]* { return t \- yyrecord.str; } + */ +} + +void main() { + const(char)* s = \(dqab\(dq; + LexerState st = {s, s /*!stags:re2c format = \(dq, null\(dq; */}; + assert(lex(st) == 1); + + const(char)* s2 = \(dqaaabbbbbbbb\(dq; + LexerState st2 = {s2, s2 /*!stags:re2c format = \(dq, null\(dq; */}; + assert(lex(st2) == 3); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Header file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +/* Generated by re2c */ + +module lexer.state; + +struct LexerState { + const(char)* str, yycursor; + const(char)* yyt1; +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SKELETON PROGRAMS +.sp +With the \fB\-S, \-\-skeleton\fP option, re2c ignores all non\-re2c code and generates +a self\-contained C program that can be further compiled and executed.
The +program consists of lexer code and input data. For each constructed DFA (block +or condition) re2c generates a standalone lexer and two files: an \fB\&.input\fP +file with strings derived from the DFA and a \fB\&.keys\fP file with expected match +results. The program runs each lexer on the corresponding \fB\&.input\fP file and +compares results with the expectations. +Skeleton programs are very useful for a number of reasons: +.INDENT 0.0 +.IP \(bu 2 +They can check correctness of various re2c optimizations (the data is +generated early in the process, before any DFA transformations have taken +place). +.IP \(bu 2 +Generating a set of input data with good coverage may be useful for both +testing and benchmarking. +.IP \(bu 2 +Generating self\-contained executable programs allows one to get minimized test +cases (the original code may be large or have a lot of dependencies). +.UNINDENT +.sp +The difficulty with generating input data is that for all but the most trivial +cases the number of possible input strings is too large (even if the string +length is limited). re2c solves this difficulty by generating sufficiently +many strings to cover almost all DFA transitions. It uses the following +algorithm. First, it constructs a skeleton of the DFA. For encodings with 1\-byte +code unit size (such as ASCII, UTF\-8 and EBCDIC) skeleton is just an exact copy +of the original DFA. For encodings with multibyte code units skeleton is a copy +of DFA with certain transitions omitted: namely, re2c takes at most 256 code +units for each disjoint continuous range that corresponds to a DFA transition. +The chosen values are evenly distributed and include range bounds. Instead of +trying to cover all possible paths in the skeleton (which is infeasible) re2c +generates sufficiently many paths to cover all skeleton transitions, and thus +trigger the corresponding conditional jumps in the lexer. 
+The algorithm implementation is limited by ~1Gb of transitions and consumes +constant amount of memory (re2c writes data to file as soon as it is generated). +.SH VISUALIZATION AND DEBUG +.sp +With the \fB\-D, \-\-emit\-dot\fP option, re2c does not generate code. Instead, +it dumps the generated DFA in DOT format. +One can convert this dump to an image of the DFA using Graphviz or another library. +Note that this option shows the final DFA after it has gone through a number of +optimizations and transformations. Earlier stages can be dumped with various debug +options, such as \fB\-\-dump\-nfa\fP, \fB\-\-dump\-dfa\-raw\fP etc. (see the full list of options). +.SH SEE ALSO +.sp +You can find more information about re2c at the official website: \fI\%http://re2c.org\fP\&. +Similar programs are flex(1), lex(1), quex(\fI\%http://quex.sourceforge.net\fP). +.SH AUTHORS +.sp +re2c was originally written by Peter Bumbulis (\fI\%peter@csg.uwaterloo.ca\fP) in 1993. +Marcus Boerger and Dan Nuffer spent several years to turn the original idea into +a production ready code generator. Since then it has been maintained and +developed by multiple volunteers, most notably, +Brian Young (\fI\%bayoung@acm.org\fP), +\fI\%Marcus Boerger\fP, +Dan Nuffer (\fI\%nuffer@users.sourceforge.net\fP), +\fI\%Ulya Trofimovich\fP (\fI\%skvadrik@gmail.com\fP), +\fI\%Serghei Iakovlev\fP, +\fI\%Sergei Trofimovich\fP, +\fI\%Petr Skocik\fP, +\fI\%ligfx\fP +and \fI\%raekye\fP\&. +.\" Generated by docutils manpage writer. +. diff --git a/bootstrap/doc/re2go.1 b/bootstrap/doc/re2go.1 index 0f8e280c3..2fc1d9a65 100644 --- a/bootstrap/doc/re2go.1 +++ b/bootstrap/doc/re2go.1 @@ -236,8 +236,8 @@ program: .TP .B \fBSimple API\fP (\fIadded in version 4.0\fP) -This is a basic API that can be enabled with option \fB\-\-api simple\fP or -configuration \fBre2c:api = simple\fP\&. 
It consists of the following +This is a basic API that can be enabled with \fB\-\-api simple\fP option or +\fBre2c:api = simple\fP configuration. It consists of the following primitives: \fBYYINPUT\fP (which should be defined as a sequence of code units, e.g. a string) and \fBYYCURSOR\fP, \fBYYMARKER\fP, \fBYYCTXMARKER\fP, \fBYYLIMIT\fP (which should be defined as indices in \fBYYINPUT\fP). @@ -249,8 +249,8 @@ units, e.g. a string) and \fBYYCURSOR\fP, \fBYYMARKER\fP, \fBYYCTXMARKER\fP, .B \fBRecord API\fP (\fIadded in version 4.0\fP) Record API is useful in cases when lexer state must be stored in a struct. -It is enabled with option \fB\-\-api record\fP or configuration -\fBre2c:api = record\fP\&. This API consists of a variable \fByyrecord\fP (the +It is enabled with \fB\-\-api record\fP option or \fBre2c:api = record\fP +configuration. This API consists of a variable \fByyrecord\fP (the name can be overridden with \fBre2c:variable:yyrecord\fP) that should be defined as a struct with fields \fByyinput\fP, \fByycursor\fP, \fByymarker\fP, \fByyctxmarker\fP, \fByylimit\fP (only the fields used by the generated code @@ -261,10 +261,8 @@ need to be defined, and their names can be configured). .sp .TP .B \fBGeneric API\fP -(\fIadded in version 0.14\fP) -This is the default API for the Go backend. It is enabled with -\fB\-\-api generic\fP option or \fBre2c:api = generic\fP configuration. -This API contains primitives for generic operations: +This is the most flexible API and the default API for the Go backend. +It contains primitives for generic operations: \fBYYPEEK\fP, \fBYYSKIP\fP, \fBYYBACKUP\fP, @@ -2823,53 +2821,64 @@ func main() { .SH SUBMATCH EXTRACTION .sp re2c has two options for submatch extraction. -.sp -The first option is \fB\-T \-\-tags\fP\&. With this option one can use standalone tags -of the form \fB@stag\fP and \fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary -used\-defined names. 
Tags can be used anywhere inside of a regular expression; -semantically they are just position markers. Tags of the form \fB@stag\fP are -called s\-tags: they denote a single submatch value (the last input position -where this tag matched). Tags of the form \fB#mtag\fP are called m\-tags: they -denote multiple submatch values (the whole history of repetitions of this tag). -All tags should be defined by the user as variables with the corresponding -names. With standalone tags re2c uses leftmost greedy disambiguation: submatch -positions correspond to the leftmost matching path through the regular -expression. -.sp -The second option is \fB\-P \-\-posix\-captures\fP: it enables POSIX\-compliant -capturing groups. In this mode parentheses in regular expressions denote the -beginning and the end of capturing groups; the whole regular expression is group -number zero. The number of groups for the matching rule is stored in a variable -\fByynmatch\fP, and submatch results are stored in \fByypmatch\fP array. Both -\fByynmatch\fP and \fByypmatch\fP should be defined by the user, and \fByypmatch\fP -size must be at least \fB[yynmatch * 2]\fP\&. re2c provides a directive -\fB/*!maxnmatch:re2c*/\fP that defines \fBYYMAXNMATCH\fP: a constant equal to the -maximal value of \fByynmatch\fP among all rules. Note that re2c implements -POSIX\-compliant disambiguation: each subexpression matches as long as possible, -and subexpressions that start earlier in regular expression have priority over -those starting later. Capturing groups are translated into s\-tags under the -hood, therefore we use the word \(dqtag\(dq to describe them as well. -.sp -With both \fB\-P \-\-posix\-captures\fP and \fBT \-\-tags\fP options re2c uses efficient -submatch extraction algorithm described in the -\fI\%Tagged Deterministic Finite Automata with Lookahead\fP -paper. 
The overhead on submatch extraction in the generated lexer grows with the -number of tags \-\-\- if this number is moderate, the overhead is barely -noticeable. In the lexer tags are implemented using a number of tag variables -generated by re2c. There is no one\-to\-one correspondence between tag variables -and tags: a single variable may be reused for different tags, and one tag may -require multiple variables to hold all its ambiguous values. Eventually -ambiguity is resolved, and only one final variable per tag survives. When a rule -matches, all its tags are set to the values of the corresponding tag variables. -The exact number of tag variables is unknown to the user; this number is -determined by re2c. However, tag variables should be defined by the user as a -part of the lexer state and updated by \fBYYFILL\fP, therefore re2c provides -directives \fB/*!stags:re2c*/\fP and \fB/*!mtags:re2c*/\fP that can be used to -declare, initialize and manipulate tag variables. These directives have two -optional configurations: \fBformat = \(dq@@\(dq;\fP (specifies the template where \fB@@\fP -is substituted with the name of each tag variable), and \fBseparator = \(dq\(dq;\fP -(specifies the piece of code used to join the generated pieces for different -tag variables). +.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration. Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string.
+\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars:re2c\fP and \fBmvars:re2c\fP directives. +If there is more than one way tags can be matched against the input, +ambiguity is resolved using leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration. There are two +flavours for different disambiguation policies: \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy. In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted. +The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&. re2c provides a directive \fBmaxnmatch:re2c\fP +that defines \fBYYMAXNMATCH\fP, a constant equal to the maximum value of +\fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration.
The only difference with \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2c generates variables \fByytl<k>\fP and +\fByytr<k>\fP for \fIk\fP\-th capturing group (the user should declare these with +\fBsvars:re2c\fP directive). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars\fP for POSIX longest\-match policy. +.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there\(aqre no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags:re2c\fP or \fBmtags:re2c\fP directives. +If the lexer state is stored, tag variables should be part of it. They also +need to be updated by \fBYYFILL\fP\&. .sp S\-tags support the following operations: .INDENT 0.0 @@ -3106,7 +3115,7 @@ func main() () { .UNINDENT .UNINDENT .sp -Here is an example of using POSIX capturing groups to parse semantic versions. +Here is an example of using capturing groups to parse semantic versions. .INDENT 0.0 .INDENT 3.5 .sp @@ -3117,9 +3126,6 @@ package main import \(dqreflect\(dq -// Maximum number of capturing groups among all rules.
-/*!maxnmatch:re2c*/ - type SemVer struct { major, minor, patch int } func s2n(s string) int { // convert pre\-parsed string to a number @@ -3131,9 +3137,8 @@ func s2n(s string) int { // convert pre\-parsed string to a number func parse(yyinput string) *SemVer { var yycursor, yymarker int - // Allocate memory for capturing parentheses (twice the number of groups). - yypmatch := make([]int, YYMAXNMATCH*2) - var yynmatch int + // Final tag variables used in semantic action. + /*!svars:re2c format = \(aqvar @@ int;\(aq; */ // Intermediate tag variables used by the lexer (must be autogenerated). /*!stags:re2c format = \(aqvar @@ int;\(aq; */ @@ -3142,20 +3147,16 @@ func parse(yyinput string) *SemVer { re2c:yyfill:enable = 0; re2c:api = default; re2c:define:YYCTYPE = byte; - re2c:posix\-captures = 1; + re2c:captvars = 1; num = [0\-9]+; (num) \(dq.\(dq (num) (\(dq.\(dq num)? [\ex00] { - // \(gayynmatch\(ga is the number of capturing groups - if yynmatch != 4 { panic(\(dqexpected 4 submatch groups\(dq) } - - // Even \(gayypmatch\(ga values are for opening parentheses, odd values - // are for closing parentheses, the first group is the whole match. - major := s2n(yyinput[yypmatch[2]:yypmatch[3]]) - minor := s2n(yyinput[yypmatch[4]:yypmatch[5]]) + _ = yytl0; _ = yytr0; // some variables are unused + major := s2n(yyinput[yytl1:yytr1]) + minor := s2n(yyinput[yytl2:yytr2]) patch := 0 - if yypmatch[6] != \-1 { patch = s2n(yyinput[yypmatch[6]+1:yypmatch[7]]) } + if yytl3 != \-1 { patch = s2n(yyinput[yytl3+1:yytr3]) } return &SemVer{major, minor, patch} } diff --git a/bootstrap/doc/re2hs.1 b/bootstrap/doc/re2hs.1 new file mode 100644 index 000000000..9c7ae4aaa --- /dev/null +++ b/bootstrap/doc/re2hs.1 @@ -0,0 +1,3544 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. 
+.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "RE2C" 1 "" "" +.SH NAME +re2c \- generate fast lexical analyzers for C/C++, Go and Rust +.SH SYNOPSIS +.sp +Note: This manual is for Haskell, but it refers to re2c as the general program. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +re2c [ OPTIONS ] [ WARNINGS ] INPUT +re2go [ OPTIONS ] [ WARNINGS ] INPUT +re2rust [ OPTIONS ] [ WARNINGS ] INPUT +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Input can be either a file or \fB\-\fP for stdin. +.SH INTRODUCTION +.sp +re2c works as a preprocessor. It reads the input file (which is usually a +program in the target language, but can be anything) and looks for blocks of +code enclosed in special\-form comments. The text outside of these blocks is +copied verbatim into the output file. The contents of the blocks are processed +by re2c. It translates them to code in the target language and outputs the +generated code in place of the block. 
+.sp +Here is an example of a small program that checks if a given string contains a +decimal number: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT \-i +{\-# LANGUAGE OverloadedStrings #\-} +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} + +import Data.ByteString (ByteString, index) + +data State = State { + _yyinput :: ByteString, + _yycursor :: Int +} + +%{ + re2c:define:YYFN = [\(dqlexer;Bool\(dq, \(dqState{..};State\(dq]; + re2c:yyfill:enable = 0; + + number = [1\-9][0\-9]*; + + number { True } + * { False } +%} + +main :: IO () +main = case lexer State{_yyinput = \(dq1234\e0\(dq, _yycursor = 0} of + True \-> return () + False \-> error \(dqlexer failed!\(dq + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +In the output everything between \fB/*!re2c\fP and \fB*/\fP has been replaced with +the generated code: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- Generated by re2hs +{\-# LANGUAGE RecordWildCards #\-} +\-\- re2hs $INPUT \-o $OUTPUT \-i +{\-# LANGUAGE OverloadedStrings #\-} +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} + +import Data.ByteString (ByteString, index) + +data State = State { + _yyinput :: ByteString, + _yycursor :: Int +} + + +yy0 :: State \-> Bool +yy0 State{..} = + let yych = index _yyinput _yycursor in + let __ = _yycursor + 1 in let _yycursor = __ in + case yych of + _c | 0x31 <= _c && _c <= 0x39 \-> + yy2 State{..} + _c | True \-> + yy1 State{..} + +yy1 :: State \-> Bool +yy1 State{..} = + False + +yy2 :: State \-> Bool +yy2 State{..} = + let yych = index _yyinput _yycursor in + case yych of + _c | 0x30 <= _c && _c <= 0x39 \-> + let __ = _yycursor + 1 in let _yycursor = __ in + yy2 State{..} + _c | True \-> + yy3 State{..} + +yy3 :: State \-> Bool +yy3 State{..} = + True + +lexer :: State \-> Bool +lexer State{..} = + yy0 State{..} + + + +main :: IO () +main = case lexer State{_yyinput = \(dq1234\e0\(dq, _yycursor = 0} of + True \-> return () + False \-> error \(dqlexer failed!\(dq + +.ft P +.fi 
+.UNINDENT +.UNINDENT +.SH SYNTAX +.sp +A re2c program consists of a sequence of \fIblocks\fP intermixed with code in the +target language. There are three main kinds of blocks: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB/*!re2c[:<name>] ... */\fP +A \fIglobal block\fP contains definitions, configurations, directives and rules. +re2c compiles regular expressions associated with each rule into a +deterministic finite automaton, encodes it in the form of conditional jumps +in the target language and replaces the block with the generated code. Names +and configurations defined in a global block are added to the global scope +and become visible to subsequent blocks. At the start of the program the +global scope is initialized with command\-line \fI\%options\fP\&. +The \fB:<name>\fP part is optional: if specified, the name can be used to +refer to the block in another part of the program. +.TP +.B \fB/*!local:re2c[:<name>] ... */\fP +A \fIlocal block\fP is like a global block, but the names and configurations in +it have local scope (they do not affect other blocks). +.TP +.B \fB/*!rules:re2c[:<name>] ... */\fP +A \fIrules block\fP is like a local block, but it does not generate any code and +is meant to be reused in other blocks. This is a way of sharing code +(more details in the \fI\%reusable blocks\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.sp +There are also many auxiliary blocks; see section \fI\%blocks and directives\fP for a +full list of them. A block may contain the following kinds of statements: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB<name> = <regular expression>;\fP +A \fIdefinition\fP binds a name to a regular expression. Names may contain +alphanumeric characters and underscore. The \fI\%regular expressions\fP section +gives an overview of re2c syntax for regular expressions. Once defined, the +name can be used in other regular expressions and in rules. Recursion in +named definitions is not allowed, and each name should be defined before it +is used.
A block inherits named definitions from the global scope. +Redefining a name that exists in the current scope is an error. +.TP +.B \fB<configuration> = <value>;\fP +A \fIconfiguration\fP allows one to change re2c behavior and customize the +generated code. For a full list of configurations supported by re2c see the +\fI\%configurations\fP section. Depending on a particular configuration, the +value can be a keyword, a nonnegative integer number or a one\-line string +which should be enclosed in double or single quotes unless it consists of +alphanumeric characters. A block inherits configurations from the global +scope and may redefine them or add new ones. Configurations defined inside +of a block affect the whole block, even if they appear at the end of it. +.TP +.B \fB<regular expression> { <code> }\fP +A \fIrule\fP binds a regular expression to a semantic action (a block of code in +the target language). If the regular expression matches, the associated +semantic action is executed. If multiple rules match, the longest match +takes precedence. If multiple rules match the same string, the earliest one +takes precedence. There are two special rules: the default rule \fB*\fP and +the end of input rule \fB$\fP\&. The default rule should always be defined, it +has the lowest priority regardless of its place in the block, and it matches +any code unit (not necessarily a valid character, see the +\fI\%encoding support\fP section). The end of input rule should be defined if the +corresponding method for \fI\%handling the end of input\fP is used. If +\fI\%start conditions\fP are used, rules have more complex syntax. +.TP +.B \fB!<directive>;\fP +A \fIdirective\fP is one of the special predefined statements. Each directive +has a unique purpose. For example, the \fB!use\fP directive merges a rules +block into the current one (see the \fI\%reusable blocks\fP section), and the +\fB!include\fP directive allows one to include an outer file (see the +\fI\%include files\fP section).
+.UNINDENT +.UNINDENT +.UNINDENT +.SH PROGRAM INTERFACE (API) +.sp +The generated code interfaces with the outer program with the help of +\fIprimitives\fP, collectively referred to as the \fIAPI\fP\&. +Which primitives should be defined for a particular program depends on multiple +factors, including the complexity of regular expressions, input representation, +buffering and the use of various features. All the necessary primitives should +be defined by the user in the form of macros, functions, variables or any other +suitable form that makes the generated code syntactically and semantically +correct. re2c does not (and cannot) check the definitions, so if anything is +missing or defined incorrectly, the generated program may have compile\-time or +run\-time errors. +This manual provides examples of API definitions in the most common cases. +.sp +re2hs has two API flavors that define the core set of primitives used by a +program: +.INDENT 0.0 +.TP +.B \fBRecord API\fP +Record API is the default API for the Haskell backend. +This API consists of a binding \fByyrecord\fP (the name can be overridden with +\fBre2c:variable:yyrecord\fP) that should be defined as a record with fields +\fB_yyinput\fP, \fB_yycursor\fP, \fB_yymarker\fP, \fB_yyctxmarker\fP, \fB_yylimit\fP\&. +Only the fields used by the generated code need to be defined, and their +names can be configured. +.nf + +.fi +.sp +.TP +.B \fBGeneric API\fP +This is the most flexible API. It is enabled with \fB\-\-api generic\fP option +or \fBre2c:api = generic\fP configuration. +It contains primitives for generic operations: +\fBYYPEEK\fP, +\fBYYSKIP\fP, +\fBYYBACKUP\fP, +\fBYYBACKUPCTX\fP, +\fBYYSTAGP\fP, +\fBYYSTAGN\fP, +\fBYYMTAGP\fP, +\fBYYMTAGN\fP, +\fBYYRESTORE\fP, +\fBYYRESTORECTX\fP, +\fBYYRESTORETAG\fP, +\fBYYCOPYSTAG\fP, +\fBYYCOPYMTAG\fP, +\fBYYSHIFT\fP, +\fBYYSHIFTSTAG\fP, +\fBYYSHIFTMTAG\fP, +\fBYYLESSTHAN\fP\&. 
+.UNINDENT +.sp +Here is a full list of API primitives that may be used by the generated code in +order to interface with the outer program. +.INDENT 0.0 +.TP +.B \fBYYCTYPE\fP +The type of the input characters (code units). +For ASCII, EBCDIC and UTF\-8 encodings it should be 1\-byte unsigned integer. +For UTF\-16 or UCS\-2 it should be 2\-byte unsigned integer. For UTF\-32 it +should be 4\-byte unsigned integer. +.TP +.B \fBYYCURSOR\fP +A pointer\-like l\-value that stores the current input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYCURSOR\fP should point to the +first input character. It is advanced by the generated code. +When a rule matches, \fBYYCURSOR\fP points to the position after the +last matched character. It is used only in C pointer API. +.TP +.B \fBYYLIMIT\fP +A pointer\-like r\-value that stores the end of input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYLIMIT\fP should point to the +position after the last available input character. It is not changed by the +generated code. The lexer compares \fBYYCURSOR\fP to \fBYYLIMIT\fP +in order to determine if there are enough input characters left. +\fBYYLIMIT\fP is used only in C pointer API. +.TP +.B \fBYYMARKER\fP +A pointer\-like l\-value (usually a pointer of type \fBYYCTYPE*\fP) +that stores the position of the latest matched rule. It is used to +restore the \fBYYCURSOR\fP position if the longer match fails and +the lexer needs to rollback. Initialization is not +needed. \fBYYMARKER\fP is used only in C pointer API. +.TP +.B \fBYYCTXMARKER\fP +A pointer\-like l\-value that stores the position of the trailing context +(usually a pointer of type \fBYYCTYPE*\fP). No initialization is needed. +It is used only in C pointer API, and only with the lookahead operator +\fB/\fP\&. +.TP +.B \fBYYFILL\fP +A generic API primitive with one argument \fBlen\fP\&. +\fBYYFILL\fP should provide at least \fBlen\fP more input characters or fail. 
+If \fBre2c:eof\fP is used, then \fBlen\fP is always \fB1\fP and \fBYYFILL\fP should +always return to the calling function; zero return value indicates success. +If \fBre2c:eof\fP is not used, then \fBYYFILL\fP return value is ignored and it +should not return on failure. The maximum value of \fBlen\fP is \fBYYMAXFILL\fP\&. +The definition of \fBYYFILL\fP can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYFILL:naked\fP). +.TP +.B \fBYYMAXFILL\fP +An integral constant equal to the maximum value of the argument to +\fBYYFILL\fP\&. It can be generated with \fB/*!max:re2c*/\fP directive. +.TP +.B \fBYYLESSTHAN\fP +A generic API primitive with one argument \fBlen\fP\&. +It should be defined as an r\-value of boolean type that equals \fBtrue\fP if +and only if there are less than \fBlen\fP input characters left. +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYPEEK\fP +A generic API primitive with no arguments. +It should be defined as an r\-value of type \fBYYCTYPE\fP that is equal to the +character at the current input position. The definition can be either +function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP). +.TP +.B \fBYYSKIP\fP +A generic API primitive with no arguments. +\fBYYSKIP\fP should advance the current input position by one +character. The definition can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUP\fP +A generic API primitive with no arguments. +\fBYYBACKUP\fP should save the current input position, which is +later restored with \fBYYRESTORE\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORE\fP +A generic API primitive with no arguments. 
+\fBYYRESTORE\fP should restore the current input position to the +value saved by \fBYYBACKUP\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUPCTX\fP +A generic API primitive with zero arguments. +\fBYYBACKUPCTX\fP should save the current input position as the +position of the trailing context, which is later restored by +\fBYYRESTORECTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORECTX\fP +A generic API primitive with no arguments. +\fBYYRESTORECTX\fP should restore the trailing context position +saved with \fBYYBACKUPCTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORETAG\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYRESTORETAG\fP should restore the trailing context position +to the value of \fBtag\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSTAGP\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGP\fP should set \fBtag\fP to the current input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSTAGN\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGN\fP should set \fBtag\fP to a value that represents a non\-existent +input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGP\fP +A generic API primitive with one argument \fBtag\fP\&.
+\fBYYMTAGP\fP should append the current position to the submatch history of +\fBtag\fP (see the submatch extraction section for details). +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGN\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGN\fP should append a value that represents a non\-existent input +position to the submatch history of \fBtag\fP (see the submatch +extraction section for details). +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFT\fP +A generic API primitive with one argument \fBshift\fP\&. +\fBYYSHIFT\fP should shift the current input position by +\fBshift\fP characters (the shift value may be negative). The definition +can be either function\-like or free\-form depending on the API style +(see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTSTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTSTAG\fP should shift \fBtag\fP by \fBshift\fP characters +(the shift value may be negative). +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTMTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTMTAG\fP should shift the latest value in the history +of \fBtag\fP by \fBshift\fP characters (the shift value may be negative). +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMAXNMATCH\fP +An integral constant equal to the maximal number of POSIX capturing groups +in a rule. It is generated with \fB/*!maxnmatch:re2c*/\fP directive. +.TP +.B \fBYYCONDTYPE\fP +The type of the condition enum. +It should be generated either with the \fB/*!types:re2c*/\fP +directive or the \fB\-t\fP \fB\-\-type\-header\fP option.
+.TP +.B \fBYYGETCONDITION\fP +An API primitive with zero arguments. +It should be defined as an r\-value of type \fBYYCONDTYPE\fP that is equal to +the current condition identifier. The definition can be either function\-like +or free\-form depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYGETCONDITION:naked\fP). +.TP +.B \fBYYSETCONDITION\fP +An API primitive with one argument \fBcond\fP\&. +The meaning of \fBYYSETCONDITION\fP is to set the current condition +identifier to \fBcond\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETCONDITION@cond\fP). +.TP +.B \fBYYGETSTATE\fP +An API primitive with zero arguments. +It should be defined as an r\-value of integer type that is equal to the +current lexer state. Should be initialized to \fB\-1\fP\&. The definition can be +either function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP and \fBre2c:define:YYGETSTATE:naked\fP). +.TP +.B \fBYYSETSTATE\fP +An API primitive with one argument \fBstate\fP\&. +The meaning of \fBYYSETSTATE\fP is to set the current lexer state to +\fBstate\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETSTATE@state\fP). +.TP +.B \fBYYDEBUG\fP +A debug API primitive with two arguments. It can be used to debug the +generated code (with \fB\-d\fP \fB\-\-debug\-output\fP option). \fBYYDEBUG\fP should +return no value and accept two arguments: \fBstate\fP (either a DFA state +index or \fB\-1\fP) and \fBsymbol\fP (the current input symbol). +.TP +.B \fByych\fP +An l\-value of type \fBYYCTYPE\fP that stores the current input character. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByyaccept\fP +An l\-value of unsigned integral type that stores the number of the latest +matched rule. 
+User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByynmatch\fP +An l\-value of unsigned integral type that stores the number of POSIX +capturing groups in the matched rule. +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.TP +.B \fByypmatch\fP +An array of l\-values that are used to hold the tag values corresponding +to the capturing parentheses in the matching rule. Array length must be +at least \fByynmatch * 2\fP (usually \fBYYMAXNMATCH * 2\fP is a good choice). +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.UNINDENT +.SH OPTIONS +.sp +Some of the options have corresponding \fI\%configurations\fP, +others are global and cannot be changed after re2c starts reading the input file. +Debug options generally require building re2c in debug configuration. +Internal options are useful for experimenting with the algorithms used in re2c. +.INDENT 0.0 +.TP +.B \fB\-? \-\-help \-h\fP +Show help message. +.TP +.B \fB\-\-api \-\-input \fP +Specify the API used by the generated code to interface with user\-defined +code: \fBdefault\fP is the API based on pointer arithmetic (the default for +C), and \fBcustom\fP is the generic API (the default for Go and Rust). +.TP +.B \fB\-\-bit\-vectors \-b\fP +Optimize conditional jumps using bit masks. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-case\-insensitive\fP +Treat single\-quoted and double\-quoted strings as case\-insensitive. +.TP +.B \fB\-\-case\-inverted\fP +Invert the meaning of single\-quoted and double\-quoted strings: +treat single\-quoted strings as case\-sensitive and double\-quoted strings +as case\-insensitive. +.TP +.B \fB\-\-case\-ranges\fP +Collapse consecutive cases in a switch statement into a range of the form +\fBlow ... high\fP\&. This syntax is a C/C++ language extension that is +supported by compilers like GCC, Clang and Tcc.
The main advantage over +using single cases is smaller generated code and faster generation time, +although for some compilers like Tcc it also results in smaller binary size. +This option is supported only for C. +.TP +.B \fB\-\-computed\-gotos \-g\fP +Optimize conditional jumps using non\-standard \(dqcomputed goto\(dq extension +(which must be supported by the compiler). re2c generates jump tables +only in complex cases with a lot of conditional branches. Complexity +threshold can be configured with \fBcgoto:threshold\fP configuration. This +option implies \fB\-\-bit\-vectors\fP\&. It is supported only for C. +.TP +.B \fB\-\-conditions \-\-start\-conditions \-c\fP +Enable support of Flex\-like \(dqconditions\(dq: multiple interrelated lexers +within one block. This is an alternative to manually specifying different +re2c blocks connected with \fBgoto\fP or function calls. +.TP +.B \fB\-\-depfile FILE\fP +Write dependency information to \fBFILE\fP in the form of a Makefile rule +\fB : [include\-file ...]\fP\&. This allows one to +track build dependencies in the presence of \fBinclude:re2c\fP directives, +so that updating include files triggers regeneration of the output file. +This option depends on the \fB\-\-output\fP option. +.TP +.B \fB\-\-ebcdic \-\-ecb \-e\fP +Generate a lexer that reads input in EBCDIC encoding. re2c assumes that the +character range is 0 \-\- 0xFF and character size is 1 byte. +.TP +.B \fB\-\-empty\-class \fP +Define the way re2c treats empty character classes. With \fBmatch\-empty\fP +(the default) empty class matches empty input (which is illogical, but +backwards\-compatible). With \fBmatch\-none\fP empty class always fails to match. +With \fBerror\fP empty class raises a compilation error. +.TP +.B \fB\-\-encoding\-policy \fP +Define the way re2c treats Unicode surrogates. +With \fBfail\fP re2c aborts with an error when a surrogate is encountered. 
+With \fBsubstitute\fP re2c silently replaces surrogates with the error code +point 0xFFFD. With \fBignore\fP (the default) re2c treats surrogates as +normal code points. The Unicode standard says that standalone surrogates +are invalid, but real\-world libraries and programs behave in different ways. +.TP +.B \fB\-\-flex\-syntax \-F\fP +Partial support for Flex syntax: in this mode named definitions don\(aqt need +the equal sign and the terminating semicolon, and when used they must be +surrounded with curly braces. Names without curly braces are treated as +double\-quoted strings. +.TP +.B \fB\-\-header \-\-type\-header \-t HEADER\fP +Generate a \fBHEADER\fP file. The contents of the file can be specified with +directives \fBheader:re2c:on\fP and \fBheader:re2c:off\fP\&. +If conditions are used the header will have a condition enum automatically +appended to it (unless there is an explicit \fBconditions:re2c\fP directive). +.TP +.B \fB\-I PATH\fP +Add \fBPATH\fP to the list of locations which are used when searching for +include files. This option is useful in combination with \fBinclude:re2c\fP +directive. re2c looks for \fBFILE\fP in the directory of the parent file and +in the include locations specified with \fB\-I\fP option. +.TP +.B \fB\-\-input\-encoding \fP +Specify the way re2c parses regular expressions. +With \fBascii\fP (the default) re2c handles input as ASCII\-encoded: any +sequence of code units is a sequence of standalone 1\-byte characters. +With \fButf8\fP re2c handles input as UTF8\-encoded and recognizes multibyte +characters. +.TP +.B \fB\-\-invert\-captures\fP +Invert the meaning of capturing and non\-capturing groups. By default +\fB(...)\fP is capturing and \fB(! ...)\fP is non\-capturing. With this option +\fB(! ...)\fP is capturing and \fB(...)\fP is non\-capturing. +.TP +.B \fB\-\-lang \fP +Specify the output language. Supported languages are C, Go and Rust. +The default is C for re2c, Go for re2go and Rust for re2rust. 
+.TP +.B \fB\-\-leftmost\-captures\fP +Enable submatch extraction with leftmost greedy capturing groups. +.TP +.B \fB\-\-location\-format \fP +Specify location format in messages. +With \fBgnu\fP locations are printed as \(aqfilename:line:column: ...\(aq. +With \fBmsvc\fP locations are printed as \(aqfilename(line,column) ...\(aq. +The default is \fBgnu\fP\&. +.TP +.B \fB\-\-loop\-switch\fP +Encode DFA in a form of a loop over a switch statement. Individual states +are switch cases. The current state is stored in a variable \fByystate\fP\&. +Transitions between states update \fByystate\fP to the case label of the +destination state and \fBcontinue\fP to the head of the loop. This option is +always enabled for Rust, as it has no \fBgoto\fP statement and cannot use the +goto/label approach which is the default for C and Go backends. +.TP +.B \fB\-\-nested\-ifs \-s\fP +Use nested \fBif\fP statements instead of \fBswitch\fP statements in conditional +jumps. This usually results in more efficient code with non\-optimizing +compilers. +.TP +.B \fB\-\-no\-debug\-info \-i\fP +Do not output line directives. This may be useful when the generated code is +stored in a version control system (to avoid huge autogenerated diffs on +small changes). This option is on by default for Rust, as it does not have +line directives. +.TP +.B \fB\-\-no\-generation\-date\fP +Suppress date output in the generated file. +.TP +.B \fB\-\-no\-version\fP +Suppress version output in the generated file. +.TP +.B \fB\-\-no\-unsafe\fP +Do not generate \fBunsafe\fP wrapper over \fBYYPEEK\fP (this option is specific +to Rust). For performance reasons \fBYYPEEK\fP should avoid bounds\-checking, +as the lexer already performs end\-of\-input checks in a more efficient way. +The user may choose to provide a safe \fBYYPEEK\fP definition, or a definition +that is unsafe only in release builds, in which case the \fB\-\-no\-unsafe\fP +option helps to avoid warnings about redundant \fBunsafe\fP blocks. 
+.TP +.B \fB\-\-output \-o OUTPUT\fP +Specify the \fBOUTPUT\fP file. +.TP +.B \fB\-\-posix\-captures \-P\fP +Enable submatch extraction with POSIX\-style capturing groups. +.TP +.B \fB\-\-reusable \-r\fP +Deprecated since version 2.2 (reusable blocks are allowed by default now). +.TP +.B \fB\-\-skeleton \-S\fP +Ignore user\-defined interface code and generate a self\-contained \(dqskeleton\(dq +program. Additionally, generate input files with strings derived from the +regular grammar and compressed match results that are used to verify +\(dqskeleton\(dq behavior on all inputs. This option is useful for finding bugs +in optimizations and code generation. This option is supported only for C. +.TP +.B \fB\-\-storable\-state \-f\fP +Generate a lexer which can store its inner state. +This is useful in push\-model lexers which are stopped by an outer program +when there is not enough input, and then resumed when more input becomes +available. In this mode users should additionally define \fBYYGETSTATE\fP +and \fBYYSETSTATE\fP primitives, and variables \fByych\fP, \fByyaccept\fP and +\fBstate\fP should be part of the stored lexer state. +.TP +.B \fB\-\-tags \-T\fP +Enable submatch extraction with tags. +.TP +.B \fB\-\-ucs2 \-\-wide\-chars \-w\fP +Generate a lexer that reads UCS2\-encoded input. re2c assumes that the +character range is 0 \-\- 0xFFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf8 \-\-utf\-8 \-8\fP +Generate a lexer that reads input in UTF\-8 encoding. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 1 byte. +.TP +.B \fB\-\-utf16 \-\-utf\-16 \-x\fP +Generate a lexer that reads UTF16\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf32 \-\-unicode \-u\fP +Generate a lexer that reads UTF32\-encoded input. 
re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 4 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-verbose\fP +Output a short message in case of success. +.TP +.B \fB\-\-vernum \-V\fP +Show version information in \fBMMmmpp\fP format (major, minor, patch). +.TP +.B \fB\-\-version \-v\fP +Show version information. +.TP +.B \fB\-\-single\-pass \-1\fP +Deprecated. Does nothing (single pass is the default now). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-debug\-output \-d\fP +Emit \fBYYDEBUG\fP invocations in the generated code. This is useful to trace +lexer execution. +.TP +.B \fB\-\-dump\-adfa\fP +Debug option: output DFA after tunneling (in .dot format). +.TP +.B \fB\-\-dump\-cfg\fP +Debug option: output control flow graph of tag variables (in .dot format). +.TP +.B \fB\-\-dump\-closure\-stats\fP +Debug option: output statistics on the number of states in closure. +.TP +.B \fB\-\-dump\-dfa\-det\fP +Debug option: output DFA immediately after determinization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-min\fP +Debug option: output DFA after minimization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tagopt\fP +Debug option: output DFA after tag optimizations (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tree\fP +Debug option: output DFA under construction with states represented as tag +history trees (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-raw\fP +Debug option: output DFA under construction with expanded state\-sets +(in .dot format). +.TP +.B \fB\-\-dump\-interf\fP +Debug option: output interference table produced by liveness analysis of tag +variables. +.TP +.B \fB\-\-dump\-nfa\fP +Debug option: output NFA (in .dot format). +.TP +.B \fB\-\-emit\-dot \-D\fP +Instead of normal output generate lexer graph in .dot format. +The output can be converted to an image with the help of Graphviz +(e.g. something like \fBdot \-Tpng \-odfa.png dfa.dot\fP). 
+.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-dfa\-minimization \fP +Internal option: DFA minimization algorithm used by re2c. The \fBmoore\fP +option is the Moore algorithm (it is the default). The \fBtable\fP option is +the \(dqtable filling\(dq algorithm. Both algorithms should produce the same DFA +up to states relabeling; table filling is simpler and much slower and serves +as a reference implementation. +.TP +.B \fB\-\-eager\-skip\fP +Internal option: make the generated lexer advance the input position +eagerly \-\- immediately after reading the input symbol. This changes the +default behavior when the input position is advanced lazily \-\- after +transition to the next state. +.TP +.B \fB\-\-no\-lookahead\fP +Internal option, deprecated. +It used to enable TDFA(0) algorithm. Unlike TDFA(1), TDFA(0) algorithm does +not use one\-symbol lookahead. It applies register operations to the incoming +transitions rather than the outgoing ones. Benchmarks showed that TDFA(0) +algorithm is less efficient than TDFA(1). +.TP +.B \fB\-\-no\-optimize\-tags\fP +Internal option: suppress optimization of tag variables (useful for +debugging). +.TP +.B \fB\-\-posix\-closure \fP +Internal option: specify shortest\-path algorithm used for the construction of +epsilon\-closure with POSIX disambiguation semantics: \fBgor1\fP (the default) +stands for Goldberg\-Radzik algorithm, and \fBgtop\fP stands for \(dqglobal +topological order\(dq algorithm. +.TP +.B \fB\-\-posix\-prectable \fP +Internal option: specify the algorithm used to compute POSIX precedence +table. The \fBcomplex\fP algorithm computes precedence table in one traversal +of tag history tree and has quadratic complexity in the number of TNFA +states; it is the default. The \fBnaive\fP algorithm has worst\-case cubic +complexity in the number of TNFA states, but it is much simpler than +\fBcomplex\fP and may be slightly faster in non\-pathological cases. +.TP +.B \fB\-\-stadfa\fP +Internal option, deprecated. 
+It used to enable staDFA algorithm, which differs from TDFA in that register +operations are placed in states rather than on transitions. Benchmarks +showed that staDFA algorithm is less efficient than TDFA. +.TP +.B \fB\-\-fixed\-tags <none | toplevel | all>\fP +Internal option: +specify whether the fixed\-tag optimization should be applied to all tags +(\fBall\fP), none of them (\fBnone\fP), or only those in toplevel concatenation +(\fBtoplevel\fP). The default is \fBall\fP\&. +\(dqFixed\(dq tags are those that are located within a fixed distance to some +other tag (called \(dqbase\(dq). In such cases only the base tag needs to be +tracked, and the value of the fixed tag can be computed as the value of the +base tag plus a static offset. For tags that are under alternative or +repetition it is also necessary to check if the base tag has a no\-match +value (in that case fixed tag should also be set to no\-match, disregarding +the offset). For tags in top\-level concatenation the check is not needed, +because they always match. +.UNINDENT +.SH WARNINGS +.sp +Warnings can be individually enabled, disabled and turned into an error. +.INDENT 0.0 +.TP +.B \fB\-W\fP +Turn on all warnings. +.TP +.B \fB\-Werror\fP +Turn warnings into errors. Note that this option alone +doesn\(aqt turn on any warnings; it only affects those warnings that have +been turned on so far or will be turned on later. +.TP +.B \fB\-W<warning>\fP +Turn on \fBwarning\fP\&. +.TP +.B \fB\-Wno\-<warning>\fP +Turn off \fBwarning\fP\&. +.TP +.B \fB\-Werror\-<warning>\fP +Turn on \fBwarning\fP and treat it as an error (this implies \fB\-W<warning>\fP). +.TP +.B \fB\-Wno\-error\-<warning>\fP +Don\(aqt treat this particular \fBwarning\fP as an error. This doesn\(aqt turn off +the warning itself. +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-Wcondition\-order\fP +Warn if the generated program makes implicit assumptions about condition +numbering.
One should use either the \fB\-\-header\fP option or the +\fBconditions:re2c\fP directive to generate a mapping of condition names to +numbers and then use the autogenerated condition names. +.TP +.B \fB\-Wempty\-character\-class\fP +Warn if a regular expression contains an empty character class. Trying to +match an empty character class makes no sense: it should always fail. +However, for backwards compatibility reasons re2c permits empty character +classes and treats them as empty strings. Use the \fB\-\-empty\-class\fP option +to change the default behavior. +.TP +.B \fB\-Wmatch\-empty\-string\fP +Warn if a rule is nullable (matches an empty string). +If the lexer runs in a loop and the empty match is unintentional, the lexer +may unexpectedly hang in an infinite loop. +.TP +.B \fB\-Wswapped\-range\fP +Warn if the lower bound of a range is greater than its upper bound. The +default behavior is to silently swap the range bounds. +.TP +.B \fB\-Wundefined\-control\-flow\fP +Warn if some input strings cause undefined control flow in the lexer (the +faulty patterns are reported). This is a dangerous and common mistake. It +can be easily fixed by adding the default rule \fB*\fP which has the lowest +priority, matches any code unit, and always consumes a single code unit. +.TP +.B \fB\-Wunreachable\-rules\fP +Warn about rules that are shadowed by other rules and will never match. +.TP +.B \fB\-Wuseless\-escape\fP +Warn if a symbol is escaped when it shouldn\(aqt be. +By default, re2c silently ignores such escapes, but this may as well +indicate a typo or an error in the escape sequence. +.TP +.B \fB\-Wnondeterministic\-tags\fP +Warn if a tag has \fBn\fP\-th degree of nondeterminism, where \fBn\fP is greater +than 1. +.TP +.B \fB\-Wsentinel\-in\-midrule\fP +Warn if the sentinel symbol occurs in the middle of a rule \-\-\- this may +cause reads past the end of buffer, crashes or memory corruption in the +generated lexer.
This warning is only applicable if the sentinel method of +checking for the end of input is used. +It is set to an error if \fBre2c:sentinel\fP configuration is used. +.UNINDENT +.SH BLOCKS AND DIRECTIVES +.sp +Below is the list of re2c directives (syntactic constructs that mark the +beginning and end of the code that should be processed by re2c). Named blocks +were added in re2c version 2.2. They are exactly the same as unnamed blocks, +except that the name can be used to reference a block in other parts of the +program. More information on each directive can be found in the related +sections. +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A global re2c block with an optional name. The block may contain named +definitions, configurations and rules in any order. Named definitions and +configurations are defined in the global scope, so they are inherited by +subsequent blocks. The code for a global block is generated at the point +where the block is specified. +.TP +.B \fB/*!local:re2c[:] ... */\fP +A local re2c block with an optional name. Unlike global blocks, definitions +and configurations inside of a local block are not added into the global +scope. In all other respects local blocks are the same as global blocks. +.TP +.B \fB/*!rules:re2c[:] ... */\fP +A reusable block with an optional name. Rules blocks have the same structure +as local or global blocks, but they do not produce any code and they can be +reused multiple times in other blocks with the help of a \fB!use:;\fP +directive or a \fB/*!use:re2c[:] ... */\fP block. A rules block on its +own does not add any definitions into the global scope. The code for it is +generated at the point of use. Prior to re2c version 2.2 rules blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB/*!use:re2c[:] ... */\fP +A use block that references a previously defined rules block. If the name is +specified, re2c looks for a rules block with this name.
Otherwise the most +recent rules block is used (either a named or an unnamed one). A use block +can add definitions, configurations and rules of its own, which are added to +those of the referenced rules block. Prior to re2c version 2.2 use blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB!use:;\fP +An in\-block use directive that merges a previously defined rules block with +the specified name into the current block. Named definitions, configurations +and rules of the referenced block are added to the current ones. Conflicts +between overlapping rules and configurations are resolved in the usual way: +the first rule takes priority, and the latest configuration overrides the +preceding ones. One exception is the special rules \fB*\fP, \fB$\fP and \fB\fP +for which a block\-local definition always takes priority. A use directive +can be placed anywhere inside of a block, and multiple use directives are +allowed. +.TP +.B \fB/*!max:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXFILL\fP definition. +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXFILL\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXFILL \fP), or a global variable for Go +(\fBvar YYMAXFILL int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXFILL\fP\&. +.TP +.B \fB/*!maxnmatch:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXNMATCH\fP definition (it requires +\fB\-P \-\-posix\-captures\fP option). +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXNMATCH\fP value (if the list is empty, all blocks are +included). 
+By default the generated code is a macro\-definition for C +(\fB#define YYMAXNMATCH \fP), or a global variable for Go +(\fBvar YYMAXNMATCH int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXNMATCH\fP\&. +.TP +.B \fB/*!stags:re2c[:[:...]] ... */\fP, \fB/*!mtags:re2c[:[:...]] ... */\fP +Directives that specify a template piece of code that is expanded for each +s\-tag/m\-tag variable generated by re2c. +An optional list of block names specifies which blocks should be included +when computing the set of tag variables (if the list is empty, all blocks +are included). +There are two optional configurations: \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{tag}\fP (or +\fB@@\fP for short) is replaced with the name of each tag variable. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different tag variables. +.TP +.B \fB/*!getstate:re2c[:[:...]] ... */\fP +A directive that generates conditional dispatch on the lexer state (it +requires \fB\-\-storable\-state\fP option). +An optional list of block names specifies which blocks should be included in +the state dispatch. The default transition goes to the start label of the +first block on the list. If the list is empty, all blocks are included, and +the default transition goes to the first block in the file that has a start +label. +This directive is incompatible with the \fB\-\-loop\-switch\fP option and Rust, +as it requires cross\-block transitions that are unsupported without the +\fBgoto\fP statement. +.TP +.B \fB/*!conditions:re2c[:[:...]] ... */\fP, \fB/*!types:re2c... */\fP +A directive that generates condition enumeration (it requires +\fB\-\-conditions\fP option). 
+An optional list of block names specifies which blocks should be included +when computing the set of conditions (if the list is empty, all blocks are +included). +By default the generated code is an enumeration \fBYYCONDTYPE\fP\&. It can be +customized with optional configurations \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{cond}\fP (or +\fB@@\fP for short) is replaced with the name of each condition, and +\fB@@{num}\fP is replaced with a numeric index of that condition. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different conditions. +.TP +.B \fB/*!include:re2c <file> */\fP +This directive allows one to include \fB<file>\fP, which must be a double\-quoted +file path. The contents of the file are literally substituted in place of +the directive, in the same way as \fB#include\fP works in C/C++. This +directive can be used together with the \fB\-\-depfile\fP option to generate +build system dependencies on the included files. +.TP +.B \fB!include <file>;\fP +This directive is the same as \fB/*!include:re2c <file> */\fP, except that it +should be used inside of a re2c block. +.TP +.B \fB/*!header:re2c:on*/\fP +This directive marks the start of header file. Everything after it and up to +the following \fB/*!header:re2c:off*/\fP directive is processed by re2c and +written to the header file specified with \fB\-t \-\-type\-header\fP option. +.TP +.B \fB/*!header:re2c:off*/\fP +This directive marks the end of header file started with +\fB/*!header:re2c:on*/\fP\&. +.TP +.B \fB/*!ignore:re2c ... */\fP +A block whose contents are ignored and removed from the output file. +.TP +.B \fB%{ ... %}\fP +A global re2c block in the \fB\-\-flex\-syntax\fP mode. This is deprecated and +exists for backward compatibility. +.UNINDENT +.SH CONFIGURATIONS +.INDENT 0.0 +.TP +.B \fBre2c:api\fP, \fBre2c:flags:input\fP +Same as the \fB\-\-api\fP option.
+.TP +.B \fBre2c:api:sigil\fP +Specify the marker (\(dqsigil\(dq) that is used for argument placeholders in the +API primitives. The default is \fB@@\fP\&. A placeholder starts with sigil +followed by the argument name in curly braces. For example, if sigil is set +to \fB$\fP, then placeholders will have the form \fB${name}\fP\&. Single\-argument +APIs may use shorthand notation without the name in braces. This option can +be overridden by options for individual API primitives, e.g. +\fBre2c:define:YYFILL@len\fP for \fBYYFILL\fP\&. +.TP +.B \fBre2c:api:style\fP +Specify API style. Possible values are \fBfunctions\fP (the default for C) and +\fBfree\-form\fP (the default for Go and Rust). +In \fBfunctions\fP style API primitives are generated with an argument list in +parentheses following the name of the primitive. The arguments are provided +only for autogenerated parameters (such as the number of characters passed +to \fBYYFILL\fP), but not for the general lexer context, so the primitives +behave more like macros in C/C++ or closures in Go and Rust. +In free\-form style API primitives do not have a fixed form: they should be +defined as strings containing free\-form pieces of code with interpolated +variables of the form \fB@@{var}\fP or \fB@@\fP (they correspond to arguments in +function\-like style). +This configuration may be overridden for individual API primitives, see for +example \fBre2c:define:YYFILL:naked\fP configuration for \fBYYFILL\fP\&. +.TP +.B \fBre2c:bit\-vectors\fP, \fBre2c:flags:bit\-vectors\fP, \fBre2c:flags:b\fP +Same as the \fB\-\-bit\-vectors\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-insensitive\fP, \fBre2c:flags:case\-insensitive\fP +Same as the \fB\-\-case\-insensitive\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:case\-inverted\fP, \fBre2c:flags:case\-inverted\fP +Same as the \fB\-\-case\-inverted\fP option, but can be configured on per\-block +basis. 
+.TP +.B \fBre2c:case\-ranges\fP, \fBre2c:flags:case\-ranges\fP +Same as the \fB\-\-case\-ranges\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos\fP, \fBre2c:flags:computed\-gotos\fP, \fBre2c:flags:g\fP +Same as the \fB\-\-computed\-gotos\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos:threshold\fP, \fBre2c:cgoto:threshold\fP +If computed \fBgoto\fP is used, this configuration specifies the complexity +threshold that triggers the generation of jump tables instead of nested +\fBif\fP statements and bitmaps. The default value is \fB9\fP\&. +.TP +.B \fBre2c:cond:goto\fP +Specifies a piece of code used for the autogenerated shortcut rules \fB:=>\fP +in conditions. The default is \fBgoto @@;\fP\&. +The \fB@@\fP placeholder is substituted with condition name (see +configurations \fBre2c:api:sigil\fP and \fBre2c:cond:goto@cond\fP). +.TP +.B \fBre2c:cond:goto@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:goto\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:divider\fP +Defines the divider for condition blocks. +The default value is \fB/* *********************************** */\fP\&. +Placeholders are substituted with condition name (see \fBre2c:api:sigil\fP and +\fBre2c:cond:divider@cond\fP). +.TP +.B \fBre2c:cond:divider@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:divider\fP +definition. The default is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:prefix\fP, \fBre2c:condprefix\fP +Specifies the prefix used for condition labels. +The default is \fByyc_\fP\&. +.TP +.B \fBre2c:cond:enumprefix\fP, \fBre2c:condenumprefix\fP +Specifies the prefix used for condition identifiers. +The default is \fByyc\fP\&.
+.TP +.B \fBre2c:debug\-output\fP, \fBre2c:flags:debug\-output\fP, \fBre2c:flags:d\fP +Same as the \fB\-\-debug\-output\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:define:YYBACKUP\fP +Defines generic API primitive \fBYYBACKUP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYBACKUPCTX\fP +Defines generic API primitive \fBYYBACKUPCTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYCONDTYPE\fP +Defines \fBYYCONDTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTYPE\fP +Defines \fBYYCTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTXMARKER\fP +Defines API primitive \fBYYCTXMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCURSOR\fP +Defines API primitive \fBYYCURSOR\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYDEBUG\fP +Defines API primitive \fBYYDEBUG\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL\fP +Defines API primitive \fBYYFILL\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL@len\fP +Specifies the sigil used for argument substitution in \fBYYFILL\fP +definition. Defaults to \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYFILL:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for \fBYYFILL\fP\&. +Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETCONDITION\fP +Defines API primitive \fBYYGETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETSTATE\fP +Defines API primitive \fBYYGETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETSTATE\fP\&. 
Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYLESSTHAN\fP +Defines generic API primitive \fBYYLESSTHAN\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYLIMIT\fP +Defines API primitive \fBYYLIMIT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMARKER\fP +Defines API primitive \fBYYMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGN\fP +Defines generic API primitive \fBYYMTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGP\fP +Defines generic API primitive \fBYYMTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYPEEK\fP +Defines generic API primitive \fBYYPEEK\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYRESTORE\fP +Defines generic API primitive \fBYYRESTORE\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORECTX\fP +Defines generic API primitive \fBYYRESTORECTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORETAG\fP +Defines generic API primitive \fBYYRESTORETAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSETCONDITION\fP +Defines API primitive \fBYYSETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETCONDITION@cond\fP +Specifies the sigil used for argument substitution in \fBYYSETCONDITION\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYSETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSETSTATE\fP +Defines API primitive \fBYYSETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETSTATE@state\fP +Specifies the sigil used for argument substitution in \fBYYSETSTATE\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. 
+.TP +.B \fBre2c:define:YYSETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSKIP\fP +Defines generic API primitive \fBYYSKIP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFT\fP +Defines generic API primitive \fBYYSHIFT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFTMTAG\fP +Defines generic API primitive \fBYYSHIFTMTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSHIFTSTAG\fP +Defines generic API primitive \fBYYSHIFTSTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSTAGN\fP +Defines generic API primitive \fBYYSTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSTAGP\fP +Defines generic API primitive \fBYYSTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:empty\-class\fP, \fBre2c:flags:empty\-class\fP +Same as the \fB\-\-empty\-class\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:encoding:ebcdic\fP, \fBre2c:flags:ecb\fP, \fBre2c:flags:e\fP +Same as the \fB\-\-ebcdic\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:ucs2\fP, \fBre2c:flags:wide\-chars\fP, \fBre2c:flags:w\fP +Same as the \fB\-\-ucs2\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf8\fP, \fBre2c:flags:utf\-8\fP, \fBre2c:flags:8\fP +Same as the \fB\-\-utf8\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf16\fP, \fBre2c:flags:utf\-16\fP, \fBre2c:flags:x\fP +Same as the \fB\-\-utf16\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf32\fP, \fBre2c:flags:unicode\fP, \fBre2c:flags:u\fP +Same as the \fB\-\-utf32\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding\-policy\fP, \fBre2c:flags:encoding\-policy\fP +Same as the \fB\-\-encoding\-policy\fP option, but can be configured on per\-block +basis. 
+.TP +.B \fBre2c:eof\fP +Specifies the sentinel symbol used with the end\-of\-input rule \fB$\fP\&. The +default value is \fB\-1\fP (\fB$\fP rule is not used). Other possible values +include all valid code units. Only decimal numbers are recognized. +.TP +.B \fBre2c:header\fP, \fBre2c:flags:type\-header\fP, \fBre2c:flags:t\fP +Specifies the name of the generated header file relative to the directory of +the output file. Same as the \fB\-\-header\fP option except that the file path +is relative. +.TP +.B \fBre2c:indent:string\fP +Specifies the string used for indentation. The default is a single tab +character \fB\(dq\et\(dq\fP\&. Indent string should contain whitespace characters only. +To disable indentation entirely, set this configuration to an empty string. +.TP +.B \fBre2c:indent:top\fP +Specifies the minimum amount of indentation to use. The default value is +zero. The value should be a non\-negative integer number. +.TP +.B \fBre2c:invert\-captures\fP +Same as the \fB\-\-invert\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:label:prefix\fP, \fBre2c:labelprefix\fP +Specifies the prefix used for DFA state labels. The default is \fByy\fP\&. +.TP +.B \fBre2c:label:start\fP, \fBre2c:startlabel\fP +Controls the generation of a block start label. The default value is zero, +which means that the start label is generated only if it is used. An integer +value greater than zero forces the generation of start label even if it is +unused by the lexer. A string value also forces start label generation and +sets the label name to the specified string. This configuration applies only +to the current block (it is reset to default for the next block). +.TP +.B \fBre2c:label:yyFillLabel\fP +Specifies the prefix of \fBYYFILL\fP labels used with \fBre2c:eof\fP and in +storable state mode. +.TP +.B \fBre2c:label:yyloop\fP +Specifies the name of the label marking the start of the lexer loop with +\fB\-\-loop\-switch\fP option. 
The default is \fByyloop\fP\&. +.TP +.B \fBre2c:label:yyNext\fP +Specifies the name of the optional label that follows \fBYYGETSTATE\fP switch +in storable state mode (enabled with \fBre2c:state:nextlabel\fP). The default +is \fByyNext\fP\&. +.TP +.B \fBre2c:leftmost\-captures\fP +Same as the \fB\-\-leftmost\-captures\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:lookahead\fP, \fBre2c:flags:lookahead\fP +Deprecated (see the deprecated \fB\-\-no\-lookahead\fP option). +.TP +.B \fBre2c:nested\-ifs\fP, \fBre2c:flags:nested\-ifs\fP, \fBre2c:flags:s\fP +Same as the \fB\-\-nested\-ifs\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:posix\-captures\fP, \fBre2c:flags:posix\-captures\fP, \fBre2c:flags:P\fP +Same as the \fB\-\-posix\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:tags\fP, \fBre2c:flags:tags\fP, \fBre2c:flags:T\fP +Same as the \fB\-\-tags\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:tags:expression\fP +Specifies the expression used for tag variables. +By default re2c generates expressions of the form \fByyt\fP\&. This might +be inconvenient, for example if tag variables are defined as fields in a +struct. All occurrences of \fB@@{tag}\fP or \fB@@\fP are replaced with the +actual tag name. For example, \fBre2c:tags:expression = \(dqs.@@\(dq;\fP results +in expressions of the form \fBs.yyt\fP in the generated code. +See also \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:tags:prefix\fP +Specifies the prefix for tag variable names. The default is \fByyt\fP\&. +.TP +.B \fBre2c:sentinel\fP +Specifies the sentinel symbol used for the end\-of\-input checks (when bounds +checks are disabled with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP is not +set). 
This configuration does not affect code generation: its purpose is to +verify that the sentinel is not allowed in the middle of a rule, and ensure +that the lexer won\(aqt read past the end of buffer. The default value is +\fB\-1\fP (in that case re2c assumes that the sentinel is zero, which is the +most common case). Only decimal numbers are recognized. +.TP +.B \fBre2c:state:abort\fP +If set to a positive integer value, changes the default case in +\fBYYGETSTATE\fP switch: by default it aborts the program, and an explicit +\fB\-1\fP case contains transition to the start of the block. +.TP +.B \fBre2c:state:nextlabel\fP +Controls if the \fBYYGETSTATE\fP switch is followed by an \fByyNext\fP label +(the default value is zero, which corresponds to no label). +Alternatively one can use \fBre2c:label:start\fP to generate a specific start +label, or an explicit \fBgetstate:re2c\fP directive to generate the +\fBYYGETSTATE\fP switch separately from the lexer block. +.TP +.B \fBre2c:unsafe\fP, \fBre2c:flags:unsafe\fP +Same as the \fB\-\-no\-unsafe\fP option, but can be configured on per\-block +basis. +If set to zero, it suppresses the generation of \fBunsafe\fP wrappers around +\fBYYPEEK\fP\&. The default is non\-zero (wrappers are generated). +This configuration is specific to Rust. +.TP +.B \fBre2c:variable:yyaccept\fP +Specifies the name of the \fByyaccept\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yybm\fP +Specifies the name of the \fByybm\fP variable (used for bitmaps). +.TP +.B \fBre2c:variable:yybm:hex\fP, \fBre2c:yybm:hex\fP +If set to nonzero, bitmaps for the \fB\-\-bit\-vectors\fP option are generated +in hexadecimal format. The default is zero (bitmaps are in decimal format). +.TP +.B \fBre2c:variable:yych\fP +Specifies the name of the \fByych\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yych:emit\fP, \fBre2c:yych:emit\fP +If set to zero, \fByych\fP definition is not generated.
+The default is non\-zero. +.TP +.B \fBre2c:variable:yych:conversion\fP, \fBre2c:yych:conversion\fP +If set to non\-zero, re2c automatically generates a conversion to \fBYYCTYPE\fP +every time \fByych\fP is read. The default is zero (no conversion). +.TP +.B \fBre2c:variable:yyctable\fP +Specifies the name of the \fByyctable\fP variable (the jump table generated +for \fBYYGETCONDITION\fP switch with \fB\-\-computed\-gotos\fP option). +.TP +.B \fBre2c:variable:yytarget\fP +Specifies the name of the \fByytarget\fP variable. +.TP +.B \fBre2c:variable:yystable\fP +Deprecated. +.TP +.B \fBre2c:variable:yystate\fP +Specifies the name of the \fByystate\fP variable (used with the +\fB\-\-loop\-switch\fP option to store the current DFA state). +.TP +.B \fBre2c:yyfill:check\fP +If set to zero, suppresses the generation of pre\-\fBYYFILL\fP check for the +number of input characters (the \fBYYLESSTHAN\fP definition in generic API and +the \fBYYLIMIT\fP\-based comparison in C pointer API). The default is non\-zero +(generate the check). +.TP +.B \fBre2c:yyfill:enable\fP +If set to zero, suppresses the generation of \fBYYFILL\fP (together +with the check). This should be used when the whole input fits into one piece +of memory (there is no need for buffering) and the end\-of\-input checks do not +rely on the \fBYYFILL\fP checks (e.g. if a sentinel character is used). +Use warnings (\fB\-W\fP option) and \fBre2c:sentinel\fP configuration to verify +that the generated lexer cannot read past the end of input. +The default is non\-zero (\fBYYFILL\fP is enabled). +.TP +.B \fBre2c:yyfill:parameter\fP +If set to zero, suppresses the generation of parameter passed to \fBYYFILL\fP\&. +The parameter is the minimum number of characters that must be supplied. +Defaults to non\-zero (the parameter is generated). +This configuration can be overridden with \fBre2c:define:YYFILL:naked\fP or +\fBre2c:api:style\fP\&.
+.UNINDENT +.SH REGULAR EXPRESSIONS +.sp +re2c uses the following syntax for regular expressions: +.INDENT 0.0 +.IP \(bu 2 +\fB\(dqfoo\(dq\fP case\-sensitive string literal +.IP \(bu 2 +\fB\(aqfoo\(aq\fP case\-insensitive string literal +.IP \(bu 2 +\fB[a\-xyz]\fP, \fB[^a\-xyz]\fP character class (possibly negated) +.IP \(bu 2 +\fB\&.\fP any character except newline +.IP \(bu 2 +\fBR \e S\fP difference of character classes \fBR\fP and \fBS\fP +.IP \(bu 2 +\fBR*\fP zero or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR+\fP one or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR?\fP optional \fBR\fP +.IP \(bu 2 +\fBR{n}\fP repetition of \fBR\fP exactly \fBn\fP times +.IP \(bu 2 +\fBR{n,}\fP repetition of \fBR\fP at least \fBn\fP times +.IP \(bu 2 +\fBR{n,m}\fP repetition of \fBR\fP from \fBn\fP to \fBm\fP times +.IP \(bu 2 +\fB(R)\fP just \fBR\fP; parentheses are used to override precedence. +If submatch extraction is enabled, \fB(R)\fP is a capturing or a +non\-capturing group depending on \fB\-\-invert\-captures\fP option. +.IP \(bu 2 +\fB(!R)\fP +If submatch extraction is enabled, \fB(!R)\fP is a non\-capturing or a +capturing group depending on \fB\-\-invert\-captures\fP option. 
+.IP \(bu 2 +\fBR S\fP concatenation: \fBR\fP followed by \fBS\fP +.IP \(bu 2 +\fBR | S\fP alternative: \fBR or S\fP +.IP \(bu 2 +\fBR / S\fP lookahead: \fBR\fP followed by \fBS\fP, but \fBS\fP is not consumed +.IP \(bu 2 +\fBname\fP the regular expression defined as \fBname\fP (or literal string +\fB\(dqname\(dq\fP in Flex compatibility mode) +.IP \(bu 2 +\fB{name}\fP the regular expression defined as \fBname\fP in Flex +compatibility mode +.IP \(bu 2 +\fB@stag\fP an \fIs\-tag\fP: saves the last input position at which \fB@stag\fP +matches in a variable named \fBstag\fP +.IP \(bu 2 +\fB#mtag\fP an \fIm\-tag\fP: saves all input positions at which \fB#mtag\fP matches +in a variable named \fBmtag\fP +.UNINDENT +.sp +Character classes and string literals may contain the following escape +sequences: \fB\ea\fP, \fB\eb\fP, \fB\ef\fP, \fB\en\fP, \fB\er\fP, \fB\et\fP, \fB\ev\fP, \fB\e\e\fP, +octal escapes \fB\eooo\fP and hexadecimal escapes \fB\exhh\fP, \fB\euhhhh\fP and +\fB\eUhhhhhhhh\fP\&. +.SH HANDLING THE END OF INPUT +.sp +One of the main problems for the lexer is to know when to stop. +There are a few terminating conditions: +.INDENT 0.0 +.IP \(bu 2 +the lexer may match some rule (including default rule \fB*\fP) and come to a +final state +.IP \(bu 2 +the lexer may fail to match any rule and come to a default state +.IP \(bu 2 +the lexer may reach the end of input +.UNINDENT +.sp +The first two conditions terminate the lexer in a \(dqnatural\(dq way: it comes to a +state with no outgoing transitions, and the matching automatically stops. The +third condition, end of input, is different: it may happen in any state, and the +lexer should be able to handle it. Checking for the end of input interrupts the +normal lexer workflow and adds conditional branches to the generated program, +therefore it is necessary to minimize the number of such checks. re2c supports a +few different methods for handling the end of input. 
Which one to use depends on +the complexity of regular expressions, the need for buffering, performance +considerations and other factors. Here is a list of methods: +.INDENT 0.0 +.IP \(bu 2 +\fBSentinel.\fP +This method eliminates the need for the end of input checks altogether. It is +simple and efficient, but limited to the case when there is a natural +\(dqsentinel\(dq character that can never occur in valid input. This character may +still occur in invalid input, but it should not be allowed by the regular +expressions, except perhaps as the last character of a rule. The sentinel is +appended at the end of input and serves as a stop signal: when the lexer reads +this character, it is either a syntax error or the end of input. In both +cases the lexer should stop. This method is used if \fBYYFILL\fP is disabled +with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP has the default value +\fB\-1\fP\&. +.nf + +.fi +.sp +.IP \(bu 2 +\fBSentinel with bounds checks.\fP +This method is generic: it allows handling any input without restrictions on +the regular expressions. The idea is to reduce the number of end of input +checks by performing them only on certain characters. Similar to the +\(dqsentinel\(dq method, one of the characters is chosen as a \(dqsentinel\(dq and +appended at the end of input. However, there is no restriction on where the +sentinel may occur (in fact, any character can be chosen for a sentinel). +When the lexer reads this character, it additionally performs a bounds check. +If the current position is within bounds, the lexer resumes matching and +handles the sentinel as a regular character. Otherwise it invokes \fBYYFILL\fP +(unless it is disabled). If more input is supplied, the lexer will rematch the +last character and continue as if the sentinel wasn\(aqt there. Otherwise it must +be the real end of input, and the lexer stops.
This method is used when +\fBre2c:eof\fP has a non\-negative value (it should be set to the numeric value of +the sentinel). \fBYYFILL\fP is optional. +.nf + +.fi +.sp +.IP \(bu 2 +\fBBounds checks with padding.\fP +This method is generic, and it may be faster than the \(dqsentinel with bounds +checks\(dq method, but it is also more complex. The idea is to partition DFA +states into strongly connected components (SCCs) and generate a single check +per SCC for enough characters to cover the longest non\-looping path in this +SCC. This reduces the number of checks, but there is a problem with short +lexemes at the end of input, as the check requires enough characters to cover +the longest lexeme. This can be fixed by padding the input with a few fake +characters that do not form a valid lexeme suffix (so that the lexer cannot +match them). The length of padding should be \fBYYMAXFILL\fP, generated with +\fB/*!max:re2c*/\fP\&. If there is not enough input, the lexer invokes \fBYYFILL\fP +which should supply at least the required number of characters or not return. +This method is used if \fBYYFILL\fP is enabled and \fBre2c:eof\fP is \fB\-1\fP +(this is the default configuration). +.nf + +.fi +.sp +.IP \(bu 2 +\fBCustom checks.\fP +Generic API allows overriding basic operations like reading a character, +which makes it possible to include the end\-of\-input checks as part of them. +This approach is error\-prone and should be used with caution. To use a custom +method, enable generic API with \fB\-\-api custom\fP or \fBre2c:api = custom;\fP and +disable default bounds checks with \fBre2c:yyfill:enable = 0;\fP or +\fBre2c:yyfill:check = 0;\fP\&. +.UNINDENT +.sp +The following subsections contain an example of each method. +.SS Sentinel +.sp +This example uses a sentinel character to handle the end of input. The program +counts space\-separated words in a null\-terminated string.
The sentinel is null: +it is the last character of each input string, and it is not allowed in the +middle of a lexeme by any of the rules (in particular, it is not included in +character ranges where it is easy to overlook). If a null occurs in the middle +of a string, it is a syntax error and the lexer will match default rule \fB*\fP, +but it won\(aqt read past the end of input or crash (use +\fI\%\-Wsentinel\-in\-midrule\fP +warning and \fBre2c:sentinel\fP configuration to verify this). Configuration +\fBre2c:yyfill:enable = 0;\fP suppresses the generation of bounds checks and +\fBYYFILL\fP invocations. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import Data.ByteString (ByteString, index) + +data State = State { + _yyinput :: ByteString, + _yycursor :: Int, + _count :: Int +} + +\-\- expect a null\-terminated string +%{ + re2c:define:YYFN = [\(dqlexer;Int\(dq, \(dqState{..};State\(dq]; + re2c:yyfill:enable = 0; + + * { (\-1) } + [\ex00] { _count } + [a\-z]+ { lexer State{_count = _count + 1, ..} } + [ ]+ { lexer State{..} } +%} + +main :: IO () +main = do + let test s n = when (lexer st /= n) $ error \(dqfailed\(dq + where st = State{_yyinput = s, _yycursor = 0, _count = 0} + test \(dq\e0\(dq 0 + test \(dqone two three\e0\(dq 3 + test \(dqf0ur\e0\(dq (\-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Sentinel with bounds checks +.sp +This example uses sentinel with bounds checks to handle the end of input (this +method was added in version 1.2). The program counts space\-separated +single\-quoted strings. The sentinel character is null, which is specified with +\fBre2c:eof = 0;\fP configuration. 
As in the \fI\%sentinel\fP method, null is the last +character of each input string, but it is allowed in the middle of a rule (for +example, \fB\(aqaaa\e0aa\(aq\e0\fP is valid input, but \fB\(aqaaa\e0\fP is a syntax error). +Bounds checks are generated in each state that matches an input character, but +they are scoped to the branch that handles null. Bounds checks are of the form +\fBYYLIMIT <= YYCURSOR\fP or \fBYYLESSTHAN(1)\fP with generic API. If the check +condition is true, the lexer has reached the end of input and should stop +(\fBYYFILL\fP is disabled with \fBre2c:yyfill:enable = 0;\fP as the input fits into +one buffer, see the \fI\%YYFILL with sentinel\fP section for an example that uses +\fBYYFILL\fP). Reaching the end of input opens three possibilities: if the lexer +is in the initial state it will match the end\-of\-input rule \fB$\fP, otherwise it +may fall back to a previously matched rule (including default rule \fB*\fP) or go +to a default state, causing +\fI\%\-Wundefined\-control\-flow\fP\&.
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import qualified Data.ByteString as BS +import Data.Word + +data State = State { + _yyinput :: BS.ByteString, + _yycursor :: Int, + _yymarker :: Int, + _yylimit :: Int, + _count :: Int +} + +\-\- expect a null\-terminated string +%{ + re2c:define:YYFN = [\(dqlexer;Int\(dq, \(dqState{..};State\(dq]; + re2c:define:YYCTYPE = \(dqWord8\(dq; + re2c:define:YYPEEK = \(dqBS.index\(dq; + re2c:eof = 0; + re2c:yyfill:enable = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { (\-1) } + $ { _count } + str { lexer State{_count = _count + 1, ..} } + [ ]+ { lexer State{..} } +%} + +main :: IO () +main = do + let test s n = do + let st = State { + _yyinput = s, + _yycursor = 0, + _yymarker = 0, + _yylimit = BS.length s \- 1, \-\- terminating null not included + _count = 0} + + when (lexer st /= n) $ error \(dqfailed\(dq + + test \(dq\e0\(dq 0 + test \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \e0\(dq 3 + test \(dq\(aqunterminated\e\e\(aq\e0\(dq (\-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Bounds checks with padding +.sp +This example uses bounds checks with padding to handle the end of input (this +method is enabled by default). The program counts space\-separated single\-quoted +strings. There is a padding of \fBYYMAXFILL\fP null characters appended at the end +of input, where \fBYYMAXFILL\fP value is autogenerated with \fB/*!max:re2c*/\fP\&. It +is not necessary to use null for padding \-\-\- any characters can be used as long +as they do not form a valid lexeme suffix (in this example padding should not +contain single quotes, as they may be mistaken for a suffix of a single\-quoted +string). 
There is a \(dqstop\(dq rule that matches the first padding character (null) +and terminates the lexer (note that it checks if null is at the beginning of +padding, otherwise it is a syntax error). Bounds checks are generated only in +some states that are determined by the strongly connected components of the +underlying automaton. Checks have the form \fB(YYLIMIT \- YYCURSOR) < n\fP or +\fBYYLESSTHAN(n)\fP with generic API, where \fBn\fP is the minimum number of +characters that are needed for the lexer to proceed (it also means that the next +bounds check will occur in at most \fBn\fP characters). If the check condition is +true, the lexer has reached the end of input and will invoke \fBYYFILL(n)\fP that +should either supply at least \fBn\fP input characters or not return. In this +example \fBYYFILL\fP always fails and terminates the lexer with an error (which is +fine because the input fits into one buffer). See the \fI\%YYFILL with padding\fP +section for an example that refills the input buffer with \fBYYFILL\fP\&. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Exception +import Control.Monad (when) +import qualified Data.ByteString as BS + +data State = State { + _yyinput :: BS.ByteString, + _yycursor :: Int, + _yylimit :: Int, + _count :: Int +} + +data FillException = UnexpectedFill deriving (Show) +instance Exception FillException + +yymaxfill :: Int +%{max %} + +%{ + re2c:define:YYFN = [\(dqlexer;IO Int\(dq, \(dqState{..};State\(dq]; + re2c:define:YYPEEK = \(dqBS.index\(dq; + re2c:define:YYFILL = \(dqthrow UnexpectedFill\(dq; + re2c:monadic = 1; // YYFILL requires monadic do\-notation for \(gawhen\(ga conditions + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + \-\- check that it is the sentinel, not some unexpected null + return $ if _yycursor == BS.length _yyinput \- yymaxfill + 1 then _count else (\-1) + } + str { lexer State{_count = _count + 1, ..} } + [ ]+ { lexer State{..} } + * { return (\-1) } +%} + +main :: IO () +main = do + let test s n = do + let buf = BS.concat [s, BS.replicate yymaxfill 0] + let st = State { + _yyinput = buf, + _yycursor = 0, + _yylimit = BS.length buf, + _count = 0} + m <\- catch (lexer st) (\e(_ :: FillException) \-> return (\-2)) + when (m /= n) $ error \(dqfailed\(dq + + test \(dq\(dq 0 + test \(dq\(aqunterminated\e\e\(aq\(dq (\-2) + test \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq 3 + test \(dq\(aqunexpected\(aq \e0 \(aqnull\(aq\(dq (\-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Custom checks +.sp +This example uses a custom end\-of\-input handling method based on generic API. +The program counts space\-separated words. It is the same as the +\fI\%sentinel\fP example, except that the input is not null\-terminated.
To cover up +for the absence of a sentinel character at the end of input, \fBYYPEEK\fP is +redefined to perform a bounds check before it reads the next input character. +This is inefficient because checks are done very often. If the check condition +fails, \fBYYPEEK\fP returns the real character, otherwise it returns a fake +sentinel character. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import qualified Data.ByteString as BS + +data State = State { + _str :: BS.ByteString, + _cur :: Int, + _lim :: Int, + _cnt :: Int +} + +\-\- Expect a string without terminating null. +%{ + re2c:api = generic; + re2c:define:YYFN = [\(dqlexer;Int\(dq, \(dqState{..};State\(dq]; + re2c:define:YYPEEK = \(dqif _cur < _lim then BS.index _str _cur else 0\(dq; + re2c:define:YYSKIP = \(dqlet cur = _cur + 1 in let _cur = cur in\(dq; + re2c:yyfill:enable = 0; + + * { (\-1) } + [\ex00] { _cnt } + [a\-z]+ { lexer State{_cnt = _cnt + 1, ..} } + [ ]+ { lexer State{..} } +%} + +main :: IO () +main = do + let test s n = + let st = State {_str = s, _cur = 0, _lim = BS.length s, _cnt = 0} + in when (lexer st /= n) $ error \(dqfailed\(dq + + test \(dq\(dq 0 + test \(dqone two three \(dq 3 + test \(dqf0ur\(dq (\-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH BUFFER REFILLING +.sp +The need for buffering arises when the input cannot be mapped in memory all at +once: either it is too large, or it comes in a streaming fashion (like reading +from a socket). The usual technique in such cases is to allocate a fixed\-sized +memory buffer and process input in chunks that fit into the buffer. When the +current chunk is processed, it is moved out and new data is moved in. 
In +practice it is somewhat more complex, because lexer state consists not of a +single input position, but a set of interrelated positions: +.INDENT 0.0 +.IP \(bu 2 +cursor: the next input character to be read (\fBYYCURSOR\fP in C pointer API or +\fBYYSKIP\fP/\fBYYPEEK\fP in generic API) +.IP \(bu 2 +limit: the position after the last available input character (\fBYYLIMIT\fP in +C pointer API, implicitly handled by \fBYYLESSTHAN\fP in generic API) +.IP \(bu 2 +marker: the position of the most recent match, if any (\fBYYMARKER\fP in default +API or \fBYYBACKUP\fP/\fBYYRESTORE\fP in generic API) +.IP \(bu 2 +token: the start of the current lexeme (implicit in re2c API, as it is not +needed for the normal lexer operation and can be defined and updated by the +user) +.IP \(bu 2 +context marker: the position of the trailing context (\fBYYCTXMARKER\fP in +C pointer API or \fBYYBACKUPCTX\fP/\fBYYRESTORECTX\fP in generic API) +.IP \(bu 2 +tag variables: submatch positions (defined with \fB/*!stags:re2c*/\fP and +\fB/*!mtags:re2c*/\fP directives and +\fBYYSTAGP\fP/\fBYYSTAGN\fP/\fBYYMTAGP\fP/\fBYYMTAGN\fP in generic API) +.UNINDENT +.sp +Not all these are used in every case, but if used, they must be updated by +\fBYYFILL\fP\&. All active positions are contained in the segment between token and +cursor, therefore everything between buffer start and token can be discarded, +the segment from token and up to limit should be moved to the beginning of +buffer, and the free space at the end of buffer should be filled with new data. +In order to avoid frequent \fBYYFILL\fP calls it is best to fill in as many input +characters as possible (even though fewer characters might suffice to resume the +lexer). The details of \fBYYFILL\fP implementation are slightly different +depending on which EOF handling method is used: the case of EOF rule is somewhat +simpler than the case of bounds\-checking with padding. 
Also note that if +\fB\-f \-\-storable\-state\fP option is used, \fBYYFILL\fP has slightly different +semantics (described in the section about storable state). +.SS YYFILL with sentinel +.sp +If EOF rule is used, \fBYYFILL\fP is a function\-like primitive that accepts +no arguments and returns a value which is checked against zero. \fBYYFILL\fP +invocation is triggered by condition \fBYYLIMIT <= YYCURSOR\fP in C pointer API and +\fBYYLESSTHAN()\fP in generic API. A non\-zero return value means that \fBYYFILL\fP +has failed. A successful \fBYYFILL\fP call must supply at least one character and +adjust input positions accordingly. Limit must always be set to one after the +last input position in buffer, and the character at the limit position must be +the sentinel symbol specified by \fBre2c:eof\fP configuration. The pictures below +show the relative locations of input positions in buffer before and after +\fBYYFILL\fP call (sentinel symbol is marked with \fB#\fP, and the second picture +shows the case when there is not enough input to fill the whole buffer). +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-\-\-\-\-\-\-\-\-\-E\-> + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-\-\-\-\-\-\-\-\-\-E#\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-E (EOF) + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-E#........ + buffer, marker cursor limit + token +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses EOF rule. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad +import Data.ByteString as BS +import Data.Word +import GHC.IO.Handle +import System.Directory +import System.IO + +chunk_size :: Int +chunk_size = 4096 + +data State = State { + _file :: !Handle, + _yyinput :: !BS.ByteString, + _yycursor :: !Int, + _yymarker :: !Int, + _yylimit :: !Int, + _token :: !Int, + _eof :: !Bool, + _count :: !Int +} + +%{ + re2c:define:YYFN = [\(dqlexer;IO Int\(dq, \(dqState{..};State;!State{..}\(dq]; + re2c:define:YYCTYPE = \(dqWord8\(dq; + re2c:define:YYPEEK = \(dqBS.index\(dq; + re2c:define:YYFILL = \(dq(State{..}, yyfill) <\- fill State{..}\(dq; + re2c:eof = 0; + re2c:monadic = 1; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return (\-1) } + $ { return _count } + str { lexer State{_token = _yycursor, _count = _count + 1, ..} } + [ ]+ { lexer State{_token = _yycursor, ..} } +%} + +fill :: State \-> IO (State, Bool) +fill State{..} = do + case _eof of + True \-> return (State{..}, False) + False \-> do + \-\- Discard everything up to the current token, cut off terminating null, + \-\- read new chunk from file and reappend terminating null at the end. + chunk <\- BS.hGet _file chunk_size + return (State { + _yyinput = BS.concat [(BS.init . BS.drop _token) _yyinput, chunk, \(dq\e0\(dq], + _yycursor = _yycursor \- _token, + _yymarker = _yymarker \- _token, + _yylimit = _yylimit \- _token + BS.length chunk, \-\- exclude terminating null + _token = 0, + _eof = BS.null chunk, \-\- end of file? + ..}, True) + +main :: IO () +main = do + let fname = \(dqinput\(dq + + \-\- Prepare input file. + BS.writeFile fname $ BS.concat [\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq | _ <\- [1..chunk_size]] + let expect = 3 * chunk_size \-\- the total number of strings in file + + \-\- Run lexer on the prepared file. 
+ fh <\- openFile fname ReadMode + let st = State { + _file = fh, + _yyinput = BS.singleton 0, + _yycursor = 0, + _yymarker = 0, + _token = 0, + _yylimit = 0, + _eof = False, + _count = 0 + } + result <\- lexer st + hClose fh + + \-\- Cleanup. + removeFile fname + + \-\- Check result. + when (result /= expect) $ error $ \(dqexpected \(dq ++ show expect ++ \(dq, got \(dq ++ show result + return () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS YYFILL with padding +.sp +In the default case (when EOF rule is not used) \fBYYFILL\fP is a function\-like +primitive that accepts a single argument and does not return any value. +\fBYYFILL\fP invocation is triggered by condition \fB(YYLIMIT \- YYCURSOR) < n\fP in +C pointer API and \fBYYLESSTHAN(n)\fP in generic API. The argument passed to +\fBYYFILL\fP is the minimal number of characters that must be supplied. If it +fails to do so, \fBYYFILL\fP must not return to the lexer (for that reason it is +best implemented as a macro that returns from the calling function on failure). +In case of a successful \fBYYFILL\fP invocation the limit position must be set +either to one after the last input position in buffer, or to the end of +\fBYYMAXFILL\fP padding (in case \fBYYFILL\fP has successfully read at least \fBn\fP +characters, but not enough to fill the entire buffer). The pictures below show +the relative locations of input positions in buffer before and after \fBYYFILL\fP +invocation (\fBYYMAXFILL\fP padding on the second picture is marked with \fB#\fP +symbols). 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F (EOF) + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F############### + buffer, marker cursor limit + token <\- YYMAXFILL \-> +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses bounds\-checking with padding. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad +import qualified Data.ByteString as BS +import GHC.IO.Handle +import System.Directory +import System.IO + +chunk_size :: Int +chunk_size = 4096 + +data State = State { + _file :: !Handle, + _yyinput :: !BS.ByteString, + _yycursor :: !Int, + _yylimit :: !Int, + _token :: !Int, + _eof :: !Bool, + _count :: !Int +} + +%{ + re2c:define:YYFN = [\(dqlexer;IO Int\(dq, \(dqState{..};State;!State{..}\(dq]; + re2c:define:YYPEEK = \(dqBS.index\(dq; + + // We have to turn off autogenerated YFILL check and write it manually as part of YYFILL + // implementation, so that we can propagate the updated state out of it. 
+ re2c:yyfill:check = 0; + re2c:define:YYFILL = \(dqState{..} <\- fill State{..} @@\(dq; + re2c:monadic = 1; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return (\-1) } + [\ex00] { return $ if _yycursor == _yylimit \- yymaxfill + 1 then _count else (\-1) } + str { lexer State{_token = _yycursor, _count = _count + 1, ..} } + [ ]+ { lexer State{_token = _yycursor, ..} } +%} + +yymaxfill :: Int +%{max %} + +fill :: State \-> Int \-> IO State +fill !st@State{..} !need = + if _yylimit \- _yycursor >= need then + return st + else case _eof of + True \-> error \(dqfill failed\(dq + False \-> do + \-\- Discard everything up to the current token, cut off terminating null, + \-\- read new chunk from file and reappend terminating null at the end. + chunk <\- BS.hGet _file chunk_size + let !eof = BS.length chunk < need \-\- end of file ? + let !buf = BS.concat [ + BS.drop _token _yyinput, + chunk, + if eof then (BS.replicate yymaxfill 0) else BS.empty] + return State { + _yyinput = buf, + _yycursor = _yycursor \- _token, + _yylimit = BS.length buf, + _token = 0, + _eof = eof, + ..} + +main :: IO () +main = do + let fname = \(dqinput\(dq + + \-\- Prepare input file. + BS.writeFile fname $ BS.concat [\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq | _ <\- [1..chunk_size]] + let expect = 3 * chunk_size \-\- the total number of strings in file + + \-\- Run lexer on the prepared file. + fh <\- openFile fname ReadMode + let st = State { + _file = fh, + _yyinput = BS.empty, + _yycursor = 0, + _token = 0, + _yylimit = 0, + _eof = False, + _count = 0 + } + result <\- lexer st + hClose fh + + \-\- Cleanup. + removeFile fname + + \-\- Check result. 
+ when (result /= expect) $ error $ \(dqexpected \(dq ++ show expect ++ \(dq, got \(dq ++ show result + return () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH MULTIPLE BLOCKS +.sp +Sometimes it is necessary to have multiple interrelated lexers (for example, if +there is a high\-level state machine that transitions between lexer modes). This +can be implemented using multiple connected re2c blocks. Another option is to +use \fI\%start conditions\fP\&. +.sp +The implementation of connections between blocks depends on the target language. +In languages that have \fBgoto\fP statement (such as C/C++ and Go) one can have +all blocks in one function, each of them prefixed with a label. Transition from +one block to another is a simple \fBgoto\fP\&. +In languages that do not have \fBgoto\fP (such as Rust) it is necessary to use a +loop with a switch on a state variable, similar to the \fByystate\fP loop/switch +generated by re2c, or else wrap each block in a function and use function calls. +.sp +The example below uses multiple blocks to parse binary, octal, decimal and +hexadecimal numbers. Each base has its own block. The initial block determines +base and dispatches to other blocks. Common configurations are defined in a +separate block at the beginning of the program; they are inherited by the other +blocks. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT \-i +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import Data.ByteString (ByteString, index) + +data State = State { + _yyinput :: !ByteString, + _yycursor :: !Int, + _yymarker :: !Int +} + +peek_digit :: ByteString \-> Int \-> Int \-> Int +peek_digit str idx offs = fromIntegral (index str (idx \- 1)) \- offs + +%{ + re2c:yyfill:enable = 0; +%} + +%{local + re2c:define:YYFN = [\(dqparse_bin;Int\(dq, \(dqState{..};State\(dq, \(dqnum;Int\(dq]; + [01] { parse_bin State{..} $ num * 2 + (peek_digit _yyinput _yycursor 48) } + * { num } +%} + +%{local + re2c:define:YYFN = [\(dqparse_oct;Int\(dq, \(dqState{..};State\(dq, \(dqnum;Int\(dq]; + [0\-7] { parse_oct State{..} $ num * 8 + (peek_digit _yyinput _yycursor 48) } + * { num } +%} + +%{local + re2c:define:YYFN = [\(dqparse_dec;Int\(dq, \(dqState{..};State\(dq, \(dqnum;Int\(dq]; + [0\-9] { parse_dec State{..} $ num * 10 + (peek_digit _yyinput _yycursor 48) } + * { num } +%} + +%{local + re2c:define:YYFN = [\(dqparse_hex;Int\(dq, \(dqState{..};State\(dq, \(dqnum;Int\(dq]; + [0\-9] { parse_hex State{..} $ num * 16 + (peek_digit _yyinput _yycursor 48) } + [a\-f] { parse_hex State{..} $ num * 16 + (peek_digit _yyinput _yycursor 87) } + [A\-F] { parse_hex State{..} $ num * 16 + (peek_digit _yyinput _yycursor 55) } + * { num } +%} + +%{local + re2c:define:YYFN = [\(dqparse;Maybe Int\(dq, \(dqState{..};State\(dq]; + \(aq0b\(aq / [01] { Just $ parse_bin State{..} 0 } + \(dq0\(dq { Just $ parse_oct State{..} 0 } + \(dq\(dq / [1\-9] { Just $ parse_dec State{..} 0 } + \(aq0x\(aq / [0\-9a\-fA\-F] { Just $ parse_hex State{..} 0 } + * { Nothing } +%} + +test :: ByteString \-> Maybe Int \-> IO () +test str expect = do + let s = State {_yyinput = str, _yycursor = 0, _yymarker = 0} + when (parse s /= expect) $ error \(dqfailed!\(dq + +main :: IO () +main = do + test \(dq\e0\(dq Nothing + 
test \(dq1234567890\e0\(dq (Just 1234567890) + test \(dq0b1101\e0\(dq (Just 13) + test \(dq0x7Fe\e0\(dq (Just 2046) + test \(dq0644\e0\(dq (Just 420) + test \(dq9223372036854775807\e0\(dq (Just 9223372036854775807) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH START CONDITIONS +.sp +Start conditions are enabled with \fB\-\-start\-conditions\fP option. They provide a +way to encode multiple interrelated automata within the same re2c block. +.sp +Each condition corresponds to a single automaton and has a unique name specified +by the user and a unique internal number defined by re2c. The numbers are used +to switch between conditions: the generated code uses \fBYYGETCONDITION\fP and +\fBYYSETCONDITION\fP primitives to get the current condition or set it to the +given number. Use \fB/*!conditions:re2c*/\fP directive or the \fB\-\-header\fP option +to generate numeric condition identifiers. Configuration +\fBre2c:cond:enumprefix\fP specifies the generated identifier prefix. +.sp +In condition mode every rule must be prefixed with a list of comma\-separated +condition names in angle brackets, or a wildcard \fB<*>\fP to denote all +conditions. The rule syntax is extended as follows: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB< cond\-list > regexp action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp => cond action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP, sets the current condition to \fBcond\fP and +executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp :=> cond\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and immediately transitions to \fBcond\fP (there is +no semantic action). +.TP +.B \fB<! cond\-list > action\fP +The \fBaction\fP is prepended to semantic actions of all rules for every +condition on the \fBcond\-list\fP\&.
This may be used to deduplicate common +code. +.TP +.B \fB< > action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and executes the \fBaction\fP\&. +.TP +.B \fB< > => cond action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string, sets the current condition to +\fBcond\fP and executes the \fBaction\fP\&. +.TP +.B \fB< > :=> cond\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and immediately transitions to +\fBcond\fP\&. +.UNINDENT +.UNINDENT +.UNINDENT +.sp +The code re2c generates for conditions depends on whether re2c uses goto/label +approach or loop/switch approach to encode the automata. +.sp +In languages that have \fBgoto\fP statement (such as C/C++ and Go) conditions are +naturally implemented as blocks of code prefixed with labels of the form +\fByyc_<cond>\fP, where \fBcond\fP is a condition name (label prefix can be changed +with \fBre2c:cond:prefix\fP). Transitions between conditions are implemented using +\fBgoto\fP and condition labels. Before all conditions re2c generates an initial +switch on \fBYYGETCONDITION\fP that jumps to the start state of the current condition. +The shortcut rules \fB:=>\fP bypass the initial switch and jump directly to the +specified condition (\fBre2c:cond:goto\fP can be used to change the default +behavior). The rules with semantic actions do not automatically jump to the next +condition; this should be done by the user\-defined action code. +.sp +In languages that do not have \fBgoto\fP (such as Rust) re2c reuses the +\fByystate\fP variable to store condition numbers. Each condition gets a numeric +identifier equal to the number of its start state, and a switch between +conditions is no different than a switch between DFA states of a single +condition.
There is no need for a separate initial condition switch. +(Since the same approach is used to implement storable states, +\fBYYGETCONDITION\fP/\fBYYSETCONDITION\fP are redundant if both storable states and +conditions are used). +.sp +The program below uses start conditions to parse binary, octal, decimal and +hexadecimal numbers. There is a single block where each base has its own +condition, and the initial condition is connected to all of them. User\-defined +variable \fBcond\fP stores the current condition number; it is initialized to the +number of the initial condition generated with \fB/*!conditions:re2c*/\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT \-ci +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import Data.ByteString (ByteString, index) + +%{conditions %} + +data State = State { + _yyinput :: !ByteString, + _yycursor :: !Int, + _yymarker :: !Int, + _yycond :: !YYCONDTYPE +} + +peek_digit :: ByteString \-> Int \-> Int \-> Int +peek_digit str idx offs = fromIntegral (index str (idx \- 1)) \- offs + +%{ + re2c:define:YYFN = [\(dqparse;Maybe Int\(dq, \(dqState{..};State\(dq, \(dq_num;Int\(dq]; + re2c:yyfill:enable = 0; + + <init> \(aq0b\(aq / [01] :=> bin + <init> \(dq0\(dq :=> oct + <init> \(dq\(dq / [1\-9] :=> dec + <init> \(aq0x\(aq / [0\-9a\-fA\-F] :=> hex + <init> * { Nothing } + + <bin> [01] { yyfnbin State{..} $ _num * 2 + (peek_digit _yyinput _yycursor 48) } + <oct> [0\-7] { yyfnoct State{..} $ _num * 8 + (peek_digit _yyinput _yycursor 48) } + <dec> [0\-9] { yyfndec State{..} $ _num * 10 + (peek_digit _yyinput _yycursor 48) } + <hex> [0\-9] { yyfnhex State{..} $ _num * 16 + (peek_digit _yyinput _yycursor 48) } + <hex> [a\-f] { yyfnhex State{..} $ _num * 16 + (peek_digit _yyinput _yycursor 87) } + <hex> [A\-F] { yyfnhex State{..} $ _num * 16 + (peek_digit _yyinput _yycursor 55) } + + <bin, oct, dec, hex> * { Just _num } +%} + +test :: ByteString \-> Maybe Int \-> IO () +test str expect = do + let s = State { + _yyinput = str,
+ _yycursor = 0, + _yymarker = 0, + _yycond = YYC_init} + when (parse s 0 /= expect) $ error \(dqfailed!\(dq + +main :: IO () +main = do + test \(dq\e0\(dq Nothing + test \(dq1234567890\e0\(dq (Just 1234567890) + test \(dq0b1101\e0\(dq (Just 13) + test \(dq0x7Fe\e0\(dq (Just 2046) + test \(dq0644\e0\(dq (Just 420) + test \(dq9223372036854775807\e0\(dq (Just 9223372036854775807) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH STORABLE STATE +.sp +With \fB\-\-storable\-state\fP option re2c generates a lexer that can store +its current state, return to the caller, and later resume operations exactly +where it left off. The default mode of operation in re2c is a \(dqpull\(dq model, +in which the lexer \(dqpulls\(dq more input whenever it needs it. This may be +unacceptable in cases when the input becomes available piece by piece (for +example, if the lexer is invoked by the parser, or if the lexer program +communicates via a socket protocol with some other program that must wait for a +reply from the lexer before it transmits the next message). Storable state +feature is intended exactly for such cases: it allows one to generate lexers that +work in a \(dqpush\(dq model. When the lexer needs more input, it stores its state and +returns to the caller. Later, when more input becomes available, the caller +resumes the lexer exactly where it stopped. There are a few changes necessary +compared to the \(dqpull\(dq model: +.INDENT 0.0 +.IP \(bu 2 +Define \fBYYGETSTATE()\fP and \fBYYSETSTATE(state)\fP primitives. +.IP \(bu 2 +Define \fByych\fP, \fByyaccept\fP (if used) and \fBstate\fP variables as a part of +persistent lexer state. The \fBstate\fP variable should be initialized to \fB\-1\fP\&. +.IP \(bu 2 +\fBYYFILL\fP should return to the outer program instead of trying to supply more +input. Return code should indicate that lexer needs more input. +.IP \(bu 2 +The outer program should recognize situations when lexer needs more input and +respond appropriately.
+.IP \(bu 2 +Optionally use \fBgetstate:re2c\fP to generate \fBYYGETSTATE\fP switch detached +from the main lexer. This only works for languages that have \fBgoto\fP (not in +\fB\-\-loop\-switch\fP mode). +.IP \(bu 2 +Use \fBre2c:eof\fP and the \fI\%sentinel with bounds checks\fP method to handle the +end of input. Padding\-based method may not work because it is unclear when to +append padding: the current end of input may not be the ultimate end of input, +and appending padding too early may cut off a partially read greedy lexeme. +Furthermore, due to high\-level program logic getting more input may depend on +processing the lexeme at the end of buffer (which already is blocked due to +the end\-of\-input condition). +.UNINDENT +.sp +Here is an example of a \(dqpush\(dq model lexer that simulates reading packets from a +socket. The lexer loops until it encounters the end of input and returns to the +calling function. The calling function provides more input by \(dqsending\(dq the next +packet and resumes lexing. This process stops when all the packets have been +sent, or when there is an error. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT \-fi +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Concurrent.Chan +import Control.Monad +import Data.ByteString as BS +import Text.Printf + +debug :: IO () \-> IO () +debug = when False + +data State = State { + _pipe :: !(Chan BS.ByteString), + _yyinput :: !BS.ByteString, + _yycursor :: !Int, + _yymarker :: !Int, + _yylimit :: !Int, + _token :: !Int, + _eof :: !Bool, + _yystate :: !Int, + _recv :: !Int +} + +data Status = End | Ready | Waiting | BadPacket deriving (Eq) + +%{ + re2c:define:YYFN = [\(dqlexer;IO (State, Status)\(dq, \(dqState{..};State;!State{..}\(dq]; + re2c:define:YYPEEK = \(dqBS.index\(dq; + re2c:define:YYFILL = \(dqreturn (State{..}, Waiting)\(dq; + re2c:eof = 0; + re2c:monadic = 1; + + packet = [a\-z]+[;]; + + * { return (State{..}, BadPacket) } + $ { return (State{..}, End) } + packet { lexer State{_token = _yycursor, _recv = _recv + 1, ..} } +%} + +fill :: State \-> IO (State, Status) +fill st@State{..} = do + case _eof of + True \-> return (st, End) + False \-> do + \-\- Discard everything up to the current token, cut off terminating null, + \-\- read new chunk from file and reappend terminating null at the end. + chunk <\- readChan _pipe + return (State { + _yyinput = BS.concat [(BS.init . BS.drop _token) _yyinput, chunk, \(dq\e0\(dq], + _yycursor = _yycursor \- _token, + _yymarker = _yymarker \- _token, + _yylimit = _yylimit \- _token + BS.length chunk, \-\- exclude terminating null + _token = 0, + _eof = BS.null chunk, \-\- end of file? 
+ ..}, Ready) + +loop :: State \-> [BS.ByteString] \-> IO Status +loop State{..} packets = do + (State{..}, status) <\- lexer State{..} + case status of + End \-> do + debug $ printf \(dqdone: got %d packets\en\(dq _recv + return End + Waiting \-> do + debug $ printf \(dqwaiting...\en\(dq + packets\(aq <\- case packets of + [] \-> do + writeChan _pipe BS.empty + return [] + p:ps \-> do + debug $ printf \(dqsent packet \(aq%s\(aq\en\(dq (show p) + writeChan _pipe p + return ps + (State{..}, status\(aq) <\- fill State{..} + case status\(aq of + Ready \-> loop State{..} packets\(aq + _ \-> error \(dqunexpected status after fill\(dq + BadPacket \-> do + debug $ printf \(dqerror: ill\-formed packet\en\(dq + return BadPacket + _ \-> error \(dqunexpected status\(dq + +test :: [BS.ByteString] \-> Status \-> IO () +test packets expect = do + pipe <\- newChan \-\- emulate pipe using a chan of bytestrings + let st = State { + _pipe = pipe, + _yyinput = BS.singleton 0, \-\- null sentinel triggers YYFILL + _yycursor = 0, + _yymarker = 0, + _token = 0, + _yylimit = 0, + _eof = False, + _yystate = \-1, + _recv = 0 + } + status <\- loop st packets + when (status /= expect) $ error \(dqfailed\(dq + return () + +main :: IO () +main = do + test [] End + test [\(dqze\(dq, \(dqro;o\(dq, \(dqne\(dq, \(dq;t\(dq, \(dqwo;thr\(dq, \(dqe\(dq, \(dqe\(dq, \(dq;\(dq, \(dqfour;\(dq] End + test [\(dqzer0;\(dq] BadPacket + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH REUSABLE BLOCKS +.sp +Reusable blocks are re2c blocks that can be reused any number of times and +combined with other re2c blocks. They are defined with +\fB/*!rules:re2c[:<name>] ... */\fP (the \fB<name>\fP is optional). A rules block +can be used in two contexts: either in a use block, or in a use directive inside +of another block. The code for a rules block is generated at every point of use. +.sp +Use blocks are defined with \fB/*!use:re2c[:<name>] ... */\fP\&.
The \fB<name>\fP +is optional; if not specified, the associated rules block is the most recent one +(whether named or unnamed). A use block can add named definitions, +configurations and rules of its own. +An important use case for use blocks is a lexer that supports multiple input +encodings: the same rules block is reused multiple times with encoding\-specific +configurations (see the example below). +.sp +In\-block use directive \fB!use:<name>;\fP can be used from inside of a re2c +block. It merges the referenced block \fB<name>\fP into the current one. If some +of the merged rules and configurations overlap with the previously defined ones, +conflicts are resolved in the usual way: the earliest rule takes priority, and +the latest configuration overrides the preceding ones. The exceptions are the special +rules \fB*\fP, \fB$\fP and (in condition mode) \fB<!>\fP, for which a block\-local +definition overrides any inherited ones. Use directive allows one to combine +different re2c blocks together in one block (see the example below). +.sp +Named blocks and in\-block use directive were added in re2c version 2.2. +Since that version reusable blocks are allowed by default (no special option +is needed). Before version 2.2 reuse mode was enabled with \fB\-r \-\-reusable\fP +option. Before version 1.2 reusable blocks could not be mixed with normal +blocks. +.SS Example of a \fB!use\fP directive +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +\-\- This example shows how to combine reusable re2c blocks: two blocks +\-\- (\(aqcolors\(aq and \(aqfish\(aq) are merged into one. The \(aqsalmon\(aq rule occurs +\-\- in both blocks; the \(aqfish\(aq block takes priority because it is used +\-\- earlier. Default rule * occurs in all three blocks; the local (not +\-\- inherited) definition takes priority.
+ +import Control.Monad (when) +import Data.ByteString (ByteString, index) + +data Answer = Color | Fish | Dunno deriving (Eq) + +data State = State { + _yyinput :: ByteString, + _yycursor :: Int, + _yymarker :: Int +} + +%{rules:colors + * { error \(dqah\(dq } + \(dqred\(dq | \(dqsalmon\(dq | \(dqmagenta\(dq { Color } +%} + +%{rules:fish + * { error \(dqoh\(dq } + \(dqhaddock\(dq | \(dqsalmon\(dq | \(dqeel\(dq { Fish } +%} + +%{ + re2c:define:YYFN = [\(dqlexer;Answer\(dq, \(dqState{..};State\(dq]; + re2c:yyfill:enable = 0; + + !use:fish; + !use:colors; + * { Dunno } // overrides inherited \(aq*\(aq rules +%} + +main :: IO () +main = do + let test str ans = do + let st = State {_yyinput = str, _yycursor = 0, _yymarker = 0} + when (lexer st /= ans) $ error \(dqfailed\(dq + + test \(dqsalmon\(dq Fish + test \(dqwhat?\(dq Dunno + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Example of a \fB/*!use:re2c ... */\fP block +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT \-\-input\-encoding utf8 +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +\-\- This example supports multiple input encodings: UTF\-8 and UTF\-32. +\-\- Both lexers are generated from the same rules block, and the use +\-\- blocks add only encoding\-specific configurations. 
+ +import Control.Monad (when) +import Data.Array +import Data.Word + +data State a = State { + _yyinput :: a, + _yycursor :: Int, + _yymarker :: Int +} + +%{rules + re2c:yyfill:enable = 0; + re2c:define:YYPEEK = \(dq(!)\(dq; + + \(dq∀x ∃y\(dq { Just _yycursor } + * { Nothing } +%} + +%{use + re2c:define:YYFN = [\(dqlex8;Maybe Int\(dq, \(dqState{..};State (Array Int Word8)\(dq]; + re2c:encoding:utf8 = 1; + re2c:define:YYCTYPE = Word8; +%} + +%{use + re2c:define:YYFN = [\(dqlex32;Maybe Int\(dq, \(dqState{..};State (Array Int Int)\(dq]; + re2c:encoding:utf32 = 1; + re2c:define:YYCTYPE = Int; +%} + +main :: IO () +main = do + let make_st l = State { + _yyinput = listArray (0, length l \- 1) l, + _yycursor = 0, + _yymarker = 0} + + let s8 = [0xe2, 0x88, 0x80, 0x78, 0x20, 0xe2, 0x88, 0x83, 0x79] + when (lex8 (make_st s8) /= Just (length s8)) $ error \(dqlex8 failed\(dq + + let s32 = [0x2200, 0x78, 0x20, 0x2203, 0x79] + when (lex32 (make_st s32) /= Just (length s32)) $ error \(dqlex32 failed\(dq + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SUBMATCH EXTRACTION +.sp +re2c has two options for submatch extraction. +.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration. Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string. +\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars:re2c\fP and \fBmvars:re2c\fP directives.
+If there is more than one way tags can be matched against the input, +ambiguity is resolved using leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration. There are two +flavours for different disambiguation policies: \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy. In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted. +The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&. re2c provides a directive \fBmaxnmatch:re2c\fP +that defines \fBYYMAXNMATCH\fP, a constant equal to the maximum value of +\fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration. The only difference with \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2c generates variables \fByytl<k>\fP and +\fByytr<k>\fP for \fIk\fP\-th capturing group (the user should declare these with +\fBsvars:re2c\fP directive). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars = 1\fP for POSIX longest\-match policy.
+.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there\(aqre no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags:re2c\fP or \fBmtags:re2c\fP directives. +If the lexer state is stored, tag variables should be part of it. They also +need to be updated by \fBYYFILL\fP\&. +.sp +S\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +save input position to an s\-tag: \fBt = YYCURSOR\fP with C pointer API or a +user\-defined operation \fBYYSTAGP(t)\fP with generic API +.IP \(bu 2 +save default value to an s\-tag: \fBt = NULL\fP with C pointer API or a +user\-defined operation \fBYYSTAGN(t)\fP with generic API +.IP \(bu 2 +copy one s\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +M\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +append input position to an m\-tag: a user\-defined operation \fBYYMTAGP(t)\fP +with both default and generic API +.IP \(bu 2 +append default value to an m\-tag: a user\-defined operation \fBYYMTAGN(t)\fP +with both default and generic API +.IP \(bu 2 +copy one m\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +S\-tags can be implemented as scalar values (pointers or offsets). M\-tags need a +more complex representation, as they need to store a sequence of tag values. 
The +most naive and inefficient representation of an m\-tag is a list (array, vector) +of tag values; a more efficient representation is to store all m\-tags in a +prefix\-tree represented as array of nodes \fB(v, p)\fP, where \fBv\fP is tag value +and \fBp\fP is a pointer to parent node. +.sp +Here is a simple example of using s\-tags to parse semantic versions consisting +of three numeric components: major, minor, patch (the latter is optional). +See below for a more complex example that uses \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import Data.ByteString (ByteString, index) + +data State = State { + \-\- Final tag bindings available in semantic action. + %{svars format = \(dq\en@@ :: !Int,\(dq; %} + \-\- Intermediate tag bindings used by the lexer (must be autogenerated). + %{stags format = \(dq\en@@ :: !Int,\(dq; %} + _yyinput :: !ByteString, + _yycursor :: !Int, + _yymarker :: !Int +} + +data SemVer = SemVer { + major :: !Int, + minor :: !Int, + patch :: !Int +} deriving (Eq) + +s2n :: ByteString \-> Int \-> Int \-> Int +s2n s i j = f i 0 where + f k n = if k >= j then n else f (k + 1) (n * 10 + (fromIntegral (index s k) \- 48)) + +%{ + re2c:define:YYFN = [\(dqparse;Maybe SemVer\(dq, \(dqState{..};State\(dq]; + re2c:tags = 1; + re2c:yyfill:enable = 0; + + num = [0\-9]+; + + @_1 num @_2 \(dq.\(dq @_3 num @_4 (\(dq.\(dq @_5 num)? 
[\ex00] { + Just SemVer { + major = s2n _yyinput _1 _2, + minor = s2n _yyinput _3 _4, + patch = if _5 == (\-1) then 0 else s2n _yyinput _5 (_yycursor \- 1) + } + } + * { Nothing } +%} + +test :: ByteString \-> Maybe SemVer \-> IO () +test str expect = do + let s = State { + %{svars format = \(dq\en@@ = (\-1),\(dq; %} + %{stags format = \(dq\en@@ = (\-1),\(dq; %} + _yyinput = str, + _yycursor = 0, + _yymarker = 0 + } + when (parse s /= expect) $ error \(dqfailed!\(dq + +main :: IO () +main = do + test \(dq23.34\e0\(dq (Just SemVer {major = 23, minor = 34, patch = 0}) + test \(dq1.2.99999\e0\(dq (Just SemVer {major = 1, minor = 2, patch = 99999}) + test \(dq1.a\e0\(dq Nothing + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is a more complex example of using s\-tags with \fBYYFILL\fP to parse a file +with newline\-separated semantic versions. Tag variables are part of the lexer +state, and they are adjusted in \fBYYFILL\fP like other input positions. +Note that it is necessary for s\-tags because their values are invalidated after +shifting buffer contents. It may not be necessary in a custom implementation +where tag variables store offsets relative to the start of the input string +rather than the buffer, which may be the case with m\-tags. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad +import Data.ByteString as BS +import GHC.IO.Handle +import System.Directory +import System.IO + +chunk_size :: Int +chunk_size = 4096 + +data State = State { + _file :: !Handle, + _yyinput :: !BS.ByteString, + _yycursor :: !Int, + _yymarker :: !Int, + _yylimit :: !Int, + _token :: !Int, + \-\- Final tag bindings available in semantic action. + %{svars format = \(dq\en@@ :: !Int,\(dq; %} + \-\- Intermediate tag bindings used by the lexer (must be autogenerated). 
+ %{stags format = \(dq\en@@ :: !Int,\(dq; %} + _eof :: !Bool +} + +data SemVer = SemVer { + major :: !Int, + minor :: !Int, + patch :: !Int +} deriving (Eq, Show) + +s2n :: BS.ByteString \-> Int \-> Int \-> Int +s2n s i j = f i 0 where + f k n = if k >= j then n else f (k + 1) (n * 10 + (fromIntegral (BS.index s k) \- 48)) + +%{ + re2c:define:YYFN = [\(dqlexer;IO [SemVer]\(dq, \(dqState{..};State\(dq, \(dq_vers;[SemVer]\(dq]; + re2c:define:YYPEEK = \(dqBS.index\(dq; + re2c:define:YYFILL = \(dq(State{..}, yyfill) <\- fill State{..}\(dq; + re2c:eof = 0; + re2c:monadic = 1; + re2c:tags = 1; + + num = [0\-9]+; + + @_1 num @_2 \(dq.\(dq @_3 num @_4 (\(dq.\(dq @_5 num)? [\en] { + let ver = SemVer { + major = s2n _yyinput _1 _2, + minor = s2n _yyinput _3 _4, + patch = if _5 == (\-1) then 0 else s2n _yyinput _5 (_yycursor \- 1) + } + lexer State{..} (ver: _vers) + } + $ { return _vers } + * { error \(dqlexer failed\(dq } +%} + +fill :: State \-> IO (State, Bool) +fill State{..} = do + case _eof of + True \-> return (State{..}, False) + False \-> do + \-\- Discard everything up to the current token, cut off terminating null, + \-\- read new chunk from file and reappend terminating null at the end. + chunk <\- BS.hGet _file chunk_size + return (State{ + _yyinput = BS.concat [(BS.init . BS.drop _token) _yyinput, chunk, \(dq\e0\(dq], + _yycursor = _yycursor \- _token, + _yymarker = _yymarker \- _token, + _yylimit = _yylimit \- _token + BS.length chunk, \-\- exclude terminating null + _token = 0, + _eof = BS.null chunk, \-\- end of file? + ..}, True) + +main :: IO () +main = do + let fname = \(dqinput\(dq + + \-\- Prepare input file. + BS.writeFile fname $ BS.concat [\(dq1.22.333\en\(dq | _ <\- [1..chunk_size]] + let expect = [SemVer {major = 1, minor = 22, patch = 333} | _ <\- [1..chunk_size]] + + \-\- Run lexer on the prepared file. 
+ fh <\- openFile fname ReadMode + let st = State { + _file = fh, + _yyinput = BS.singleton 0, + _yycursor = 0, + _yymarker = 0, + _yylimit = 0, + _token = 0, + %{svars format = \(dq\en@@ = (\-1),\(dq; %} + %{stags format = \(dq\en@@ = (\-1),\(dq; %} + _eof = False + } + result <\- lexer st [] + hClose fh + + \-\- Cleanup. + removeFile fname + + \-\- Check result. + when (result /= expect) $ error $ \(dqexpected \(dq ++ show expect ++ \(dq, got \(dq ++ show result + return () + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using capturing groups to parse semantic versions. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import Data.ByteString (ByteString, index) +import Data.Word (Word8) + +none :: Int +none = \-1 + +data State = State { + \-\- Final tag bindings available in semantic action. + %{svars format = \(dq\en@@ :: !Int,\(dq; %} + \-\- Intermediate tag bindings used by the lexer (must be autogenerated). + %{stags format = \(dq\en@@ :: !Int,\(dq; %} + _yyinput :: !ByteString, + _yycursor :: !Int, + _yymarker :: !Int +} + +data SemVer = SemVer { + major :: !Int, + minor :: !Int, + patch :: !Int +} deriving (Eq) + +s2n :: ByteString \-> Int \-> Int \-> Int +s2n s i j = f i 0 where + f k n = if k >= j then n else f (k + 1) (n * 10 + (fromIntegral (index s k) \- 48)) + +%{ + re2c:define:YYFN = [\(dqparse;Maybe SemVer\(dq, \(dqState{..};State\(dq]; + re2c:define:YYCTYPE = \(dqWord8\(dq; + re2c:captvars = 1; + re2c:variable:yypmatch = _; + re2c:yyfill:enable = 0; + + num = [0\-9]+; + + (num) \(dq.\(dq (num) (\(dq.\(dq num)? 
[\ex00] { + Just SemVer { + major = s2n _yyinput _yytl1 _yytr1, + minor = s2n _yyinput _yytl2 _yytr2, + patch = if _yytl3 == none then 0 else s2n _yyinput (_yytl3 + 1) _yytr3 + } + } + * { Nothing } +%} + +test :: ByteString \-> Maybe SemVer \-> IO () +test str expect = do + let s = State { + %{svars format = \(dq\en@@ = none,\(dq; %} + %{stags format = \(dq\en@@ = none,\(dq; %} + _yyinput = str, + _yycursor = 0, + _yymarker = 0 + } + when (parse s /= expect) $ error \(dqfailed!\(dq + +main :: IO () +main = do + test \(dq23.34\e0\(dq (Just SemVer {major = 23, minor = 34, patch = 0}) + test \(dq1.2.99999\e0\(dq (Just SemVer {major = 1, minor = 2, patch = 99999}) + test \(dq1.a\e0\(dq Nothing + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using m\-tags to parse a version with a variable number of +components. Tag variables are stored in a trie. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import Data.ByteString (ByteString, index) + +data State = State { + \-\- Final tag bindings available in semantic action. + %{svars format = \(dq\en@@ :: !Int,\(dq; %} + %{mvars format = \(dq\en@@ :: ![Int],\(dq; %} + \-\- Intermediate tag bindings used by the lexer (must be autogenerated). 
+ %{stags format = \(dq\en@@ :: !Int,\(dq; %} + %{mtags format = \(dq\en@@ :: ![Int],\(dq; %} + _yyinput :: !ByteString, + _yycursor :: !Int, + _yymarker :: !Int +} + +s2n :: ByteString \-> Int \-> Int \-> Int +s2n s i j = f i 0 where + f k n = if k >= j then n else f (k + 1) (n * 10 + (fromIntegral (index s k) \- 48)) + +%{ + re2c:define:YYFN = [\(dqparse;Maybe [Int]\(dq, \(dqState{..};State\(dq]; + re2c:define:YYMTAGP = \(dqlet tag = _yycursor : @@{tag} in let @@{tag} = tag in\(dq; + re2c:define:YYMTAGN = \(dq\(dq; // alternatively could add \-1 to the list + re2c:tags = 1; + re2c:yyfill:enable = 0; + + num = [0\-9]+; + + @_1 num @_2 (\(dq.\(dq #_3 num #_4)* [\ex00] { + Just $ (s2n _yyinput _1 _2) : (reverse $ zipWith (\ei j \-> s2n _yyinput i j) _3 _4) + } + * { Nothing } +%} + +test :: ByteString \-> Maybe [Int] \-> IO () +test str expect = do + let st = State { + %{svars format = \(dq\en@@ = (\-1),\(dq; %} + %{stags format = \(dq\en@@ = (\-1),\(dq; %} + %{mvars format = \(dq\en@@ = [],\(dq; %} + %{mtags format = \(dq\en@@ = [],\(dq; %} + _yyinput = str, + _yycursor = 0, + _yymarker = 0 + } + when (parse st /= expect) $ error \(dqfailed!\(dq + +main :: IO () +main = do + test \(dq1\e0\(dq (Just [1]) + test \(dq1.2.3.4.5.6.7\e0\(dq (Just [1, 2, 3, 4, 5, 6, 7]) + test \(dq1.2.\e0\(dq Nothing + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH ENCODING SUPPORT +.sp +It is necessary to understand the difference between \fBcode points\fP and +\fBcode units\fP\&. A code point is a numeric identifier of a symbol. A code unit is +the smallest unit of storage in the encoded text. A single code point may be +represented with one or more code units. In a fixed\-length encoding all code +points are represented with the same number of code units. In a variable\-length +encoding code points may be represented with a different number of code units. 
+Note that the \(dqany\(dq rule \fB[^]\fP matches any code point, but not necessarily +any code unit (the only way to match any code unit regardless of the encoding +is the default rule \fB*\fP). +The generated lexer works with a stream of code units: \fByych\fP stores a code +unit, and \fBYYCTYPE\fP is the code unit type. Regular expressions, on the other +hand, are specified in terms of code points. When re2c compiles regular +expressions to automata it translates code points to code units. This is +generally not a simple mapping: in variable\-length encodings a single code point +range may get translated to a complex code unit graph. +The following encodings are supported: +.INDENT 0.0 +.IP \(bu 2 +\fBASCII\fP (enabled by default). It is a fixed\-length encoding with code space +\fB[0\-255]\fP and 1\-byte code points and code units. +.IP \(bu 2 +\fBEBCDIC\fP (enabled with \fB\-\-ebcdic\fP or \fBre2c:encoding:ebcdic\fP). It is a +fixed\-length encoding with code space \fB[0\-255]\fP and 1\-byte code points and +code units. +.IP \(bu 2 +\fBUCS2\fP (enabled with \fB\-\-ucs2\fP or \fBre2c:encoding:ucs2\fP). It is a +fixed\-length encoding with code space \fB[0\-0xFFFF]\fP and 2\-byte code points +and code units. +.IP \(bu 2 +\fBUTF8\fP (enabled with \fB\-\-utf8\fP or \fBre2c:encoding:utf8\fP). It is a +variable\-length Unicode encoding. Code unit size is 1 byte. Code points are +represented with 1 \-\- 4 code units. +.IP \(bu 2 +\fBUTF16\fP (enabled with \fB\-\-utf16\fP or \fBre2c:encoding:utf16\fP). It is a +variable\-length Unicode encoding. Code unit size is 2 bytes. Code points are +represented with 1 \-\- 2 code units. +.IP \(bu 2 +\fBUTF32\fP (enabled with \fB\-\-utf32\fP or \fBre2c:encoding:utf32\fP). It is a +fixed\-length Unicode encoding with code space \fB[0\-0x10FFFF]\fP and 4\-byte code +points and code units. +.UNINDENT +.sp +Include file \fBinclude/unicode_categories.re\fP provides re2c definitions for the +standard Unicode categories. 
+.sp +Option \fB\-\-input\-encoding\fP specifies source file encoding, which can be used to +enable Unicode literals in regular expressions. For example +\fB\-\-input\-encoding utf8\fP tells re2c that the source file is in UTF8 (it differs +from \fB\-\-utf8\fP which sets input text encoding). Option \fB\-\-encoding\-policy\fP +specifies the way re2c handles Unicode surrogates (code points in range +\fB[0xD800\-0xDFFF]\fP). +.sp +Below is an example of a lexer for UTF8 encoded Unicode identifiers. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT \-\-utf8 \-i +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import Data.ByteString (ByteString, index) + +%{include \(dqunicode_categories.re\(dq %} + +data State = State { + _yyinput :: ByteString, + _yycursor :: Int, + _yymarker :: Int, + _yyaccept :: Int +} + +%{ + re2c:define:YYFN = [\(dqlexer;Bool\(dq, \(dqState{..};State\(dq]; + re2c:yyfill:enable = 0; + + // Simplified \(dqUnicode Identifier and Pattern Syntax\(dq + // (see https://unicode.org/reports/tr31) + id_start = L | Nl | [$_]; + id_continue = id_start | Mn | Mc | Nd | Pc | [\eu200D\eu05F3]; + identifier = id_start id_continue*; + + identifier { True } + * { False } +%} + +main :: IO () +main = do + let st = State { + _yyinput = \(dq_Ыдентификатор\ex00\(dq, + _yycursor = 0, + _yymarker = 0, + _yyaccept = 0} + + when (not $ lexer st) $ error \(dqfailed\(dq + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH INCLUDE FILES +.sp +re2c allows one to include other files using directive \fB/*!include:re2c FILE */\fP +or \fB!include FILE ;\fP, where \fBFILE\fP is a path to the file to be included. +The first form should be used outside of re2c blocks, and the second form allows +one to include a file in the middle of a re2c block. re2c looks for included +files in the directory of the including file and in include locations, which +can be specified with \fB\-I\fP option. 
+Include directives in re2c work in the same way as C/C++ \fB#include\fP: the contents +of \fBFILE\fP are copy\-pasted verbatim in place of the directive. Include files +may have further includes of their own. Use \fB\-\-depfile\fP option to track build +dependencies of the output file on include files. +re2c provides some predefined include files that can be found in the +\fBinclude/\fP subdirectory of the project. These files contain definitions that +can be useful to other projects (such as Unicode categories) and form something +like a standard library for re2c. +Below is an example of using include directive. +.SS Include file 1 (definitions.hs) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +data Number = INum | FNum | NNaN deriving (Eq) + +%{ + number = [1\-9][0\-9]*; +%} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Include file 2 (extra_rules.re.inc) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// floating\-point numbers +frac = [0\-9]* \(dq.\(dq [0\-9]+ | [0\-9]+ \(dq.\(dq; +exp = \(aqe\(aq [+\-]? [0\-9]+; +float = frac exp? 
| [0\-9]+ exp; + +float { FNum } + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT \-i +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import Data.ByteString (ByteString, index) + +%{include \(dqdefinitions.hs\(dq %} + +data State = State { + _yyinput :: ByteString, + _yycursor :: Int, + _yymarker :: Int, + _yyaccept :: Int +} + +%{ + re2c:define:YYFN = [\(dqlexer;Number\(dq, \(dqState{..};State\(dq]; + re2c:yyfill:enable = 0; + + * { NNaN } + number { INum } + !include \(dqextra_rules.re.inc\(dq; +%} + +main :: IO () +main = do + let test s n = do + let st = State { + _yyinput = s, + _yycursor = 0, + _yymarker = 0, + _yyaccept = 0} + + when (lexer st /= n) $ error \(dqfailed\(dq + + test \(dq123\e0\(dq INum + test \(dq123.4567\e0\(dq FNum + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH HEADER FILES +.sp +re2c allows one to generate header file from the input \fB\&.re\fP file using option +\fB\-t\fP, \fB\-\-type\-header\fP or configuration \fBre2c:flags:type\-header\fP and +directives \fB/*!header:re2c:on*/\fP and \fB/*!header:re2c:off*/\fP\&. The first directive +marks the beginning of header file, and the second directive marks the end of +it. Everything between these directives is processed by re2c, and the generated +code is written to the file specified by the \fB\-t \-\-type\-header\fP option (or +\fBstdout\fP if this option was not used). Autogenerated header file may be needed +in cases when re2c is used to generate definitions of constants, variables and +structs that must be visible from other translation units. +.sp +Here is an example of generating a header file that contains definition of the +lexer state with tag variables (the number of variables depends on the regular +grammar and is unknown to the programmer).
+.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- re2hs $INPUT \-o $OUTPUT \-\-header lexer/state.hs \-i +{\-# OPTIONS_GHC \-Wno\-unused\-record\-wildcards #\-} +{\-# LANGUAGE OverloadedStrings #\-} + +import Control.Monad (when) +import Data.ByteString (index) +import State + +%{header:on %} +module State where + +import Data.ByteString (ByteString) + +data State = State { + _yyinput :: !ByteString, + _yycursor :: !Int, + %{stags format = \(dq\en@@{tag} :: !Int,\(dq; %} + _tag :: !Int +} +%{header:off %} + +%{ + re2c:define:YYFN = [\(dqlexer;Int\(dq, \(dqState{..};State\(dq]; + re2c:tags = 1; + re2c:yyfill:enable = 0; + re2c:header = \(dqlexer/state.hs\(dq; + + [a]* @_tag [b]* { _tag } +%} + +main :: IO () +main = do + let s = State { + _yyinput = \(dqab\e0\(dq, + _yycursor = 0, + %{stags format = \(dq\en@@{tag} = \-1,\(dq; %} + _tag = 0} + + when (lexer s /= 1) $ error \(dqfailed!\(dq + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Header file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +\-\- Generated by re2c +{\-# LANGUAGE RecordWildCards #\-} + +module State where + +import Data.ByteString (ByteString) + +data State = State { + _yyinput :: !ByteString, + _yycursor :: !Int, + +_yyt1 :: !Int, + _tag :: !Int +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SKELETON PROGRAMS +.sp +With the \fB\-S, \-\-skeleton\fP option, re2c ignores all non\-re2c code and generates +a self\-contained C program that can be further compiled and executed. The +program consists of lexer code and input data. For each constructed DFA (block +or condition) re2c generates a standalone lexer and two files: an \fB\&.input\fP +file with strings derived from the DFA and a \fB\&.keys\fP file with expected match +results. The program runs each lexer on the corresponding \fB\&.input\fP file and +compares results with the expectations. 
+Skeleton programs are very useful for a number of reasons: +.INDENT 0.0 +.IP \(bu 2 +They can check correctness of various re2c optimizations (the data is +generated early in the process, before any DFA transformations have taken +place). +.IP \(bu 2 +Generating a set of input data with good coverage may be useful for both +testing and benchmarking. +.IP \(bu 2 +Generating self\-contained executable programs allows one to get minimized test +cases (the original code may be large or have a lot of dependencies). +.UNINDENT +.sp +The difficulty with generating input data is that for all but the most trivial +cases the number of possible input strings is too large (even if the string +length is limited). re2c solves this difficulty by generating sufficiently +many strings to cover almost all DFA transitions. It uses the following +algorithm. First, it constructs a skeleton of the DFA. For encodings with 1\-byte +code unit size (such as ASCII, UTF\-8 and EBCDIC) skeleton is just an exact copy +of the original DFA. For encodings with multibyte code units skeleton is a copy +of DFA with certain transitions omitted: namely, re2c takes at most 256 code +units for each disjoint continuous range that corresponds to a DFA transition. +The chosen values are evenly distributed and include range bounds. Instead of +trying to cover all possible paths in the skeleton (which is infeasible) re2c +generates sufficiently many paths to cover all skeleton transitions, and thus +trigger the corresponding conditional jumps in the lexer. +The algorithm implementation is limited by ~1Gb of transitions and consumes +constant amount of memory (re2c writes data to file as soon as it is generated). +.SH VISUALIZATION AND DEBUG +.sp +With the \fB\-D, \-\-emit\-dot\fP option, re2c does not generate code. Instead, +it dumps the generated DFA in DOT format. +One can convert this dump to an image of the DFA using Graphviz or another library. 
+Note that this option shows the final DFA after it has gone through a number of +optimizations and transformations. Earlier stages can be dumped with various debug +options, such as \fB\-\-dump\-nfa\fP, \fB\-\-dump\-dfa\-raw\fP etc. (see the full list of options). +.SH SEE ALSO +.sp +You can find more information about re2c at the official website: \fI\%http://re2c.org\fP\&. +Similar programs are flex(1), lex(1), quex(\fI\%http://quex.sourceforge.net\fP). +.SH AUTHORS +.sp +re2c was originally written by Peter Bumbulis (\fI\%peter@csg.uwaterloo.ca\fP) in 1993. +Marcus Boerger and Dan Nuffer spent several years to turn the original idea into +a production ready code generator. Since then it has been maintained and +developed by multiple volunteers, most notably, +Brian Young (\fI\%bayoung@acm.org\fP), +\fI\%Marcus Boerger\fP, +Dan Nuffer (\fI\%nuffer@users.sourceforge.net\fP), +\fI\%Ulya Trofimovich\fP (\fI\%skvadrik@gmail.com\fP), +\fI\%Serghei Iakovlev\fP, +\fI\%Sergei Trofimovich\fP, +\fI\%Petr Skocik\fP, +\fI\%ligfx\fP +and \fI\%raekye\fP\&. +.\" Generated by docutils manpage writer. +. diff --git a/bootstrap/doc/re2java.1 b/bootstrap/doc/re2java.1 new file mode 100644 index 000000000..93cdfccd8 --- /dev/null +++ b/bootstrap/doc/re2java.1 @@ -0,0 +1,3556 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. 
+.TH "RE2C" 1 "" "" +.SH NAME +re2c \- generate fast lexical analyzers for C/C++, Go and Rust +.SH SYNOPSIS +.sp +Note: This manual is for Java, but it refers to re2c as the general program. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +re2c [ OPTIONS ] [ WARNINGS ] INPUT +re2go [ OPTIONS ] [ WARNINGS ] INPUT +re2rust [ OPTIONS ] [ WARNINGS ] INPUT +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Input can be either a file or \fB\-\fP for stdin. +.SH INTRODUCTION +.sp +re2c works as a preprocessor. It reads the input file (which is usually a +program in the target language, but can be anything) and looks for blocks of +code enclosed in special\-form comments. The text outside of these blocks is +copied verbatim into the output file. The contents of the blocks are processed +by re2c. It translates them to code in the target language and outputs the +generated code in place of the block. +.sp +Here is an example of a small program that checks if a given string contains a +decimal number: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +class Main { + static boolean lex(String yyinput) { + int yycursor = 0; + + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + re2c:yyfill:enable = 0; + + number = [1\-9][0\-9]*; + + number { return true; } + * { return false; } + */ + } + + public static void main(String []args) { + assert lex(\(dq1234\e0\(dq); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +In the output everything between \fB/*!re2c\fP and \fB*/\fP has been replaced with +the generated code: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// Generated by re2java +// re2java $INPUT \-o $OUTPUT + +class Main { + static boolean lex(String yyinput) { + int yycursor = 0; + + +{ + char yych = 0; + int yystate = 0; + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyinput.charAt(yycursor); + yycursor += 1; + switch (yych) { + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + 
case 0x36: + case 0x37: + case 0x38: + case 0x39: + yystate = 2; + continue yyl; + default: + yystate = 1; + continue yyl; + } + case 1: + { return false; } + case 2: + yych = yyinput.charAt(yycursor); + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 2; + continue yyl; + default: + yystate = 3; + continue yyl; + } + case 3: + { return true; } + default: + throw new IllegalStateException(\(dqinternal lexer error\(dq); + } + } +} + + } + + public static void main(String []args) { + assert lex(\(dq1234\e0\(dq); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SYNTAX +.sp +A re2c program consists of a sequence of \fIblocks\fP intermixed with code in the +target language. There are three main kinds of blocks: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB/*!re2c[:<name>] ... */\fP +A \fIglobal block\fP contains definitions, configurations, directives and rules. +re2c compiles regular expressions associated with each rule into a +deterministic finite automaton, encodes it in the form of conditional jumps +in the target language and replaces the block with the generated code. Names +and configurations defined in a global block are added to the global scope +and become visible to subsequent blocks. At the start of the program the +global scope is initialized with command\-line \fI\%options\fP\&. +The \fB:<name>\fP part is optional: if specified, the name can be used to +refer to the block in another part of the program. +.TP +.B \fB/*!local:re2c[:<name>] ... */\fP +A \fIlocal block\fP is like a global block, but the names and configurations in +it have local scope (they do not affect other blocks). +.TP +.B \fB/*!rules:re2c[:<name>] ... */\fP +A \fIrules block\fP is like a local block, but it does not generate any code and +is meant to be reused in other blocks. This is a way of sharing code +(more details in the \fI\%reusable blocks\fP section).
+.UNINDENT +.UNINDENT +.UNINDENT +.sp +There are also many auxiliary blocks; see section \fI\%blocks and directives\fP for a +full list of them. A block may contain the following kinds of statements: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB<name> = <regular expression>;\fP +A \fIdefinition\fP binds a name to a regular expression. Names may contain +alphanumeric characters and underscore. The \fI\%regular expressions\fP section +gives an overview of re2c syntax for regular expressions. Once defined, the +name can be used in other regular expressions and in rules. Recursion in +named definitions is not allowed, and each name should be defined before it +is used. A block inherits named definitions from the global scope. +Redefining a name that exists in the current scope is an error. +.TP +.B \fB<configuration> = <value>;\fP +A \fIconfiguration\fP allows one to change re2c behavior and customize the +generated code. For a full list of configurations supported by re2c see the +\fI\%configurations\fP section. Depending on a particular configuration, the +value can be a keyword, a nonnegative integer number or a one\-line string +which should be enclosed in double or single quotes unless it consists of +alphanumeric characters. A block inherits configurations from the global +scope and may redefine them or add new ones. Configurations defined inside +of a block affect the whole block, even if they appear at the end of it. +.TP +.B \fB<regular expression> { <code> }\fP +A \fIrule\fP binds a regular expression to a semantic action (a block of code in +the target language). If the regular expression matches, the associated +semantic action is executed. If multiple rules match, the longest match +takes precedence. If multiple rules match the same string, the earliest one +takes precedence. There are two special rules: the default rule \fB*\fP and +the end of input rule \fB$\fP\&.
The default rule should always be defined, it +has the lowest priority regardless of its place in the block, and it matches +any code unit (not necessarily a valid character, see the +\fI\%encoding support\fP section). The end of input rule should be defined if the +corresponding method for \fI\%handling the end of input\fP is used. If +\fI\%start conditions\fP are used, rules have more complex syntax. +.TP +.B \fB!<directive>;\fP +A \fIdirective\fP is one of the special predefined statements. Each directive +has a unique purpose. For example, the \fB!use\fP directive merges a rules +block into the current one (see the \fI\%reusable blocks\fP section), and the +\fB!include\fP directive allows one to include an outer file (see the +\fI\%include files\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.SH PROGRAM INTERFACE (API) +.sp +The generated code interfaces with the outer program with the help of +\fIprimitives\fP, collectively referred to as the \fIAPI\fP\&. +Which primitives should be defined for a particular program depends on multiple +factors, including the complexity of regular expressions, input representation, +buffering and the use of various features. All the necessary primitives should +be defined by the user in the form of macros, functions, variables or any other +suitable form that makes the generated code syntactically and semantically +correct. re2c does not (and cannot) check the definitions, so if anything is +missing or defined incorrectly, the generated program may have compile\-time or +run\-time errors. +This manual provides examples of API definitions in the most common cases. +.sp +re2java has three API flavors that define the core set of primitives used by a +program: +.INDENT 0.0 +.TP +.B \fBSimple API\fP +This is the default API for the Java backend. It consists of the following +primitives: \fBYYINPUT\fP (which should be defined as a sequence of code +units, e.g.
a string) and \fBYYCURSOR\fP, \fBYYMARKER\fP, \fBYYCTXMARKER\fP, +\fBYYLIMIT\fP (which should be defined as indices in \fBYYINPUT\fP). +.nf + +.fi +.sp +.TP +.B \fBRecord API\fP +Record API is useful in cases when lexer state must be stored in a class. +It is enabled with \fB\-\-api record\fP option or \fBre2c:api = record\fP +configuration. This API consists of a variable \fByyrecord\fP (the +name can be overridden with \fBre2c:variable:yyrecord\fP) that should be +defined as a class with fields \fByyinput\fP, \fByycursor\fP, \fByymarker\fP, +\fByyctxmarker\fP, \fByylimit\fP (only the fields used by the generated code +need to be defined, and their names can be configured). +.nf + +.fi +.sp +.TP +.B \fBGeneric API\fP +This is the most flexible API. It is enabled with \fB\-\-api generic\fP option +or \fBre2c:api = generic\fP configuration. +It contains primitives for generic operations: +\fBYYPEEK\fP, +\fBYYSKIP\fP, +\fBYYBACKUP\fP, +\fBYYBACKUPCTX\fP, +\fBYYSTAGP\fP, +\fBYYSTAGN\fP, +\fBYYMTAGP\fP, +\fBYYMTAGN\fP, +\fBYYRESTORE\fP, +\fBYYRESTORECTX\fP, +\fBYYRESTORETAG\fP, +\fBYYSHIFT\fP, +\fBYYSHIFTSTAG\fP, +\fBYYSHIFTMTAG\fP, +\fBYYLESSTHAN\fP\&. +.UNINDENT +.sp +Here is a full list of API primitives that may be used by the generated code in +order to interface with the outer program. +.INDENT 0.0 +.TP +.B \fBYYCTYPE\fP +The type of the input characters (code units). +For ASCII, EBCDIC and UTF\-8 encodings it should be 1\-byte unsigned integer. +For UTF\-16 or UCS\-2 it should be 2\-byte unsigned integer. For UTF\-32 it +should be 4\-byte unsigned integer. +.TP +.B \fBYYCURSOR\fP +A pointer\-like l\-value that stores the current input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYCURSOR\fP should point to the +first input character. It is advanced by the generated code. +When a rule matches, \fBYYCURSOR\fP points to the position after the +last matched character. It is used only in C pointer API. 
+.TP +.B \fBYYLIMIT\fP +A pointer\-like r\-value that stores the end of input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYLIMIT\fP should point to the +position after the last available input character. It is not changed by the +generated code. The lexer compares \fBYYCURSOR\fP to \fBYYLIMIT\fP +in order to determine if there are enough input characters left. +\fBYYLIMIT\fP is used only in C pointer API. +.TP +.B \fBYYMARKER\fP +A pointer\-like l\-value (usually a pointer of type \fBYYCTYPE*\fP) +that stores the position of the latest matched rule. It is used to +restore the \fBYYCURSOR\fP position if the longer match fails and +the lexer needs to rollback. Initialization is not +needed. \fBYYMARKER\fP is used only in C pointer API. +.TP +.B \fBYYCTXMARKER\fP +A pointer\-like l\-value that stores the position of the trailing context +(usually a pointer of type \fBYYCTYPE*\fP). No initialization is needed. +It is used only in C pointer API, and only with the lookahead operator +\fB/\fP\&. +.TP +.B \fBYYFILL\fP +A generic API primitive with one argument \fBlen\fP\&. +\fBYYFILL\fP should provide at least \fBlen\fP more input characters or fail. +If \fBre2c:eof\fP is used, then \fBlen\fP is always \fB1\fP and \fBYYFILL\fP should +always return to the calling function; zero return value indicates success. +If \fBre2c:eof\fP is not used, then \fBYYFILL\fP return value is ignored and it +should not return on failure. The maximum value of \fBlen\fP is \fBYYMAXFILL\fP\&. +The definition of \fBYYFILL\fP can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYFILL:naked\fP). +.TP +.B \fBYYMAXFILL\fP +An integral constant equal to the maximum value of the argument to +\fBYYFILL\fP\&. It can be generated with \fB/*!max:re2c*/\fP directive. +.TP +.B \fBYYLESSTHAN\fP +A generic API primitive with one argument \fBlen\fP\&. 
+It should be defined as an r\-value of boolean type that equals \fBtrue\fP if +and only if there are less than \fBlen\fP input characters left. +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYPEEK\fP +A generic API primitive with no arguments. +It should be defined as an r\-value of type \fBYYCTYPE\fP that is equal to the +character at the current input position. The definition can be either +function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP). +.TP +.B \fBYYSKIP\fP +A generic API primitive with no arguments. +\fBYYSKIP\fP should advance the current input position by one +character. The definition can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUP\fP +A generic API primitive with no arguments. +\fBYYBACKUP\fP should save the current input position, which is +later restored with \fBYYRESTORE\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORE\fP +A generic API primitive with no arguments. +\fBYYRESTORE\fP should restore the current input position to the +value saved by \fBYYBACKUP\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUPCTX\fP +A generic API primitive with zero arguments. +\fBYYBACKUPCTX\fP should save the current input position as the +position of the trailing context, which is later restored by +\fBYYRESTORECTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORECTX\fP +A generic API primitive with no arguments. +\fBYYRESTORECTX\fP should restore the trailing context position +saved with \fBYYBACKUPCTX\fP\&. 
+The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORETAG\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYRESTORETAG\fP should restore the trailing context position +to the value of \fBtag\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSTAGP\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGP\fP should set \fBtag\fP to the current input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSTAGN\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGN\fP should set \fBtag\fP to a value that represents non\-existent +input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGP\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGP\fP should append the current position to the submatch history of +\fBtag\fP (see the submatch extraction section for details.) +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGN\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGN\fP should append a value that represents non\-existent input +position to the submatch history of \fBtag\fP (see the submatch +extraction section for details.) +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFT\fP +A generic API primitive with one argument \fBshift\fP\&.
+\fBYYSHIFT\fP should shift the current input position by +\fBshift\fP characters (the shift value may be negative). The definition +can be either function\-like or free\-form depending on the API style +(see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTSTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTSTAG\fP should shift \fBtag\fP by \fBshift\fP characters +(the shift value may be negative). +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTMTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTMTAG\fP should shift the latest value in the history +of \fBtag\fP by \fBshift\fP characters (the shift value may be negative). +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMAXNMATCH\fP +An integral constant equal to the maximal number of POSIX capturing groups +in a rule. It is generated with \fB/*!maxnmatch:re2c*/\fP directive. +.TP +.B \fBYYCONDTYPE\fP +The type of the condition enum. +It should be generated either with the \fB/*!types:re2c*/\fP +directive or the \fB\-t\fP \fB\-\-type\-header\fP option. +.TP +.B \fBYYGETCONDITION\fP +An API primitive with zero arguments. +It should be defined as an r\-value of type \fBYYCONDTYPE\fP that is equal to +the current condition identifier. The definition can be either function\-like +or free\-form depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYGETCONDITION:naked\fP). +.TP +.B \fBYYSETCONDITION\fP +An API primitive with one argument \fBcond\fP\&. +The meaning of \fBYYSETCONDITION\fP is to set the current condition +identifier to \fBcond\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETCONDITION@cond\fP). 
+.TP +.B \fBYYGETSTATE\fP +An API primitive with zero arguments. +It should be defined as an r\-value of integer type that is equal to the +current lexer state. Should be initialized to \fB\-1\fP\&. The definition can be +either function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP and \fBre2c:define:YYGETSTATE:naked\fP). +.TP +.B \fBYYSETSTATE\fP +An API primitive with one argument \fBstate\fP\&. +The meaning of \fBYYSETSTATE\fP is to set the current lexer state to +\fBstate\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETSTATE@state\fP). +.TP +.B \fBYYDEBUG\fP +A debug API primitive with two arguments. It can be used to debug the +generated code (with \fB\-d\fP \fB\-\-debug\-output\fP option). \fBYYDEBUG\fP should +return no value and accept two arguments: \fBstate\fP (either a DFA state +index or \fB\-1\fP) and \fBsymbol\fP (the current input symbol). +.TP +.B \fByych\fP +An l\-value of type \fBYYCTYPE\fP that stores the current input character. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByyaccept\fP +An l\-value of unsigned integral type that stores the number of the latest +matched rule. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByynmatch\fP +An l\-value of unsigned integral type that stores the number of POSIX +capturing groups in the matched rule. +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.TP +.B \fByypmatch\fP +An array of l\-values that are used to hold the tag values corresponding +to the capturing parentheses in the matching rule. Array length must be +at least \fByynmatch * 2\fP (usually \fBYYMAXNMATCH * 2\fP is a good choice). +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. 
+.UNINDENT +.SH OPTIONS +.sp +Some of the options have corresponding \fI\%configurations\fP, +others are global and cannot be changed after re2c starts reading the input file. +Debug options generally require building re2c in debug configuration. +Internal options are useful for experimenting with the algorithms used in re2c. +.INDENT 0.0 +.TP +.B \fB\-? \-\-help \-h\fP +Show help message. +.TP +.B \fB\-\-api \-\-input \fP +Specify the API used by the generated code to interface with user\-defined +code: \fBdefault\fP is the API based on pointer arithmetic (the default for +C), and \fBcustom\fP is the generic API (the default for Go and Rust). +.TP +.B \fB\-\-bit\-vectors \-b\fP +Optimize conditional jumps using bit masks. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-case\-insensitive\fP +Treat single\-quoted and double\-quoted strings as case\-insensitive. +.TP +.B \fB\-\-case\-inverted\fP +Invert the meaning of single\-quoted and double\-quoted strings: +treat single\-quoted strings as case\-sensitive and double\-quoted strings +as case\-insensitive. +.TP +.B \fB\-\-case\-ranges\fP +Collapse consecutive cases in a switch statement into a range of the form +\fBlow ... high\fP\&. This syntax is a C/C++ language extension that is +supported by compilers like GCC, Clang and Tcc. The main advantage over +using single cases is smaller generated code and faster generation time, +although for some compilers like Tcc it also results in smaller binary size. +This option is supported only for C. +.TP +.B \fB\-\-computed\-gotos \-g\fP +Optimize conditional jumps using non\-standard \(dqcomputed goto\(dq extension +(which must be supported by the compiler). re2c generates jump tables +only in complex cases with a lot of conditional branches. Complexity +threshold can be configured with \fBcgoto:threshold\fP configuration. This +option implies \fB\-\-bit\-vectors\fP\&. It is supported only for C. 
+.TP +.B \fB\-\-conditions \-\-start\-conditions \-c\fP +Enable support of Flex\-like \(dqconditions\(dq: multiple interrelated lexers +within one block. This is an alternative to manually specifying different +re2c blocks connected with \fBgoto\fP or function calls. +.TP +.B \fB\-\-depfile FILE\fP +Write dependency information to \fBFILE\fP in the form of a Makefile rule +\fB : [include\-file ...]\fP\&. This allows one to +track build dependencies in the presence of \fBinclude:re2c\fP directives, +so that updating include files triggers regeneration of the output file. +This option depends on the \fB\-\-output\fP option. +.TP +.B \fB\-\-ebcdic \-\-ecb \-e\fP +Generate a lexer that reads input in EBCDIC encoding. re2c assumes that the +character range is 0 \-\- 0xFF and character size is 1 byte. +.TP +.B \fB\-\-empty\-class \fP +Define the way re2c treats empty character classes. With \fBmatch\-empty\fP +(the default) empty class matches empty input (which is illogical, but +backwards\-compatible). With \fBmatch\-none\fP empty class always fails to match. +With \fBerror\fP empty class raises a compilation error. +.TP +.B \fB\-\-encoding\-policy \fP +Define the way re2c treats Unicode surrogates. +With \fBfail\fP re2c aborts with an error when a surrogate is encountered. +With \fBsubstitute\fP re2c silently replaces surrogates with the error code +point 0xFFFD. With \fBignore\fP (the default) re2c treats surrogates as +normal code points. The Unicode standard says that standalone surrogates +are invalid, but real\-world libraries and programs behave in different ways. +.TP +.B \fB\-\-flex\-syntax \-F\fP +Partial support for Flex syntax: in this mode named definitions don\(aqt need +the equal sign and the terminating semicolon, and when used they must be +surrounded with curly braces. Names without curly braces are treated as +double\-quoted strings. +.TP +.B \fB\-\-header \-\-type\-header \-t HEADER\fP +Generate a \fBHEADER\fP file. 
The contents of the file can be specified with +directives \fBheader:re2c:on\fP and \fBheader:re2c:off\fP\&. +If conditions are used the header will have a condition enum automatically +appended to it (unless there is an explicit \fBconditions:re2c\fP directive). +.TP +.B \fB\-I PATH\fP +Add \fBPATH\fP to the list of locations which are used when searching for +include files. This option is useful in combination with \fBinclude:re2c\fP +directive. re2c looks for \fBFILE\fP in the directory of the parent file and +in the include locations specified with \fB\-I\fP option. +.TP +.B \fB\-\-input\-encoding \fP +Specify the way re2c parses regular expressions. +With \fBascii\fP (the default) re2c handles input as ASCII\-encoded: any +sequence of code units is a sequence of standalone 1\-byte characters. +With \fButf8\fP re2c handles input as UTF8\-encoded and recognizes multibyte +characters. +.TP +.B \fB\-\-invert\-captures\fP +Invert the meaning of capturing and non\-capturing groups. By default +\fB(...)\fP is capturing and \fB(! ...)\fP is non\-capturing. With this option +\fB(! ...)\fP is capturing and \fB(...)\fP is non\-capturing. +.TP +.B \fB\-\-lang \fP +Specify the output language. Supported languages are C, Go and Rust. +The default is C for re2c, Go for re2go and Rust for re2rust. +.TP +.B \fB\-\-leftmost\-captures\fP +Enable submatch extraction with leftmost greedy capturing groups. +.TP +.B \fB\-\-location\-format \fP +Specify location format in messages. +With \fBgnu\fP locations are printed as \(aqfilename:line:column: ...\(aq. +With \fBmsvc\fP locations are printed as \(aqfilename(line,column) ...\(aq. +The default is \fBgnu\fP\&. +.TP +.B \fB\-\-loop\-switch\fP +Encode DFA in a form of a loop over a switch statement. Individual states +are switch cases. The current state is stored in a variable \fByystate\fP\&. +Transitions between states update \fByystate\fP to the case label of the +destination state and \fBcontinue\fP to the head of the loop. 
This option is +always enabled for Rust, as it has no \fBgoto\fP statement and cannot use the +goto/label approach which is the default for C and Go backends. +.TP +.B \fB\-\-nested\-ifs \-s\fP +Use nested \fBif\fP statements instead of \fBswitch\fP statements in conditional +jumps. This usually results in more efficient code with non\-optimizing +compilers. +.TP +.B \fB\-\-no\-debug\-info \-i\fP +Do not output line directives. This may be useful when the generated code is +stored in a version control system (to avoid huge autogenerated diffs on +small changes). This option is on by default for Rust, as it does not have +line directives. +.TP +.B \fB\-\-no\-generation\-date\fP +Suppress date output in the generated file. +.TP +.B \fB\-\-no\-version\fP +Suppress version output in the generated file. +.TP +.B \fB\-\-no\-unsafe\fP +Do not generate \fBunsafe\fP wrapper over \fBYYPEEK\fP (this option is specific +to Rust). For performance reasons \fBYYPEEK\fP should avoid bounds\-checking, +as the lexer already performs end\-of\-input checks in a more efficient way. +The user may choose to provide a safe \fBYYPEEK\fP definition, or a definition +that is unsafe only in release builds, in which case the \fB\-\-no\-unsafe\fP +option helps to avoid warnings about redundant \fBunsafe\fP blocks. +.TP +.B \fB\-\-output \-o OUTPUT\fP +Specify the \fBOUTPUT\fP file. +.TP +.B \fB\-\-posix\-captures \-P\fP +Enable submatch extraction with POSIX\-style capturing groups. +.TP +.B \fB\-\-reusable \-r\fP +Deprecated since version 2.2 (reusable blocks are allowed by default now). +.TP +.B \fB\-\-skeleton \-S\fP +Ignore user\-defined interface code and generate a self\-contained \(dqskeleton\(dq +program. Additionally, generate input files with strings derived from the +regular grammar and compressed match results that are used to verify +\(dqskeleton\(dq behavior on all inputs. This option is useful for finding bugs +in optimizations and code generation. 
This option is supported only for C. +.TP +.B \fB\-\-storable\-state \-f\fP +Generate a lexer which can store its inner state. +This is useful in push\-model lexers which are stopped by an outer program +when there is not enough input, and then resumed when more input becomes +available. In this mode users should additionally define \fBYYGETSTATE\fP +and \fBYYSETSTATE\fP primitives, and variables \fByych\fP, \fByyaccept\fP and +\fBstate\fP should be part of the stored lexer state. +.TP +.B \fB\-\-tags \-T\fP +Enable submatch extraction with tags. +.TP +.B \fB\-\-ucs2 \-\-wide\-chars \-w\fP +Generate a lexer that reads UCS2\-encoded input. re2c assumes that the +character range is 0 \-\- 0xFFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf8 \-\-utf\-8 \-8\fP +Generate a lexer that reads input in UTF\-8 encoding. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 1 byte. +.TP +.B \fB\-\-utf16 \-\-utf\-16 \-x\fP +Generate a lexer that reads UTF16\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf32 \-\-unicode \-u\fP +Generate a lexer that reads UTF32\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 4 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-verbose\fP +Output a short message in case of success. +.TP +.B \fB\-\-vernum \-V\fP +Show version information in \fBMMmmpp\fP format (major, minor, patch). +.TP +.B \fB\-\-version \-v\fP +Show version information. +.TP +.B \fB\-\-single\-pass \-1\fP +Deprecated. Does nothing (single pass is the default now). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-debug\-output \-d\fP +Emit \fBYYDEBUG\fP invocations in the generated code. This is useful to trace +lexer execution. +.TP +.B \fB\-\-dump\-adfa\fP +Debug option: output DFA after tunneling (in .dot format). 
+.TP +.B \fB\-\-dump\-cfg\fP +Debug option: output control flow graph of tag variables (in .dot format). +.TP +.B \fB\-\-dump\-closure\-stats\fP +Debug option: output statistics on the number of states in closure. +.TP +.B \fB\-\-dump\-dfa\-det\fP +Debug option: output DFA immediately after determinization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-min\fP +Debug option: output DFA after minimization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tagopt\fP +Debug option: output DFA after tag optimizations (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tree\fP +Debug option: output DFA under construction with states represented as tag +history trees (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-raw\fP +Debug option: output DFA under construction with expanded state\-sets +(in .dot format). +.TP +.B \fB\-\-dump\-interf\fP +Debug option: output interference table produced by liveness analysis of tag +variables. +.TP +.B \fB\-\-dump\-nfa\fP +Debug option: output NFA (in .dot format). +.TP +.B \fB\-\-emit\-dot \-D\fP +Instead of normal output generate lexer graph in .dot format. +The output can be converted to an image with the help of Graphviz +(e.g. something like \fBdot \-Tpng \-odfa.png dfa.dot\fP). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-dfa\-minimization \fP +Internal option: DFA minimization algorithm used by re2c. The \fBmoore\fP +option is the Moore algorithm (it is the default). The \fBtable\fP option is +the \(dqtable filling\(dq algorithm. Both algorithms should produce the same DFA +up to states relabeling; table filling is simpler and much slower and serves +as a reference implementation. +.TP +.B \fB\-\-eager\-skip\fP +Internal option: make the generated lexer advance the input position +eagerly \-\- immediately after reading the input symbol. This changes the +default behavior when the input position is advanced lazily \-\- after +transition to the next state. +.TP +.B \fB\-\-no\-lookahead\fP +Internal option, deprecated. 
+It used to enable TDFA(0) algorithm. Unlike TDFA(1), TDFA(0) algorithm does +not use one\-symbol lookahead. It applies register operations to the incoming +transitions rather than the outgoing ones. Benchmarks showed that TDFA(0) +algorithm is less efficient than TDFA(1). +.TP +.B \fB\-\-no\-optimize\-tags\fP +Internal option: suppress optimization of tag variables (useful for +debugging). +.TP +.B \fB\-\-posix\-closure \fP +Internal option: specify shortest\-path algorithm used for the construction of +epsilon\-closure with POSIX disambiguation semantics: \fBgor1\fP (the default) +stands for Goldberg\-Radzik algorithm, and \fBgtop\fP stands for \(dqglobal +topological order\(dq algorithm. +.TP +.B \fB\-\-posix\-prectable \fP +Internal option: specify the algorithm used to compute POSIX precedence +table. The \fBcomplex\fP algorithm computes precedence table in one traversal +of tag history tree and has quadratic complexity in the number of TNFA +states; it is the default. The \fBnaive\fP algorithm has worst\-case cubic +complexity in the number of TNFA states, but it is much simpler than +\fBcomplex\fP and may be slightly faster in non\-pathological cases. +.TP +.B \fB\-\-stadfa\fP +Internal option, deprecated. +It used to enable staDFA algorithm, which differs from TDFA in that register +operations are placed in states rather than on transitions. Benchmarks +showed that staDFA algorithm is less efficient than TDFA. +.TP +.B \fB\-\-fixed\-tags \fP +Internal option: +specify whether the fixed\-tag optimization should be applied to all tags +(\fBall\fP), none of them (\fBnone\fP), or only those in toplevel concatenation +(\fBtoplevel\fP). The default is \fBall\fP\&. +\(dqFixed\(dq tags are those that are located within a fixed distance to some +other tag (called \(dqbase\(dq). In such cases only the base tag needs to be +tracked, and the value of the fixed tag can be computed as the value of the +base tag plus a static offset. 
For tags that are under alternative or +repetition it is also necessary to check if the base tag has a no\-match +value (in that case fixed tag should also be set to no\-match, disregarding +the offset). For tags in top\-level concatenation the check is not needed, +because they always match. +.UNINDENT +.SH WARNINGS +.sp +Warnings can be individually enabled, disabled and turned into an error. +.INDENT 0.0 +.TP +.B \fB\-W\fP +Turn on all warnings. +.TP +.B \fB\-Werror\fP +Turn warnings into errors. Note that this option alone +doesn\(aqt turn on any warnings; it only affects those warnings that have +been turned on so far or will be turned on later. +.TP +.B \fB\-W\fP +Turn on \fBwarning\fP\&. +.TP +.B \fB\-Wno\-\fP +Turn off \fBwarning\fP\&. +.TP +.B \fB\-Werror\-\fP +Turn on \fBwarning\fP and treat it as an error (this implies \fB\-W\fP). +.TP +.B \fB\-Wno\-error\-\fP +Don\(aqt treat this particular \fBwarning\fP as an error. This doesn\(aqt turn off +the warning itself. +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-Wcondition\-order\fP +Warn if the generated program makes implicit assumptions about condition +numbering. One should use either the \fB\-\-header\fP option or the +\fBconditions:re2c\fP directive to generate a mapping of condition names to +numbers and then use the autogenerated condition names. +.TP +.B \fB\-Wempty\-character\-class\fP +Warn if a regular expression contains an empty character class. Trying to +match an empty character class makes no sense: it should always fail. +However, for backwards compatibility reasons re2c permits empty character +classes and treats them as empty strings. Use the \fB\-\-empty\-class\fP option +to change the default behavior. +.TP +.B \fB\-Wmatch\-empty\-string\fP +Warn if a rule is nullable (matches an empty string). +If the lexer runs in a loop and the empty match is unintentional, the lexer +may unexpectedly hang in an infinite loop. 
+.TP +.B \fB\-Wswapped\-range\fP +Warn if the lower bound of a range is greater than its upper bound. The +default behavior is to silently swap the range bounds. +.TP +.B \fB\-Wundefined\-control\-flow\fP +Warn if some input strings cause undefined control flow in the lexer (the +faulty patterns are reported). This is a dangerous and common mistake. It +can be easily fixed by adding the default rule \fB*\fP which has the lowest +priority, matches any code unit, and always consumes a single code unit. +.TP +.B \fB\-Wunreachable\-rules\fP +Warn about rules that are shadowed by other rules and will never match. +.TP +.B \fB\-Wuseless\-escape\fP +Warn if a symbol is escaped when it shouldn\(aqt be. +By default, re2c silently ignores such escapes, but this may as well +indicate a typo or an error in the escape sequence. +.TP +.B \fB\-Wnondeterministic\-tags\fP +Warn if a tag has \fBn\fP\-th degree of nondeterminism, where \fBn\fP is greater +than 1. +.TP +.B \fB\-Wsentinel\-in\-midrule\fP +Warn if the sentinel symbol occurs in the middle of a rule \-\-\- this may +cause reads past the end of buffer, crashes or memory corruption in the +generated lexer. This warning is only applicable if the sentinel method of +checking for the end of input is used. +It is set to an error if \fBre2c:sentinel\fP configuration is used. +.UNINDENT +.SH BLOCKS AND DIRECTIVES +.sp +Below is the list of re2c directives (syntactic constructs that mark the +beginning and end of the code that should be processed by re2c). Named blocks +were added in re2c version 2.2. They are exactly the same as unnamed blocks, +except that the name can be used to reference a block in other parts of the +program. More information on each directive can be found in the related +sections. +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A global re2c block with an optional name. The block may contain named +definitions, configurations and rules in any order. 
Named definitions and +configurations are defined in the global scope, so they are inherited by +subsequent blocks. The code for a global block is generated at the point +where the block is specified. +.TP +.B \fB/*!local:re2c[:] ... */\fP +A local re2c block with an optional name. Unlike global blocks, definitions +and configurations inside of a local block are not added into the global +scope. In all other respects local blocks are the same as global blocks. +.TP +.B \fB/*!rules:re2c[:] ... */\fP +A reusable block with an optional name. Rules blocks have the same structure +as local or global blocks, but they do not produce any code and they can be +reused multiple times in other blocks with the help of a \fB!use:;\fP +directive or a \fB/*!use:re2c[:] ... */\fP block. A rules block on its +own does not add any definitions into the global scope. The code for it is +generated at the point of use. Prior to re2c version 2.2 rules blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB/*!use:re2c[:] ... */\fP +A use block that references a previously defined rules block. If the name is +specified, re2c looks for a rules block with this name. Otherwise the most +recent rules block is used (either a named or an unnamed one). A use block +can add definitions, configurations and rules of its own, which are added to +those of the referenced rules block. Prior to re2c version 2.2 use blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB!use:;\fP +An in\-block use directive that merges a previously defined rules block with +the specified name into the current block. Named definitions, configurations +and rules of the referenced block are added to the current ones. Conflicts +between overlapping rules and configurations are resolved in the usual way: +the first rule takes priority, and the latest configuration overrides the +preceding ones. One exception is the special rules \fB*\fP, \fB$\fP and \fB\fP +for which a block\-local definition always takes priority. 
A use directive +can be placed anywhere inside of a block, and multiple use directives are +allowed. +.TP +.B \fB/*!max:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXFILL\fP definition. +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXFILL\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXFILL \fP), or a global variable for Go +(\fBvar YYMAXFILL int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXFILL\fP\&. +.TP +.B \fB/*!maxnmatch:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXNMATCH\fP definition (it requires +\fB\-P \-\-posix\-captures\fP option). +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXNMATCH\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXNMATCH \fP), or a global variable for Go +(\fBvar YYMAXNMATCH int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXNMATCH\fP\&. +.TP +.B \fB/*!stags:re2c[:[:...]] ... */\fP, \fB/*!mtags:re2c[:[:...]] ... */\fP +Directives that specify a template piece of code that is expanded for each +s\-tag/m\-tag variable generated by re2c. +An optional list of block names specifies which blocks should be included +when computing the set of tag variables (if the list is empty, all blocks +are included). +There are two optional configurations: \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{tag}\fP (or +\fB@@\fP for short) is replaced with the name of each tag variable. 
+Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different tag variables. +.TP +.B \fB/*!getstate:re2c[:[:...]] ... */\fP +A directive that generates conditional dispatch on the lexer state (it +requires \fB\-\-storable\-state\fP option). +An optional list of block names specifies which blocks should be included in +the state dispatch. The default transition goes to the start label of the +first block on the list. If the list is empty, all blocks are included, and +the default transition goes to the first block in the file that has a start +label. +This directive is incompatible with the \fB\-\-loop\-switch\fP option and Rust, +as it requires cross\-block transitions that are unsupported without the +\fBgoto\fP statement. +.TP +.B \fB/*!conditions:re2c[:[:...]] ... */\fP, \fB/*!types:re2c... */\fP +A directive that generates condition enumeration (it requires +\fB\-\-conditions\fP option). +An optional list of block names specifies which blocks should be included +when computing the set of conditions (if the list is empty, all blocks are +included). +By default the generated code is an enumeration \fBYYCONDTYPE\fP\&. It can be +customized with optional configurations \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{cond}\fP (or +\fB@@\fP for short) is replaced with the name of each condition, and +\fB@@{num}\fP is replaced with a numeric index of that condition. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different conditions. +.TP +.B \fB/*!include:re2c */\fP +This directive allows one to include \fB\fP, which must be a double\-quoted +file path. The contents of the file are literally substituted in place of +the directive, in the same way as \fB#include\fP works in C/C++. 
This +directive can be used together with the \fB\-\-depfile\fP option to generate +build system dependencies on the included files. +.TP +.B \fB!include ;\fP +This directive is the same as \fB/*!include:re2c */\fP, except that it +should be used inside of a re2c block. +.TP +.B \fB/*!header:re2c:on*/\fP +This directive marks the start of header file. Everything after it and up to +the following \fB/*!header:re2c:off*/\fP directive is processed by re2c and +written to the header file specified with \fB\-t \-\-type\-header\fP option. +.TP +.B \fB/*!header:re2c:off*/\fP +This directive marks the end of header file started with +\fB/*!header:re2c:on*/\fP\&. +.TP +.B \fB/*!ignore:re2c ... */\fP +A block whose contents are ignored and removed from the output file. +.TP +.B \fB%{ ... %}\fP +A global re2c block in the \fB\-\-flex\-syntax\fP mode. This is deprecated and +exists for backward compatibility. +.UNINDENT +.SH CONFIGURATIONS +.INDENT 0.0 +.TP +.B \fBre2c:api\fP, \fBre2c:flags:input\fP +Same as the \fB\-\-api\fP option. +.TP +.B \fBre2c:api:sigil\fP +Specify the marker (\(dqsigil\(dq) that is used for argument placeholders in the +API primitives. The default is \fB@@\fP\&. A placeholder starts with sigil +followed by the argument name in curly braces. For example, if sigil is set +to \fB$\fP, then placeholders will have the form \fB${name}\fP\&. Single\-argument +APIs may use shorthand notation without the name in braces. This option can +be overridden by options for individual API primitives, e.g. +\fBre2c:define:YYFILL@len\fP for \fBYYFILL\fP\&. +.TP +.B \fBre2c:api:style\fP +Specify API style. Possible values are \fBfunctions\fP (the default for C) and +\fBfree\-form\fP (the default for Go and Rust). +In \fBfunctions\fP style API primitives are generated with an argument list in +parentheses following the name of the primitive. 
The arguments are provided +only for autogenerated parameters (such as the number of characters passed +to \fBYYFILL\fP), but not for the general lexer context, so the primitives +behave more like macros in C/C++ or closures in Go and Rust. +In free\-form style API primitives do not have a fixed form: they should be +defined as strings containing free\-form pieces of code with interpolated +variables of the form \fB@@{var}\fP or \fB@@\fP (they correspond to arguments in +function\-like style). +This configuration may be overridden for individual API primitives, see for +example \fBre2c:define:YYFILL:naked\fP configuration for \fBYYFILL\fP\&. +.TP +.B \fBre2c:bit\-vectors\fP, \fBre2c:flags:bit\-vectors\fP, \fBre2c:flags:b\fP +Same as the \fB\-\-bit\-vectors\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-insensitive\fP, \fBre2c:flags:case\-insensitive\fP +Same as the \fB\-\-case\-insensitive\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:case\-inverted\fP, \fBre2c:flags:case\-inverted\fP +Same as the \fB\-\-case\-inverted\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-ranges\fP, \fBre2c:flags:case\-ranges\fP +Same as the \fB\-\-case\-ranges\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos\fP, \fBre2c:flags:computed\-gotos\fP, \fBre2c:flags:g\fP +Same as the \fB\-\-computed\-gotos\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos:threshold\fP, \fBre2c:cgoto:threshold\fP +If computed \fBgoto\fP is used, this configuration specifies the complexity +threshold that triggers the generation of jump tables instead of nested +\fBif\fP statements and bitmaps. The default value is \fB9\fP\&. +.TP +.B \fBre2c:cond:goto\fP +Specifies a piece of code used for the autogenerated shortcut rules \fB:=>\fP +in conditions. The default is \fBgoto @@;\fP\&. 
+The \fB@@\fP placeholder is substituted with condition name (see +configurations \fBre2c:api:sigil\fP and \fBre2c:cond:goto@cond\fP). +.TP +.B \fBre2c:cond:goto@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:goto\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:divider\fP +Defines the divider for condition blocks. +The default value is \fB/* *********************************** */\fP\&. +Placeholders are substituted with condition name (see \fBre2c:api:sigil\fP and +\fBre2c:cond:divider@cond\fP). +.TP +.B \fBre2c:cond:divider@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:divider\fP +definition. The default is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:prefix\fP, \fBre2c:condprefix\fP +Specifies the prefix used for condition labels. +The default is \fByyc_\fP\&. +.TP +.B \fBre2c:cond:enumprefix\fP, \fBre2c:condenumprefix\fP +Specifies the prefix used for condition identifiers. +The default is \fByyc\fP\&. +.TP +.B \fBre2c:debug\-output\fP, \fBre2c:flags:debug\-output\fP, \fBre2c:flags:d\fP +Same as the \fB\-\-debug\-output\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:define:YYBACKUP\fP +Defines generic API primitive \fBYYBACKUP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYBACKUPCTX\fP +Defines generic API primitive \fBYYBACKUPCTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYCONDTYPE\fP +Defines \fBYYCONDTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTYPE\fP +Defines \fBYYCTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTXMARKER\fP +Defines API primitive \fBYYCTXMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCURSOR\fP +Defines API primitive \fBYYCURSOR\fP (see the API primitives section). 
+.TP +.B \fBre2c:define:YYDEBUG\fP +Defines API primitive \fBYYDEBUG\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL\fP +Defines API primitive \fBYYFILL\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL@len\fP +Specifies the sigil used for argument substitution in \fBYYFILL\fP +definition. Defaults to \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYFILL:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for \fBYYFILL\fP\&. +Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETCONDITION\fP +Defines API primitive \fBYYGETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETSTATE\fP +Defines API primitive \fBYYGETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYLESSTHAN\fP +Defines generic API primitive \fBYYLESSTHAN\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYLIMIT\fP +Defines API primitive \fBYYLIMIT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMARKER\fP +Defines API primitive \fBYYMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGN\fP +Defines generic API primitive \fBYYMTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGP\fP +Defines generic API primitive \fBYYMTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYPEEK\fP +Defines generic API primitive \fBYYPEEK\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYRESTORE\fP +Defines generic API primitive \fBYYRESTORE\fP (see the API primitives +section). 
+.TP +.B \fBre2c:define:YYRESTORECTX\fP +Defines generic API primitive \fBYYRESTORECTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORETAG\fP +Defines generic API primitive \fBYYRESTORETAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSETCONDITION\fP +Defines API primitive \fBYYSETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETCONDITION@cond\fP +Specifies the sigil used for argument substitution in \fBYYSETCONDITION\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYSETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSETSTATE\fP +Defines API primitive \fBYYSETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETSTATE@state\fP +Specifies the sigil used for argument substitution in \fBYYSETSTATE\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYSETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSKIP\fP +Defines generic API primitive \fBYYSKIP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFT\fP +Defines generic API primitive \fBYYSHIFT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFTMTAG\fP +Defines generic API primitive \fBYYSHIFTMTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSHIFTSTAG\fP +Defines generic API primitive \fBYYSHIFTSTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSTAGN\fP +Defines generic API primitive \fBYYSTAGN\fP (see the API primitives section). 
+.TP +.B \fBre2c:define:YYSTAGP\fP +Defines generic API primitive \fBYYSTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:empty\-class\fP, \fBre2c:flags:empty\-class\fP +Same as the \fB\-\-empty\-class\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:encoding:ebcdic\fP, \fBre2c:flags:ecb\fP, \fBre2c:flags:e\fP +Same as the \fB\-\-ebcdic\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:ucs2\fP, \fBre2c:flags:wide\-chars\fP, \fBre2c:flags:w\fP +Same as the \fB\-\-ucs2\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf8\fP, \fBre2c:flags:utf\-8\fP, \fBre2c:flags:8\fP +Same as the \fB\-\-utf8\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf16\fP, \fBre2c:flags:utf\-16\fP, \fBre2c:flags:x\fP +Same as the \fB\-\-utf16\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf32\fP, \fBre2c:flags:unicode\fP, \fBre2c:flags:u\fP +Same as the \fB\-\-utf32\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding\-policy\fP, \fBre2c:flags:encoding\-policy\fP +Same as the \fB\-\-encoding\-policy\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:eof\fP +Specifies the sentinel symbol used with the end\-of\-input rule \fB$\fP\&. The +default value is \fB\-1\fP (\fB$\fP rule is not used). Other possible values +include all valid code units. Only decimal numbers are recognized. +.TP +.B \fBre2c:header\fP, \fBre2c:flags:type\-header\fP, \fBre2c:flags:t\fP +Specifies the name of the generated header file relative to the directory of +the output file. Same as the \fB\-\-header\fP option except that the file path +is relative. +.TP +.B \fBre2c:indent:string\fP +Specifies the string used for indentation. The default is a single tab +character \fB\(dq\et\(dq\fP\&. Indent string should contain whitespace characters only. 
+To disable indentation entirely, set this configuration to an empty string. +.TP +.B \fBre2c:indent:top\fP +Specifies the minimum amount of indentation to use. The default value is +zero. The value should be a non\-negative integer number. +.TP +.B \fBre2c:invert\-captures\fP +Same as the \fB\-\-invert\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:label:prefix\fP, \fBre2c:labelprefix\fP +Specifies the prefix used for DFA state labels. The default is \fByy\fP\&. +.TP +.B \fBre2c:label:start\fP, \fBre2c:startlabel\fP +Controls the generation of a block start label. The default value is zero, +which means that the start label is generated only if it is used. An integer +value greater than zero forces the generation of start label even if it is +unused by the lexer. A string value also forces start label generation and +sets the label name to the specified string. This configuration applies only +to the current block (it is reset to default for the next block). +.TP +.B \fBre2c:label:yyFillLabel\fP +Specifies the prefix of \fBYYFILL\fP labels used with \fBre2c:eof\fP and in +storable state mode. +.TP +.B \fBre2c:label:yyloop\fP +Specifies the name of the label marking the start of the lexer loop with +\fB\-\-loop\-switch\fP option. The default is \fByyloop\fP\&. +.TP +.B \fBre2c:label:yyNext\fP +Specifies the name of the optional label that follows \fBYYGETSTATE\fP switch +in storable state mode (enabled with \fBre2c:state:nextlabel\fP). The default +is \fByyNext\fP\&. +.TP +.B \fBre2c:leftmost\-captures\fP +Same as the \fB\-\-leftmost\-captures\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:lookahead\fP, \fBre2c:flags:lookahead\fP +Deprecated (see the deprecated \fB\-\-no\-lookahead\fP option). +.TP +.B \fBre2c:nested\-ifs\fP, \fBre2c:flags:nested\-ifs\fP, \fBre2c:flags:s\fP +Same as the \fB\-\-nested\-ifs\fP option, but can be configured on per\-block +basis. 
+.TP +.B \fBre2c:posix\-captures\fP, \fBre2c:flags:posix\-captures\fP, \fBre2c:flags:P\fP +Same as the \fB\-\-posix\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:tags\fP, \fBre2c:flags:tags\fP, \fBre2c:flags:T\fP +Same as the \fB\-\-tags\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:tags:expression\fP +Specifies the expression used for tag variables. +By default re2c generates expressions of the form \fByyt\fP\&. This might +be inconvenient, for example if tag variables are defined as fields in a +struct. All occurrences of \fB@@{tag}\fP or \fB@@\fP are replaced with the +actual tag name. For example, \fBre2c:tags:expression = \(dqs.@@\(dq;\fP results +in expressions of the form \fBs.yyt\fP in the generated code. +See also \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:tags:prefix\fP +Specifies the prefix for tag variable names. The default is \fByyt\fP\&. +.TP +.B \fBre2c:sentinel\fP +Specifies the sentinel symbol used for the end\-of\-input checks (when bounds +checks are disabled with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP is not +set). This configuration does not affect code generation: its purpose is to +verify that the sentinel is not allowed in the middle of a rule, and ensure +that the lexer won\(aqt read past the end of buffer. The default value is +\fB\-1\fP (in that case re2c assumes that the sentinel is zero, which is the +most common case). Only decimal numbers are recognized. +.TP +.B \fBre2c:state:abort\fP +If set to a positive integer value, changes the default case in +\fBYYGETSTATE\fP switch: by default it aborts the program, and an explicit +\fB\-1\fP case contains transition to the start of the block. +.TP +.B \fBre2c:state:nextlabel\fP +Controls if the \fBYYGETSTATE\fP switch is followed by an \fByyNext\fP label +(the default value is zero, which corresponds to no label). 
+Alternatively one can use \fBre2c:label:start\fP to generate a specific start +label, or an explicit \fBgetstate:re2c\fP directive to generate the +\fBYYGETSTATE\fP switch separately from the lexer block. +.TP +.B \fBre2c:unsafe\fP, \fBre2c:flags:unsafe\fP +Same as the \fB\-\-no\-unsafe\fP option, but can be configured on per\-block +basis. +If set to zero, it suppresses the generation of \fBunsafe\fP wrappers around +\fBYYPEEK\fP\&. The default is non\-zero (wrappers are generated). +This configuration is specific to Rust. +.TP +.B \fBre2c:variable:yyaccept\fP +Specifies the name of the \fByyaccept\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yybm\fP +Specifies the name of the \fByybm\fP variable (used for bitmaps). +.TP +.B \fBre2c:variable:yybm:hex\fP, \fBre2c:yybm:hex\fP +If set to nonzero, bitmaps for the \fB\-\-bit\-vectors\fP option are generated +in hexadecimal format. The default is zero (bitmaps are in decimal format). +.TP +.B \fBre2c:variable:yych\fP +Specifies the name of the \fByych\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yych:emit\fP, \fBre2c:yych:emit\fP +If set to zero, \fByych\fP definition is not generated. +The default is non\-zero. +.TP +.B \fBre2c:variable:yych:conversion\fP, \fBre2c:yych:conversion\fP +If set to non\-zero, re2c automatically generates a conversion to \fBYYCTYPE\fP +every time \fByych\fP is read. The default is zero (no conversion). +.TP +.B \fBre2c:variable:yyctable\fP +Specifies the name of the \fByyctable\fP variable (the jump table generated +for \fBYYGETCONDITION\fP switch with \fB\-\-computed\-gotos\fP option). +.TP +.B \fBre2c:variable:yytarget\fP +Specifies the name of the \fByytarget\fP variable. +.TP +.B \fBre2c:variable:yystable\fP +Deprecated. +.TP +.B \fBre2c:variable:yystate\fP +Specifies the name of the \fByystate\fP variable (used with the +\fB\-\-loop\-switch\fP option to store the current DFA state). 
+.TP +.B \fBre2c:yyfill:check\fP +If set to zero, suppresses the generation of pre\-\fBYYFILL\fP check for the +number of input characters (the \fBYYLESSTHAN\fP definition in generic API and +the \fBYYLIMIT\fP\-based comparison in C pointer API). The default is non\-zero +(generate the check). +.TP +.B \fBre2c:yyfill:enable\fP +If set to zero, suppresses the generation of \fBYYFILL\fP (together +with the check). This should be used when the whole input fits into one piece +of memory (there is no need for buffering) and the end\-of\-input checks do not +rely on the \fBYYFILL\fP checks (e.g. if a sentinel character is used). +Use warnings (\fB\-W\fP option) and \fBre2c:sentinel\fP configuration to verify +that the generated lexer cannot read past the end of input. +The default is non\-zero (\fBYYFILL\fP is enabled). +.TP +.B \fBre2c:yyfill:parameter\fP +If set to zero, suppresses the generation of parameter passed to \fBYYFILL\fP\&. +The parameter is the minimum number of characters that must be supplied. +Defaults to non\-zero (the parameter is generated). +This configuration can be overridden with \fBre2c:define:YYFILL:naked\fP or +\fBre2c:api:style\fP\&. 
+.UNINDENT +.SH REGULAR EXPRESSIONS +.sp +re2c uses the following syntax for regular expressions: +.INDENT 0.0 +.IP \(bu 2 +\fB\(dqfoo\(dq\fP case\-sensitive string literal +.IP \(bu 2 +\fB\(aqfoo\(aq\fP case\-insensitive string literal +.IP \(bu 2 +\fB[a\-xyz]\fP, \fB[^a\-xyz]\fP character class (possibly negated) +.IP \(bu 2 +\fB\&.\fP any character except newline +.IP \(bu 2 +\fBR \e S\fP difference of character classes \fBR\fP and \fBS\fP +.IP \(bu 2 +\fBR*\fP zero or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR+\fP one or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR?\fP optional \fBR\fP +.IP \(bu 2 +\fBR{n}\fP repetition of \fBR\fP exactly \fBn\fP times +.IP \(bu 2 +\fBR{n,}\fP repetition of \fBR\fP at least \fBn\fP times +.IP \(bu 2 +\fBR{n,m}\fP repetition of \fBR\fP from \fBn\fP to \fBm\fP times +.IP \(bu 2 +\fB(R)\fP just \fBR\fP; parentheses are used to override precedence. +If submatch extraction is enabled, \fB(R)\fP is a capturing or a +non\-capturing group depending on \fB\-\-invert\-captures\fP option. +.IP \(bu 2 +\fB(!R)\fP +If submatch extraction is enabled, \fB(!R)\fP is a non\-capturing or a +capturing group depending on \fB\-\-invert\-captures\fP option. 
+.IP \(bu 2 +\fBR S\fP concatenation: \fBR\fP followed by \fBS\fP +.IP \(bu 2 +\fBR | S\fP alternative: \fBR\fP or \fBS\fP +.IP \(bu 2 +\fBR / S\fP lookahead: \fBR\fP followed by \fBS\fP, but \fBS\fP is not consumed +.IP \(bu 2 +\fBname\fP the regular expression defined as \fBname\fP (or literal string +\fB\(dqname\(dq\fP in Flex compatibility mode) +.IP \(bu 2 +\fB{name}\fP the regular expression defined as \fBname\fP in Flex +compatibility mode +.IP \(bu 2 +\fB@stag\fP an \fIs\-tag\fP: saves the last input position at which \fB@stag\fP +matches in a variable named \fBstag\fP +.IP \(bu 2 +\fB#mtag\fP an \fIm\-tag\fP: saves all input positions at which \fB#mtag\fP matches +in a variable named \fBmtag\fP +.UNINDENT +.sp +Character classes and string literals may contain the following escape +sequences: \fB\ea\fP, \fB\eb\fP, \fB\ef\fP, \fB\en\fP, \fB\er\fP, \fB\et\fP, \fB\ev\fP, \fB\e\e\fP, +octal escapes \fB\eooo\fP and hexadecimal escapes \fB\exhh\fP, \fB\euhhhh\fP and +\fB\eUhhhhhhhh\fP\&. +.SH HANDLING THE END OF INPUT +.sp +One of the main problems for the lexer is to know when to stop. +There are a few terminating conditions: +.INDENT 0.0 +.IP \(bu 2 +the lexer may match some rule (including default rule \fB*\fP) and come to a +final state +.IP \(bu 2 +the lexer may fail to match any rule and come to a default state +.IP \(bu 2 +the lexer may reach the end of input +.UNINDENT +.sp +The first two conditions terminate the lexer in a \(dqnatural\(dq way: it comes to a +state with no outgoing transitions, and the matching automatically stops. The +third condition, end of input, is different: it may happen in any state, and the +lexer should be able to handle it. Checking for the end of input interrupts the +normal lexer workflow and adds conditional branches to the generated program, +therefore it is necessary to minimize the number of such checks. re2c supports a +few different methods for handling the end of input. 
Which one to use depends on +the complexity of regular expressions, the need for buffering, performance +considerations and other factors. Here is a list of methods: +.INDENT 0.0 +.IP \(bu 2 +\fBSentinel.\fP +This method eliminates the need for the end of input checks altogether. It is +simple and efficient, but limited to the case when there is a natural +\(dqsentinel\(dq character that can never occur in valid input. This character may +still occur in invalid input, but it should not be allowed by the regular +expressions, except perhaps as the last character of a rule. The sentinel is +appended at the end of input and serves as a stop signal: when the lexer reads +this character, it is either a syntax error or the end of input. In both +cases the lexer should stop. This method is used if \fBYYFILL\fP is disabled +with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP has the default value +\fB\-1\fP\&. +.nf + +.fi +.sp +.IP \(bu 2 +\fBSentinel with bounds checks.\fP +This method is generic: it allows to handle any input without restrictions on +the regular expressions. The idea is to reduce the number of end of input +checks by performing them only on certain characters. Similar to the +\(dqsentinel\(dq method, one of the characters is chosen as a \(dqsentinel\(dq and +appended at the end of input. However, there is no restriction on where the +sentinel may occur (in fact, any character can be chosen for a sentinel). +When the lexer reads this character, it additionally performs a bounds check. +If the current position is within bounds, the lexer resumes matching and +handles the sentinel as a regular character. Otherwise it invokes \fBYYFILL\fP +(unless it is disabled). If more input is supplied, the lexer will rematch the +last character and continue as if the sentinel wasn\(aqt there. Otherwise it must +be the real end of input, and the lexer stops. 
This method is used when +\fBre2c:eof\fP has non\-negative value (it should be set to the numeric value of +the sentinel). \fBYYFILL\fP is optional. +.nf + +.fi +.sp +.IP \(bu 2 +\fBBounds checks with padding.\fP +This method is generic, and it may be faster than the \(dqsentinel with bounds +checks\(dq method, but it is also more complex. The idea is to partition DFA +states into strongly connected components (SCCs) and generate a single check +per SCC for enough characters to cover the longest non\-looping path in this +SCC. This reduces the number of checks, but there is a problem with short +lexemes at the end of input, as the check requires enough characters to cover +the longest lexeme. This can be fixed by padding the input with a few fake +characters that do not form a valid lexeme suffix (so that the lexer cannot +match them). The length of padding should be \fBYYMAXFILL\fP, generated with +\fB/*!max:re2c*/\fP\&. If there is not enough input, the lexer invokes \fBYYFILL\fP +which should supply at least the required number of characters or not return. +This method is used if \fBYYFILL\fP is enabled and \fBre2c:eof\fP is \fB\-1\fP +(this is the default configuration). +.nf + +.fi +.sp +.IP \(bu 2 +\fBCustom checks.\fP +Generic API allows to override basic operations like reading a character, +which makes it possible to include the end\-of\-input checks as part of them. +This approach is error\-prone and should be used with caution. To use a custom +method, enable generic API with \fB\-\-api custom\fP or \fBre2c:api = custom;\fP and +disable default bounds checks with \fBre2c:yyfill:enable = 0;\fP or +\fBre2c:yyfill:check = 0;\fP\&. +.UNINDENT +.sp +The following subsections contain an example of each method. +.SS Sentinel +.sp +This example uses a sentinel character to handle the end of input. The program +counts space\-separated words in a null\-terminated string. 
The sentinel is null: +it is the last character of each input string, and it is not allowed in the +middle of a lexeme by any of the rules (in particular, it is not included in +character ranges where it is easy to overlook). If a null occurs in the middle +of a string, it is a syntax error and the lexer will match default rule \fB*\fP, +but it won\(aqt read past the end of input or crash (use +\fI\%\-Wsentinel\-in\-midrule\fP +warning and \fBre2c:sentinel\fP configuration to verify this). Configuration +\fBre2c:yyfill:enable = 0;\fP suppresses the generation of bounds checks and +\fBYYFILL\fP invocations. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +class Main { + // Expects a null\-terminated string. + static int lex(String yyinput) { + int yycursor = 0; + int count = 0; + + loop: while (true) { + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + re2c:yyfill:enable = 0; + + * { return \-1; } + [\ex00] { return count; } + [a\-z]+ { count += 1; continue loop; } + [ ]+ { continue loop; } + */ + } + } + + public static void main(String []args) { + assert lex(\(dq\e0\(dq) == 0; + assert lex(\(dqone two three\e0\(dq) == 3; + assert lex(\(dqf0ur\e0\(dq) == \-1; + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Sentinel with bounds checks +.sp +This example uses sentinel with bounds checks to handle the end of input (this +method was added in version 1.2). The program counts space\-separated +single\-quoted strings. The sentinel character is null, which is specified with +\fBre2c:eof = 0;\fP configuration. As in the \fI\%sentinel\fP method, null is the last +character of each input string, but it is allowed in the middle of a rule (for +example, \fB\(aqaaa\e0aa\(aq\e0\fP is valid input, but \fB\(aqaaa\e0\fP is a syntax error). +Bounds checks are generated in each state that matches an input character, but +they are scoped to the branch that handles null. 
Bounds checks are of the form +\fBYYLIMIT <= YYCURSOR\fP or \fBYYLESSTHAN(1)\fP with generic API. If the check +condition is true, lexer has reached the end of input and should stop +(\fBYYFILL\fP is disabled with \fBre2c:yyfill:enable = 0;\fP as the input fits into +one buffer, see the \fI\%YYFILL with sentinel\fP section for an example that uses +\fBYYFILL\fP). Reaching the end of input opens three possibilities: if the lexer +is in the initial state it will match the end\-of\-input rule \fB$\fP, otherwise it +may fallback to a previously matched rule (including default rule \fB*\fP) or go +to a default state, causing +\fI\%\-Wundefined\-control\-flow\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +class Main { + // Expects a null\-terminated string. + static int lex(String yyinput) { + int yycursor = 0; + int yymarker = 0; + int yylimit = yyinput.length() \- 1; // yylimit points at the terminating null + int count = 0; + + loop: while (true) { + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + re2c:yyfill:enable = 0; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1; } + $ { return count; } + str { count += 1; continue loop; } + [ ]+ { continue loop; } + */ + } + } + + public static void main(String []args) { + assert lex(\(dq\e0\(dq) == 0; + assert lex(\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \e0\(dq) == 3; + assert lex(\(dq\(aqunterminated\e\e\(aq\e0\(dq) == \-1; + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Bounds checks with padding +.sp +This example uses bounds checks with padding to handle the end of input (this +method is enabled by default). The program counts space\-separated single\-quoted +strings. There is a padding of \fBYYMAXFILL\fP null characters appended at the end +of input, where \fBYYMAXFILL\fP value is autogenerated with \fB/*!max:re2c*/\fP\&. 
It +is not necessary to use null for padding \-\-\- any characters can be used as long +as they do not form a valid lexeme suffix (in this example padding should not +contain single quotes, as they may be mistaken for a suffix of a single\-quoted +string). There is a \(dqstop\(dq rule that matches the first padding character (null) +and terminates the lexer (note that it checks if null is at the beginning of +padding, otherwise it is a syntax error). Bounds checks are generated only in +some states that are determined by the strongly connected components of the +underlying automaton. Checks have the form \fB(YYLIMIT \- YYCURSOR) < n\fP or +\fBYYLESSTHAN(n)\fP with generic API, where \fBn\fP is the minimum number of +characters that are needed for the lexer to proceed (it also means that the next +bounds check will occur in at most \fBn\fP characters). If the check condition is +true, the lexer has reached the end of input and will invoke \fBYYFILL(n)\fP that +should either supply at least \fBn\fP input characters or not return. In this +example \fBYYFILL\fP always fails and terminates the lexer with an error (which is +fine because the input fits into one buffer). See the \fI\%YYFILL with padding\fP +section for an example that refills the input buffer with \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +class Main { + /*!max:re2c*/ + + // Expects yymaxfill\-padded string. + static int lex(String str) { + // Pad string with yymaxfill zeroes at the end. 
+ byte[] yyinput = new byte[str.length() + YYMAXFILL]; + System.arraycopy(str.getBytes(), 0, yyinput, 0, str.length()); + + int yycursor = 0; + int yylimit = yyinput.length; + int count = 0; + + loop: while (true) { + /*!re2c + re2c:define:YYCTYPE = \(dqbyte\(dq; + re2c:define:YYPEEK = \(dqyyinput[yycursor]\(dq; + re2c:define:YYFILL = \(dqreturn \-1;\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // Check that it is the sentinel, not some unexpected null. + return (yycursor \- 1 == str.length()) ? count : \-1; + } + str { count += 1; continue loop; } + [ ]+ { continue loop; } + * { return \-1; } + */ + } + } + + public static void main(String []args) { + assert lex(\(dq\(dq) == 0; + assert lex(\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq) == 3; + assert lex(\(dq\(aqunterminated\e\e\(aq\(dq) == \-1; + assert lex(\(dq\(aqunexpected \e00 null\e\e\(aq\(dq) == \-1; + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Custom checks +.sp +This example uses a custom end\-of\-input handling method based on generic API. +The program counts space\-separated words. It is the same as the +\fI\%sentinel\fP example, except that the input is not null\-terminated. To cover up +for the absence of a sentinel character at the end of input, \fBYYPEEK\fP is +redefined to perform a bounds check before it reads the next input character. +This is inefficient because checks are done very often. If the check condition +fails, \fBYYPEEK\fP returns the real character, otherwise it returns a fake +sentinel character. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +class Main { + // Expects a string without terminating null. + static int lex(String str) { + byte[] yyinput = str.getBytes(); + int yycursor = 0; + int count = 0; + + loop: while (true) { + /*!re2c + re2c:api = generic; + re2c:define:YYCTYPE = \(dqbyte\(dq; + re2c:define:YYPEEK = \(dq(yycursor < yyinput.length) ? 
yyinput[yycursor] : 0\(dq; + re2c:define:YYSKIP = \(dqyycursor += 1;\(dq; + re2c:yyfill:enable = 0; + + * { return \-1; } + [\ex00] { return count; } + [a\-z]+ { count += 1; continue loop; } + [ ]+ { continue loop; } + */ + } + } + + public static void main(String []args) { + assert lex(\(dq\(dq) == 0; + assert lex(\(dqone two three\(dq) == 3; + assert lex(\(dqf0ur\(dq) == \-1; + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH BUFFER REFILLING +.sp +The need for buffering arises when the input cannot be mapped in memory all at +once: either it is too large, or it comes in a streaming fashion (like reading +from a socket). The usual technique in such cases is to allocate a fixed\-sized +memory buffer and process input in chunks that fit into the buffer. When the +current chunk is processed, it is moved out and new data is moved in. In +practice it is somewhat more complex, because lexer state consists not of a +single input position, but a set of interrelated positions: +.INDENT 0.0 +.IP \(bu 2 +cursor: the next input character to be read (\fBYYCURSOR\fP in C pointer API or +\fBYYSKIP\fP/\fBYYPEEK\fP in generic API) +.IP \(bu 2 +limit: the position after the last available input character (\fBYYLIMIT\fP in +C pointer API, implicitly handled by \fBYYLESSTHAN\fP in generic API) +.IP \(bu 2 +marker: the position of the most recent match, if any (\fBYYMARKER\fP in default +API or \fBYYBACKUP\fP/\fBYYRESTORE\fP in generic API) +.IP \(bu 2 +token: the start of the current lexeme (implicit in re2c API, as it is not +needed for the normal lexer operation and can be defined and updated by the +user) +.IP \(bu 2 +context marker: the position of the trailing context (\fBYYCTXMARKER\fP in +C pointer API or \fBYYBACKUPCTX\fP/\fBYYRESTORECTX\fP in generic API) +.IP \(bu 2 +tag variables: submatch positions (defined with \fB/*!stags:re2c*/\fP and +\fB/*!mtags:re2c*/\fP directives and +\fBYYSTAGP\fP/\fBYYSTAGN\fP/\fBYYMTAGP\fP/\fBYYMTAGN\fP in generic API) +.UNINDENT +.sp +Not all 
these are used in every case, but if used, they must be updated by +\fBYYFILL\fP\&. All active positions are contained in the segment between token and +cursor, therefore everything between buffer start and token can be discarded, +the segment from token and up to limit should be moved to the beginning of +buffer, and the free space at the end of buffer should be filled with new data. +In order to avoid frequent \fBYYFILL\fP calls it is best to fill in as many input +characters as possible (even though fewer characters might suffice to resume the +lexer). The details of \fBYYFILL\fP implementation are slightly different +depending on which EOF handling method is used: the case of EOF rule is somewhat +simpler than the case of bounds\-checking with padding. Also note that if +\fB\-f \-\-storable\-state\fP option is used, \fBYYFILL\fP has slightly different +semantics (described in the section about storable state). +.SS YYFILL with sentinel +.sp +If EOF rule is used, \fBYYFILL\fP is a function\-like primitive that accepts +no arguments and returns a value which is checked against zero. \fBYYFILL\fP +invocation is triggered by condition \fBYYLIMIT <= YYCURSOR\fP in C pointer API and +\fBYYLESSTHAN()\fP in generic API. A non\-zero return value means that \fBYYFILL\fP +has failed. A successful \fBYYFILL\fP call must supply at least one character and +adjust input positions accordingly. Limit must always be set to one after the +last input position in buffer, and the character at the limit position must be +the sentinel symbol specified by \fBre2c:eof\fP configuration. The pictures below +show the relative locations of input positions in buffer before and after +\fBYYFILL\fP call (sentinel symbol is marked with \fB#\fP, and the second picture +shows the case when there is not enough input to fill the whole buffer). 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-\-\-\-\-\-\-\-\-\-E\-> + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-\-\-\-\-\-\-\-\-\-E#\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-E (EOF) + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-E#........ + buffer, marker cursor limit + token +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses EOF rule. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +import java.io.*; +import java.nio.file.*; + +class Lexer { + public static final int BUFSIZE = 4096; + + private BufferedInputStream stream; + private byte[] yyinput; + private int yycursor; + private int yymarker; + private int yylimit; + private int token; + private boolean eof; + + public Lexer(File file) throws FileNotFoundException { + stream = new BufferedInputStream(new FileInputStream(file)); + // Sentinel at \(gayylimit\(ga offset is set to zero, which triggers YYFILL. + yyinput = new byte[BUFSIZE + 1]; + yycursor = yymarker = yylimit = token = BUFSIZE; + eof = false; + } + + private int fill() throws IOException { + if (eof) { return \-1; } // unexpected EOF + + // Error: lexeme too long. In real life can reallocate a larger buffer. + if (token < 1) { return \-2; } + + // Shift buffer contents (discard everything up to the current token). + System.arraycopy(yyinput, token, yyinput, 0, yylimit \- token); + yycursor \-= token; + yymarker \-= token; + yylimit \-= token; + token = 0; + + // Fill free space at the end of buffer with new data from file. 
+ yylimit += stream.read(yyinput, yylimit, BUFSIZE \- yylimit); + yyinput[yylimit] = 0; // append sentinel symbol + + // If read less than expected, this is the end of input. + eof = yylimit < BUFSIZE; + + return 0; + } + + // Expects a null\-terminated string. + public int lex() throws IOException { + int count = 0; + loop: while (true) { + token = yycursor; + /*!re2c + re2c:define:YYCTYPE = \(dqbyte\(dq; + re2c:define:YYPEEK = \(dqyyinput[yycursor]\(dq; + re2c:define:YYFILL = \(dqfill() == 0\(dq; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1; } + $ { return count; } + str { count += 1; continue loop; } + [ ]+ { continue loop; } + */ + } + } + + public static void main(String []args) throws FileNotFoundException, IOException { + String fname = \(dqinput\(dq; + String content = \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq.repeat(Lexer.BUFSIZE); + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + Files.writeString(Paths.get(fname), content); + + int count = 3 * Lexer.BUFSIZE; // number of quoted strings written to file + + // Prepare lexer state: all offsets are at the end of buffer. + File file = new File(\(dq.\(dq, fname); + Lexer lexer = new Lexer(file); + + // Run the lexer. + int n = lexer.lex(); + assert n == count; + + // Cleanup: remove input file. + file.delete(); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS YYFILL with padding +.sp +In the default case (when EOF rule is not used) \fBYYFILL\fP is a function\-like +primitive that accepts a single argument and does not return any value. +\fBYYFILL\fP invocation is triggered by condition \fB(YYLIMIT \- YYCURSOR) < n\fP in +C pointer API and \fBYYLESSTHAN(n)\fP in generic API. The argument passed to +\fBYYFILL\fP is the minimal number of characters that must be supplied. 
If it +fails to do so, \fBYYFILL\fP must not return to the lexer (for that reason it is +best implemented as a macro that returns from the calling function on failure). +In case of a successful \fBYYFILL\fP invocation the limit position must be set +either to one after the last input position in buffer, or to the end of +\fBYYMAXFILL\fP padding (in case \fBYYFILL\fP has successfully read at least \fBn\fP +characters, but not enough to fill the entire buffer). The pictures below show +the relative locations of input positions in buffer before and after \fBYYFILL\fP +invocation (\fBYYMAXFILL\fP padding on the second picture is marked with \fB#\fP +symbols). +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F (EOF) + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F############### + buffer, marker cursor limit + token <\- YYMAXFILL \-> +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses bounds\-checking with padding. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +import java.io.*; +import java.nio.file.*; +import java.util.Arrays; + +class Lexer { + /*!max:re2c*/ + public static final int BUFSIZE = 4096; + + private BufferedInputStream stream; + private byte[] yyinput; + private int yycursor; + private int yylimit; + private int token; + private boolean eof; + + public Lexer(File file) throws FileNotFoundException { + stream = new BufferedInputStream(new FileInputStream(file)); + // Prepare lexer state: all offsets are at the end of buffer. + // This immediately triggers YYFILL, as the YYLESSTHAN condition is true. + yyinput = new byte[BUFSIZE + YYMAXFILL]; + yycursor = yylimit = token = BUFSIZE; + eof = false; + } + + private int fill(int need) throws IOException { + if (eof) { return \-1; } // unexpected EOF + + // Error: lexeme too long. In real life can reallocate a larger buffer. + if (token < need) { return \-2; } + + // Shift buffer contents (discard everything up to the current token). + System.arraycopy(yyinput, token, yyinput, 0, yylimit \- token); + yycursor \-= token; + yylimit \-= token; + token = 0; + + // Fill free space at the end of buffer with new data from file. + yylimit += stream.read(yyinput, yylimit, BUFSIZE \- yylimit); + yyinput[yylimit] = 0; // append sentinel symbol + + // If read less than expected, this is the end of input. + if (yylimit < BUFSIZE) { + eof = true; + Arrays.fill(yyinput, yylimit, yylimit + YYMAXFILL, (byte)0); + yylimit += YYMAXFILL; + } + + return 0; + } + + // Expects a null\-terminated string. + public int lex() throws IOException { + int count = 0; + loop: while (true) { + token = yycursor; + /*!re2c + re2c:define:YYCTYPE = \(dqbyte\(dq; + re2c:define:YYPEEK = \(dqyyinput[yycursor]\(dq; + re2c:define:YYFILL = \(dqif (fill(@@) != 0) { return \-2; }\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // Check that it is the sentinel, not some unexpected null. 
+ return (token == yylimit \- YYMAXFILL) ? count : \-1; + } + str { count += 1; continue loop; } + [ ]+ { continue loop; } + * { return \-1; } + */ + } + } + + public static void main(String []args) throws FileNotFoundException, IOException { + String fname = \(dqinput\(dq; + String content = \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq.repeat(Lexer.BUFSIZE); + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + Files.writeString(Paths.get(fname), content); + + int count = 3 * Lexer.BUFSIZE; // number of quoted strings written to file + + // Prepare lexer state: all offsets are at the end of buffer. + File file = new File(\(dq.\(dq, fname); + Lexer lexer = new Lexer(file); + + // Run the lexer. + int n = lexer.lex(); + assert n == count; + + // Cleanup: remove input file. + file.delete(); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH MULTIPLE BLOCKS +.sp +Sometimes it is necessary to have multiple interrelated lexers (for example, if +there is a high\-level state machine that transitions between lexer modes). This +can be implemented using multiple connected re2c blocks. Another option is to +use \fI\%start conditions\fP\&. +.sp +The implementation of connections between blocks depends on the target language. +In languages that have \fBgoto\fP statement (such as C/C++ and Go) one can have +all blocks in one function, each of them prefixed with a label. Transition from +one block to another is a simple \fBgoto\fP\&. +In languages that do not have \fBgoto\fP (such as Rust) it is necessary to use a +loop with a switch on a state variable, similar to the \fByystate\fP loop/switch +generated by re2c, or else wrap each block in a function and use function calls. +.sp +The example below uses multiple blocks to parse binary, octal, decimal and +hexadecimal numbers. Each base has its own block. The initial block determines +base and dispatches to other blocks. 
Common configurations are defined in a +separate block at the beginning of the program; they are inherited by the other +blocks. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +class Parser { + private String yyinput; + private int yycursor; + private int yymarker; + private int number; + + private void add_digit(int base, int offset) throws ArithmeticException { + number = Math.addExact( + Math.multiplyExact(number, base), + yyinput.charAt(yycursor \- 1) \- offset); + } + + public int parse(String str) throws ArithmeticException, IllegalArgumentException { + yyinput = str; + yycursor = 0; + number = 0; + + try { + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + re2c:yyfill:enable = 0; + + end = \(dq\ex00\(dq; + + \(aq0b\(aq / [01] { return parse_bin(); } + \(dq0\(dq { return parse_oct(); } + \(dq\(dq / [1\-9] { return parse_dec(); } + \(aq0x\(aq / [0\-9a\-fA\-F] { return parse_hex(); } + * { throw new IllegalArgumentException(\(dqnot a number\(dq); } + */ + } catch (Exception e) { + return \-1; + } + } + + private int parse_bin() throws ArithmeticException, IllegalArgumentException { + /*!re2c + end { return number; } + [01] { add_digit(2, 48); return parse_bin(); } + * { throw new IllegalArgumentException(\(dqill\-formed binary number\(dq); } + */ + } + + private int parse_oct() throws ArithmeticException, IllegalArgumentException { + /*!re2c + end { return number; } + [0\-7] { add_digit(8, 48); return parse_oct(); } + * { throw new IllegalArgumentException(\(dqill\-formed octal number\(dq); } + */ + } + + private int parse_dec() throws ArithmeticException, IllegalArgumentException { + /*!re2c + end { return number; } + [0\-9] { add_digit(10, 48); return parse_dec(); } + * { throw new IllegalArgumentException(\(dqill\-formed decimal number\(dq); } + */ + } + + private int parse_hex() throws ArithmeticException, IllegalArgumentException { + /*!re2c + end { return number; } + 
[0\-9] { add_digit(16, 48); return parse_hex(); } + [a\-f] { add_digit(16, 87); return parse_hex(); } + [A\-F] { add_digit(16, 55); return parse_hex(); } + * { throw new IllegalArgumentException(\(dqill\-formed hexadecimal number\(dq); } + */ + } + + public static void main(String []args) { + Parser parser = new Parser(); + assert parser.parse(\(dq1234567890\e0\(dq) == 1234567890; + assert parser.parse(\(dq0b1101\e0\(dq) == 0b1101; + assert parser.parse(\(dq0x007Fe\e0\(dq) == 0x7fe; + assert parser.parse(\(dq0644\e0\(dq) == 0644; + assert parser.parse(\(dq9999999999\e0\(dq) == \-1; + assert parser.parse(\(dq123??\e0\(dq) == \-1; + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH START CONDITIONS +.sp +Start conditions are enabled with \fB\-\-start\-conditions\fP option. They provide a +way to encode multiple interrelated automata within the same re2c block. +.sp +Each condition corresponds to a single automaton and has a unique name specified +by the user and a unique internal number defined by re2c. The numbers are used +to switch between conditions: the generated code uses \fBYYGETCONDITION\fP and +\fBYYSETCONDITION\fP primitives to get the current condition or set it to the +given number. Use \fB/*!conditions:re2c*/\fP directive or the \fB\-\-header\fP option +to generate numeric condition identifiers. Configuration +\fBre2c:cond:enumprefix\fP specifies the generated identifier prefix. +.sp +In condition mode every rule must be prefixed with a list of comma\-separated +condition names in angle brackets, or a wildcard \fB<*>\fP to denote all +conditions. The rule syntax is extended as follows: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB< cond\-list > regexp action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp => cond action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. 
+It matches \fBregexp\fP, sets the current condition to \fBcond\fP and +executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp :=> cond\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and immediately transitions to \fBcond\fP (there is +no semantic action). +.TP +.B \fB<! cond\-list > action\fP +The \fBaction\fP is prepended to semantic actions of all rules for every +condition on the \fBcond\-list\fP\&. This may be used to deduplicate common +code. +.TP +.B \fB< > action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and executes the \fBaction\fP\&. +.TP +.B \fB< > => cond action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string, sets the current condition to +\fBcond\fP and executes the \fBaction\fP\&. +.TP +.B \fB< > :=> cond\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and immediately transitions to +\fBcond\fP\&. +.UNINDENT +.UNINDENT +.UNINDENT +.sp +The code re2c generates for conditions depends on whether re2c uses goto/label +approach or loop/switch approach to encode the automata. +.sp +In languages that have \fBgoto\fP statement (such as C/C++ and Go) conditions are +naturally implemented as blocks of code prefixed with labels of the form +\fByyc_<cond>\fP, where \fBcond\fP is a condition name (label prefix can be changed +with \fBre2c:cond:prefix\fP). Transitions between conditions are implemented using +\fBgoto\fP and condition labels. Before all conditions re2c generates an initial +switch on \fBYYGETCONDITION\fP that jumps to the start state of the current condition. +The shortcut rules \fB:=>\fP bypass the initial switch and jump directly to the +specified condition (\fBre2c:cond:goto\fP can be used to change the default +behavior).
The rules with semantic actions do not automatically jump to the next +condition; this should be done by the user\-defined action code. +.sp +In languages that do not have \fBgoto\fP (such as Rust) re2c reuses the +\fByystate\fP variable to store condition numbers. Each condition gets a numeric +identifier equal to the number of its start state, and a switch between +conditions is no different than a switch between DFA states of a single +condition. There is no need for a separate initial condition switch. +(Since the same approach is used to implement storable states, +\fBYYGETCONDITION\fP/\fBYYSETCONDITION\fP are redundant if both storable states and +conditions are used). +.sp +The program below uses start conditions to parse binary, octal, decimal and +hexadecimal numbers. There is a single block where each base has its own +condition, and the initial condition is connected to all of them. User\-defined +variable \fBcond\fP stores the current condition number; it is initialized to the +number of the initial condition generated with \fB/*!conditions:re2c*/\fP\&. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT \-c + +class Parser { + /*!conditions:re2c*/ + private String yyinput; + private int yycursor; + private int yymarker; + private int number; + + private void add_digit(int base, int offset) throws ArithmeticException { + number = Math.addExact( + Math.multiplyExact(number, base), + yyinput.charAt(yycursor \- 1) \- offset); + } + + public int parse(String str) throws ArithmeticException, IllegalArgumentException { + yyinput = str; + yycursor = 0; + int yycond = YYC_init; + + number = 0; + try { + loop: while (true) { + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + re2c:yyfill:enable = 0; + + <*> * { throw new IllegalArgumentException(\(dqill\-formed number\(dq); } + + <init> \(aq0b\(aq / [01] :=> bin + <init> \(dq0\(dq :=> oct + <init> \(dq\(dq / [1\-9] :=> dec + <init> \(aq0x\(aq / [0\-9a\-fA\-F] :=> hex + + <bin, oct, dec, hex> \(dq\ex00\(dq { return number; } + + <bin> [01] { add_digit(2, 48); continue loop; } + <oct> [0\-7] { add_digit(8, 48); continue loop; } + <dec> [0\-9] { add_digit(10, 48); continue loop; } + <hex> [0\-9] { add_digit(16, 48); continue loop; } + <hex> [a\-f] { add_digit(16, 87); continue loop; } + <hex> [A\-F] { add_digit(16, 55); continue loop; } + */ + } + } catch (Exception e) { + return \-1; + } + } + + public static void main(String []args) { + Parser parser = new Parser(); + assert parser.parse(\(dq1234567890\e0\(dq) == 1234567890; + assert parser.parse(\(dq0b1101\e0\(dq) == 0b1101; + assert parser.parse(\(dq0x007Fe\e0\(dq) == 0x7fe; + assert parser.parse(\(dq0644\e0\(dq) == 0644; + assert parser.parse(\(dq9999999999\e0\(dq) == \-1; + assert parser.parse(\(dq123??\e0\(dq) == \-1; + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH STORABLE STATE +.sp +With \fB\-\-storable\-state\fP option re2c generates a lexer that can store +its current state, return to the caller, and later resume operations exactly +where it left off.
The default mode of operation in re2c is a \(dqpull\(dq model, +in which the lexer \(dqpulls\(dq more input whenever it needs it. This may be +unacceptable in cases when the input becomes available piece by piece (for +example, if the lexer is invoked by the parser, or if the lexer program +communicates via a socket protocol with some other program that must wait for a +reply from the lexer before it transmits the next message). Storable state +feature is intended exactly for such cases: it allows one to generate lexers that +work in a \(dqpush\(dq model. When the lexer needs more input, it stores its state and +returns to the caller. Later, when more input becomes available, the caller +resumes the lexer exactly where it stopped. There are a few changes necessary +compared to the \(dqpull\(dq model: +.INDENT 0.0 +.IP \(bu 2 +Define \fBYYGETSTATE()\fP and \fBYYSETSTATE(state)\fP primitives. +.IP \(bu 2 +Define \fByych\fP, \fByyaccept\fP (if used) and \fBstate\fP variables as a part of +persistent lexer state. The \fBstate\fP variable should be initialized to \fB\-1\fP\&. +.IP \(bu 2 +\fBYYFILL\fP should return to the outer program instead of trying to supply more +input. Return code should indicate that lexer needs more input. +.IP \(bu 2 +The outer program should recognize situations when lexer needs more input and +respond appropriately. +.IP \(bu 2 +Optionally use \fBgetstate:re2c\fP to generate \fBYYGETSTATE\fP switch detached +from the main lexer. This only works for languages that have \fBgoto\fP (not in +\fB\-\-loop\-switch\fP mode). +.IP \(bu 2 +Use \fBre2c:eof\fP and the \fI\%sentinel with bounds checks\fP method to handle the +end of input. Padding\-based method may not work because it is unclear when to +append padding: the current end of input may not be the ultimate end of input, +and appending padding too early may cut off a partially read greedy lexeme.
+Furthermore, due to high\-level program logic getting more input may depend on +processing the lexeme at the end of buffer (which already is blocked due to +the end\-of\-input condition). +.UNINDENT +.sp +Here is an example of a \(dqpush\(dq model lexer that simulates reading packets from a +socket. The lexer loops until it encounters the end of input and returns to the +calling function. The calling function provides more input by \(dqsending\(dq the next +packet and resumes lexing. This process stops when all the packets have been +sent, or when there is an error. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT \-f + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.Pipe; + +class Lexer { + enum Status { + END, + READY, + WAITING, + BIG_PACKET, + BAD_PACKET + }; + + // Use a small buffer to cover the case when a lexeme doesn\(aqt fit. + // In real world use a larger buffer. + public static final int BUFSIZE = 10; + + public static class State { + Pipe.SourceChannel source; + byte[] yyinput; + int yycursor; + int yymarker; + int yylimit; + int token; + int yystate; + int received; + + public State(Pipe pipe) { + source = pipe.source(); + // Sentinel at \(gayylimit\(ga offset is set to zero, which triggers YYFILL. + yyinput = new byte[BUFSIZE + 1]; + yycursor = yymarker = yylimit = token = BUFSIZE; + yystate = \-1; + received = 0; + } + } + + private static void log(String format, Object... args) { + if (false) { System.out.printf(format + \(dq\en\(dq, args); } + } + + private static Status fill(State st) throws IOException { + // Error: lexeme too long. In real life can reallocate a larger buffer. + if (st.token < 1) { return Status.BIG_PACKET; } + + // Shift buffer contents (discard everything up to the current token). 
+ System.arraycopy(st.yyinput, st.token, st.yyinput, 0, st.yylimit \- st.token); + st.yycursor \-= st.token; + st.yymarker \-= st.token; + st.yylimit \-= st.token; + st.token = 0; + + // Fill free space at the end of buffer with new data from file. + ByteBuffer buffer = ByteBuffer.wrap(st.yyinput, st.yylimit, BUFSIZE \- st.yylimit); + int have = st.source.read(buffer); + if (have != \-1) st.yylimit += have; // \-1 means that pipe is closed + st.yyinput[st.yylimit] = 0; // append sentinel symbol + + return Status.READY; + } + + private static Status lex(State yyrecord) { + byte yych; + loop: while (true) { + yyrecord.token = yyrecord.yycursor; + /*!re2c + re2c:api = record; + re2c:define:YYCTYPE = \(dqbyte\(dq; + re2c:define:YYPEEK = \(dqyyrecord.yyinput[yyrecord.yycursor]\(dq; + re2c:define:YYFILL = \(dqreturn Status.WAITING;\(dq; + re2c:eof = 0; + + packet = [a\-z]+[;]; + + * { return Status.BAD_PACKET; } + $ { return Status.END; } + packet { yyrecord.received += 1; continue loop; } + */ + } + } + + public static void test(String[] packets, Status expect) throws IOException { + // Create a pipe. + Pipe pipe = Pipe.open(); + Pipe.SinkChannel sink = pipe.sink(); + + // Initialize lexer state + Lexer.State st = new Lexer.State(pipe); + + // Main loop. The buffer contains incomplete data which appears packet by + // packet. When the lexer needs more input it saves its internal state and + // returns to the caller which should provide more input and resume lexing. 
+ int send = 0; + Status status; + while (true) { + status = lex(st); + + if (status == Status.END) { + log(\(dqdone: got %d packets\(dq, st.received); + break; + } else if (status == Status.WAITING) { + log(\(dqwaiting...\(dq); + + if (send < packets.length) { + log(\(dqsent packet %d: %s\(dq, send, packets[send]); + ByteBuffer buffer = ByteBuffer.wrap(packets[send].getBytes()); + sink.write(buffer); + send += 1; + } else { + sink.close(); + } + + status = fill(st); + if (status == Status.BIG_PACKET) { + log(\(dqerror: packet too big\(dq); + break; + } + assert status == Status.READY; + } else { + assert status == Status.BAD_PACKET; + log(\(dqerror: ill\-formed packet\(dq); + break; + } + } + + // Check results. + assert status == expect; + if (status == Status.END) { + assert send == st.received; + } + } + + public static void main(String []args) throws IOException { + test(new String[]{}, Status.END); + test(new String[]{\(dqzero;\(dq, \(dqone;\(dq, \(dqtwo;\(dq, \(dqthree;\(dq, \(dqfour;\(dq}, Status.END); + test(new String[]{\(dqzer0;\(dq}, Status.BAD_PACKET); + test(new String[]{\(dqgoooooooooogle;\(dq}, Status.BIG_PACKET); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH REUSABLE BLOCKS +.sp +Reusable blocks are re2c blocks that can be reused any number of times and +combined with other re2c blocks. They are defined with +\fB/*!rules:re2c[:<name>] ... */\fP (the \fB<name>\fP is optional). A rules block +can be used in two contexts: either in a use block, or in a use directive inside +of another block. The code for a rules block is generated at every point of use. +.sp +Use blocks are defined with \fB/*!use:re2c[:<name>] ... */\fP\&. The \fB<name>\fP +is optional; if not specified, the associated rules block is the most recent one +(whether named or unnamed). A use block can add named definitions, +configurations and rules of its own.
+An important use case for use blocks is a lexer that supports multiple input +encodings: the same rules block is reused multiple times with encoding\-specific +configurations (see the example below). +.sp +In\-block use directive \fB!use:<name>;\fP can be used from inside of a re2c +block. It merges the referenced block \fB<name>\fP into the current one. If some +of the merged rules and configurations overlap with the previously defined ones, +conflicts are resolved in the usual way: the earliest rule takes priority, and +the latest configuration overrides preceding ones. An exception is made for the special +rules \fB*\fP, \fB$\fP and (in condition mode) \fB<!>\fP, for which a block\-local +definition overrides any inherited ones. Use directive allows one to combine +different re2c blocks together in one block (see the example below). +.sp +Named blocks and in\-block use directive were added in re2c version 2.2. +Since that version reusable blocks are allowed by default (no special option +is needed). Before version 2.2 reuse mode was enabled with \fB\-r \-\-reusable\fP +option. Before version 1.2 reusable blocks could not be mixed with normal +blocks. +.SS Example of a \fB!use\fP directive +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +// This example shows how to combine reusable re2c blocks: two blocks +// (\(aqcolors\(aq and \(aqfish\(aq) are merged into one. The \(aqsalmon\(aq rule occurs +// in both blocks; the \(aqfish\(aq block takes priority because it is used +// earlier. Default rule * occurs in all three blocks; the local (not +// inherited) definition takes priority.
+ +/*!rules:re2c:colors + * { throw new IllegalArgumentException(\(dqah\(dq); } + \(dqred\(dq | \(dqsalmon\(dq | \(dqmagenta\(dq { return Ans.COLOR; } +*/ + +/*!rules:re2c:fish + * { throw new IllegalArgumentException(\(dqoh\(dq); } + \(dqhaddock\(dq | \(dqsalmon\(dq | \(dqeel\(dq { return Ans.FISH; } +*/ + +class Main { + enum Ans {COLOR, FISH, DUNNO}; + + static Ans lex(String yyinput) { // no\-throw, as \(aq*\(aq rules are overridden + int yycursor = 0; + int yymarker = 0; + + /*!re2c + re2c:yyfill:enable = 0; + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + + !use:fish; + !use:colors; + * { return Ans.DUNNO; } // overrides inherited \(aq*\(aq rules + */ + } + + public static void main(String []args) { + assert lex(\(dqsalmon\(dq) == Ans.FISH; + assert lex(\(dqwhat?\(dq) == Ans.DUNNO; + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Example of a \fB/*!use:re2c ... */\fP block +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT \-\-input\-encoding utf8 + +// This example supports multiple input encodings: UTF\-8 and UTF\-32. +// Both lexers are generated from the same rules block, and the use +// blocks add only encoding\-specific configurations. 
+ +/*!rules:re2c + re2c:yyfill:enable = 0; + re2c:define:YYPEEK = \(dqyyinput[yycursor]\(dq; + re2c:indent:top = 1; + + \(dq∀x ∃y\(dq { return true; } + * { return false; } +*/ + +class Main { + static boolean lex_utf8(int[] yyinput) { + int yycursor = 0; + int yymarker = 0; + /*!use:re2c + re2c:define:YYCTYPE = \(dqint\(dq; // should be \(gabyte\(ga, but it\(aqs signed in Java + re2c:encoding:utf8 = 1; + */ + } + + static boolean lex_utf32(int[] yyinput) { + int yycursor = 0; + int yymarker = 0; + /*!use:re2c + re2c:define:YYCTYPE = \(dqint\(dq; + re2c:encoding:utf32 = 1; + */ + } + + public static void main(String []args) { + // we have to use \(gaint\(ga, because \(gabyte\(ga in Java cannot represent values greater than 127 + int[] s_utf8 = new int[]{0xe2, 0x88, 0x80, 0x78, 0x20, 0xe2, 0x88, 0x83, 0x79}; + assert lex_utf8(s_utf8); + + int[] s_utf32 = new int[]{0x2200, 0x78, 0x20, 0x2203, 0x79}; + assert lex_utf32(s_utf32); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SUBMATCH EXTRACTION +.sp +re2c has two options for submatch extraction. +.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration. Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string. +\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars:re2c\fP and \fBmvars:re2c\fP directives.
+If there is more than one way tags can be matched against the input, +ambiguity is resolved using leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration. There are two +flavours for different disambiguation policies: \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy. In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted. +The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&. re2c provides a directive \fBmaxnmatch:re2c\fP +that defines \fBYYMAXNMATCH\fP, a constant equal to the maximum value of +\fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration. The only difference with \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2c generates variables \fByytl<k>\fP and +\fByytr<k>\fP for \fIk\fP\-th capturing group (the user should declare these with +\fBsvars:re2c\fP directive). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars = 1\fP for POSIX longest\-match policy.
+.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there\(aqre no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags:re2c\fP or \fBmtags:re2c\fP directives. +If the lexer state is stored, tag variables should be part of it. They also +need to be updated by \fBYYFILL\fP\&. +.sp +S\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +save input position to an s\-tag: \fBt = YYCURSOR\fP with C pointer API or a +user\-defined operation \fBYYSTAGP(t)\fP with generic API +.IP \(bu 2 +save default value to an s\-tag: \fBt = NULL\fP with C pointer API or a +user\-defined operation \fBYYSTAGN(t)\fP with generic API +.IP \(bu 2 +copy one s\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +M\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +append input position to an m\-tag: a user\-defined operation \fBYYMTAGP(t)\fP +with both default and generic API +.IP \(bu 2 +append default value to an m\-tag: a user\-defined operation \fBYYMTAGN(t)\fP +with both default and generic API +.IP \(bu 2 +copy one m\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +S\-tags can be implemented as scalar values (pointers or offsets). M\-tags need a +more complex representation, as they need to store a sequence of tag values. 
The +most naive and inefficient representation of an m\-tag is a list (array, vector) +of tag values; a more efficient representation is to store all m\-tags in a +prefix\-tree represented as an array of nodes \fB(v, p)\fP, where \fBv\fP is a tag value +and \fBp\fP is a pointer to the parent node. +.sp +Here is a simple example of using s\-tags to parse semantic versions consisting +of three numeric components: major, minor, patch (the latter is optional). +See below for a more complex example that uses \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +import java.util.Optional; + +class Main { + static class SemVer { + int major; + int minor; + int patch; + + public SemVer(int m, int n, int k) { + major = m; + minor = n; + patch = k; + } + + public boolean equals(SemVer v) { + return major == v.major && minor == v.minor && patch == v.patch; + } + }; + + static Optional<SemVer> parse(String yyinput) { + int yycursor = 0; + int yymarker = 0; + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqint @@;\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(dqint @@ = \-1;\(dq; */ + + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0\-9]+; + + @t1 num @t2 \(dq.\(dq @t3 num @t4 (\(dq.\(dq @t5 num)? [\ex00] { + int major = Integer.valueOf(yyinput.substring(t1, t2)); + int minor = Integer.valueOf(yyinput.substring(t3, t4)); + int patch = (t5 == \-1) ?
0 : Integer.valueOf(yyinput.substring(t5, yycursor \- 1)); + return Optional.of(new SemVer(major, minor, patch)); + } + * { return Optional.empty(); } + */ + } + + public static void main(String []args) { + assert parse(\(dq23.34\e0\(dq).get().equals(new SemVer(23, 34, 0)); + assert parse(\(dq1.2.99999\e0\(dq).get().equals(new SemVer(1, 2, 99999)); + assert !parse(\(dq1.a\e0\(dq).isPresent(); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is a more complex example of using s\-tags with \fBYYFILL\fP to parse a file +with newline\-separated semantic versions. Tag variables are part of the lexer +state, and they are adjusted in \fBYYFILL\fP like other input positions. +Note that it is necessary for s\-tags because their values are invalidated after +shifting buffer contents. It may not be necessary in a custom implementation +where tag variables store offsets relative to the start of the input string +rather than the buffer, which may be the case with m\-tags. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +import java.io.*; +import java.nio.file.*; +import java.util.*; + +class Lexer { + static class SemVer { + int major; + int minor; + int patch; + + public SemVer(int m, int n, int k) { + major = m; + minor = n; + patch = k; + } + + public boolean equals(SemVer v) { + return major == v.major && minor == v.minor && patch == v.patch; + } + }; + + public static final int BUFSIZE = 4096; + + private BufferedInputStream stream; + private byte[] yyinput; + private int yycursor; + private int yymarker; + private int yylimit; + private int token; + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(dqprivate int @@;\en\(dq; */ + private boolean eof; + + public Lexer(File file) throws FileNotFoundException { + stream = new BufferedInputStream(new FileInputStream(file)); + // Sentinel at \(gayylimit\(ga offset is set to zero, which triggers YYFILL. 
+ yyinput = new byte[BUFSIZE + 1]; + yycursor = yymarker = yylimit = token = BUFSIZE; + /*!stags:re2c format = \(dq@@ = \-1;\en\(dq; */ + eof = false; + } + + private int fill() throws IOException { + if (eof) { return \-1; } // unexpected EOF + + // Error: lexeme too long. In real life can reallocate a larger buffer. + if (token < 1) { return \-2; } + + // Shift buffer contents (discard everything up to the current token). + System.arraycopy(yyinput, token, yyinput, 0, yylimit \- token); + yycursor \-= token; + yymarker \-= token; + yylimit \-= token; + /*!stags:re2c format = \(dqif (@@ != \-1) {@@ \-= token;}\en\(dq; */ + token = 0; + + // Fill free space at the end of buffer with new data from file. + yylimit += stream.read(yyinput, yylimit, BUFSIZE \- yylimit); + yyinput[yylimit] = 0; // append sentinel symbol + + // If read less than expected, this is the end of input. + eof = yylimit < BUFSIZE; + + return 0; + } + + private int readInt(int tag1, int tag2) { + int n = 0; + for (int i = tag1; i < tag2; ++i) { n = n * 10 + (yyinput[i] \- 48); } + return n; + } + + public Optional> lex() throws IOException { + ArrayList vers = new ArrayList(); + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqint @@;\(dq; */ + + loop: while (true) { + token = yycursor; + /*!re2c + re2c:define:YYCTYPE = \(dqbyte\(dq; + re2c:define:YYPEEK = \(dqyyinput[yycursor]\(dq; + re2c:define:YYFILL = \(dqfill() == 0\(dq; + re2c:eof = 0; + re2c:tags = 1; + + num = [0\-9]+; + + @t1 num @t2 \(dq.\(dq @t3 num @t4 (\(dq.\(dq @t5 num)? [\en] { + int major = readInt(t1, t2); + int minor = readInt(t3, t4); + int patch = (t5 == \-1) ? 
0 : readInt(t5, yycursor \- 1); + vers.add(new SemVer(major, minor, patch)); + continue loop; + } + $ { return Optional.of(vers); } + * { return Optional.empty(); } + */ + } + } + + public static void main(String []args) throws FileNotFoundException, IOException { + String fname = \(dqinput\(dq; + String content = \(dq1.22.333\en\(dq.repeat(Lexer.BUFSIZE); + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + Files.writeString(Paths.get(fname), content); + + // Prepare lexer state: all offsets are at the end of buffer. + File file = new File(\(dq.\(dq, fname); + Lexer lexer = new Lexer(file); + + // Run the lexer. + Optional> vers = lexer.lex(); + + // Check results. + assert vers.isPresent() && vers.get().size() == BUFSIZE; + SemVer v = new SemVer(1, 22, 333); + for (int i = 0; i < BUFSIZE; ++i) { + assert vers.get().get(i).equals(v); + } + + // Cleanup: remove input file. + file.delete(); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using capturing groups to parse semantic versions. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +import java.util.Optional; + +class Main { + static class SemVer { + int major; + int minor; + int patch; + + public SemVer(int m, int n, int k) { + major = m; + minor = n; + patch = k; + } + + public boolean equals(SemVer v) { + return major == v.major && minor == v.minor && patch == v.patch; + } + }; + + static Optional parse(String yyinput) { + int yycursor = 0; + int yymarker = 0; + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqint @@;\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). 
+ /*!stags:re2c format = \(dqint @@ = \-1;\(dq; */ + + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + re2c:yyfill:enable = 0; + re2c:captvars = 1; + + num = [0\-9]+; + + (num) \(dq.\(dq (num) (\(dq.\(dq num)? [\ex00] { + int major = Integer.valueOf(yyinput.substring(yytl1, yytr1)); + int minor = Integer.valueOf(yyinput.substring(yytl2, yytr2)); + int patch = (yytl3 == \-1) ? 0 + : Integer.valueOf(yyinput.substring(yytl3 + 1, yytr3)); + return Optional.of(new SemVer(major, minor, patch)); + } + * { return Optional.empty(); } + */ + } + + public static void main(String []args) { + assert parse(\(dq23.34\e0\(dq).get().equals(new SemVer(23, 34, 0)); + assert parse(\(dq1.2.99999\e0\(dq).get().equals(new SemVer(1, 2, 99999)); + assert !parse(\(dq1.a\e0\(dq).isPresent(); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using m\-tags to parse a version with a variable number of +components. Tag variables are stored in a trie. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +import java.util.*; + +class Main { + static Optional parse(String yyinput) { + int yycursor = 0; + int yymarker = 0; + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqint @@;\(dq; */ + /*!mvars:re2c format = \(dqList @@;\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). 
+ /*!stags:re2c format = \(dqint @@ = \-1;\(dq; */ + /*!mtags:re2c format = \(dqList @@ = new ArrayList<>();\(dq; */ + + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + re2c:define:YYMTAGP = \(dq@@.add(yycursor);\(dq; + re2c:define:YYMTAGN = \(dq\(dq; // do nothing + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0\-9]+; + + @t1 num @t2 (\(dq.\(dq #t3 num #t4)* [\ex00] { + int[] vers = new int[t3.size() + 1]; + vers[0] = Integer.valueOf(yyinput.substring(t1, t2)); + for (int i = 0; i < t3.size(); ++i) { + vers[i + 1] = Integer.valueOf(yyinput.substring(t3.get(i), t4.get(i))); + } + return Optional.of(vers); + } + * { return Optional.empty(); } + */ + } + + public static void main(String []args) { + assert Arrays.equals(parse(\(dq1\e0\(dq).get(), new int[]{1}); + assert Arrays.equals(parse(\(dq1.2.3.4.5.6.7\e0\(dq).get(), new int[]{1, 2, 3, 4, 5, 6, 7}); + assert !parse(\(dq1.2.\e0\(dq).isPresent(); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH ENCODING SUPPORT +.sp +It is necessary to understand the difference between \fBcode points\fP and +\fBcode units\fP\&. A code point is a numeric identifier of a symbol. A code unit is +the smallest unit of storage in the encoded text. A single code point may be +represented with one or more code units. In a fixed\-length encoding all code +points are represented with the same number of code units. In a variable\-length +encoding code points may be represented with a different number of code units. +Note that the \(dqany\(dq rule \fB[^]\fP matches any code point, but not necessarily +any code unit (the only way to match any code unit regardless of the encoding +is the default rule \fB*\fP). +The generated lexer works with a stream of code units: \fByych\fP stores a code +unit, and \fBYYCTYPE\fP is the code unit type. Regular expressions, on the other +hand, are specified in terms of code points. 
When re2c compiles regular +expressions to automata it translates code points to code units. This is +generally not a simple mapping: in variable\-length encodings a single code point +range may get translated to a complex code unit graph. +The following encodings are supported: +.INDENT 0.0 +.IP \(bu 2 +\fBASCII\fP (enabled by default). It is a fixed\-length encoding with code space +\fB[0\-255]\fP and 1\-byte code points and code units. +.IP \(bu 2 +\fBEBCDIC\fP (enabled with \fB\-\-ebcdic\fP or \fBre2c:encoding:ebcdic\fP). It is a +fixed\-length encoding with code space \fB[0\-255]\fP and 1\-byte code points and +code units. +.IP \(bu 2 +\fBUCS2\fP (enabled with \fB\-\-ucs2\fP or \fBre2c:encoding:ucs2\fP). It is a +fixed\-length encoding with code space \fB[0\-0xFFFF]\fP and 2\-byte code points +and code units. +.IP \(bu 2 +\fBUTF8\fP (enabled with \fB\-\-utf8\fP or \fBre2c:encoding:utf8\fP). It is a +variable\-length Unicode encoding. Code unit size is 1 byte. Code points are +represented with 1 \-\- 4 code units. +.IP \(bu 2 +\fBUTF16\fP (enabled with \fB\-\-utf16\fP or \fBre2c:encoding:utf16\fP). It is a +variable\-length Unicode encoding. Code unit size is 2 bytes. Code points are +represented with 1 \-\- 2 code units. +.IP \(bu 2 +\fBUTF32\fP (enabled with \fB\-\-utf32\fP or \fBre2c:encoding:utf32\fP). It is a +fixed\-length Unicode encoding with code space \fB[0\-0x10FFFF]\fP and 4\-byte code +points and code units. +.UNINDENT +.sp +Include file \fBinclude/unicode_categories.re\fP provides re2c definitions for the +standard Unicode categories. +.sp +Option \fB\-\-input\-encoding\fP specifies source file encoding, which can be used to +enable Unicode literals in regular expressions. For example +\fB\-\-input\-encoding utf8\fP tells re2c that the source file is in UTF8 (it differs +from \fB\-\-utf8\fP which sets input text encoding). 
Option \fB\-\-encoding\-policy\fP +specifies the way re2c handles Unicode surrogates (code points in range +\fB[0xD800\-0xDFFF]\fP). +.sp +Below is an example of a lexer for UTF8 encoded Unicode identifiers. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT \-\-utf8 \-s + +/*!include:re2c \(dqunicode_categories.re\(dq */ + +class Main { + static boolean lex(String yyinput) { + int yycursor = 0; + int yymarker = 0; + + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + re2c:yyfill:enable = 0; + + // Simplified \(dqUnicode Identifier and Pattern Syntax\(dq + // (see https://unicode.org/reports/tr31) + id_start = L | Nl | [$_]; + id_continue = id_start | Mn | Mc | Nd | Pc | [\eu200D\eu05F3]; + identifier = id_start+; + // It should be \(gaid_start id_continue*\(ga, but that causes \(gaerror: code too large\(ga + + identifier { return true; } + * { return false; } + */ + } + + public static void main(String []args) { + assert lex(\(dq_Ыдентификатор\e0\(dq); + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH INCLUDE FILES +.sp +re2c allows one to include other files using directive \fB/*!include:re2c FILE */\fP +or \fB!include FILE ;\fP, where \fBFILE\fP is a path to the file to be included. +The first form should be used outside of re2c blocks, and the second form allows +one to include a file in the middle of a re2c block. re2c looks for included +files in the directory of the including file and in include locations, which +can be specified with \fB\-I\fP option. +Include directives in re2c work in the same way as C/C++ \fB#include\fP: the contents +of \fBFILE\fP are copy\-pasted verbatim in place of the directive. Include files +may have further includes of their own. Use \fB\-\-depfile\fP option to track build +dependencies of the output file on include files. +re2c provides some predefined include files that can be found in the +\fBinclude/\fP subdirectory of the project. 
These files contain definitions that +can be useful to other projects (such as Unicode categories) and form something +like a standard library for re2c. +Below is an example of using include directive. +.SS Include file 1 (definitions.java) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +/*!re2c + number = [1\-9][0\-9]*; +*/ + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Include file 2 (extra_rules.re.inc) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// floating\-point numbers +frac = [0\-9]* \(dq.\(dq [0\-9]+ | [0\-9]+ \(dq.\(dq; +exp = \(aqe\(aq [+\-]? [0\-9]+; +float = frac exp? | [0\-9]+ exp; + +float { return Num.FLOAT; } + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT + +/*!include:re2c \(dqdefinitions.java\(dq */ + +class Main { + enum Num {INT, FLOAT, NAN}; + + static Num lex(String yyinput) { + int yycursor = 0; + int yymarker = 0; + + /*!re2c + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyinput.charAt(yycursor)\(dq; + re2c:yyfill:enable = 0; + + * { return Num.NAN; } + number { return Num.INT; } + !include \(dqextra_rules.re.inc\(dq; + */ + } + + public static void main(String []args) { + assert lex(\(dq123\e0\(dq) == Num.INT; + assert lex(\(dq123.4567\e0\(dq) == Num.FLOAT; + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH HEADER FILES +.sp +re2c allows one to generate header file from the input \fB\&.re\fP file using option +\fB\-t\fP, \fB\-\-type\-header\fP or configuration \fBre2c:flags:type\-header\fP and +directives \fB/*!header:re2c:on*/\fP and \fB/*!header:re2c:off*/\fP\&. The first directive +marks the beginning of header file, and the second directive marks the end of +it. Everything between these directives is processed by re2c, and the generated +code is written to the file specified by the \fB\-t \-\-type\-header\fP option (or +\fBstdout\fP if this option was not used). 
Autogenerated header file may be needed +in cases when re2c is used to generate definitions of constants, variables and +structs that must be visible from other translation units. +.sp +Here is an example of generating a header file that contains the definition of the +lexer state with tag variables (the number of variables depends on the regular +grammar and is unknown to the programmer). +.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2java $INPUT \-o $OUTPUT \-\-header lexer/state.java + +package headers; + +import headers.lexer.State; + +/*!header:re2c:on*/ +package headers.lexer; + +public class State { + public String yyinput; + public int yycursor; + /*!stags:re2c format = \(dqpublic int @@;\en\(dq; */ + + public State(String str) { + yyinput = str; + yycursor = 0; + /*!stags:re2c format = \(dq@@ = 0;\en\(dq; */ + } +}; +/*!header:re2c:off*/ + +class Main { + static int lex(String str) { + State yyrecord = new State(str); + int t; + /*!re2c + re2c:api = record; + re2c:tags = 1; + re2c:yyfill:enable = 0; + re2c:define:YYCTYPE = \(dqchar\(dq; + re2c:define:YYPEEK = \(dqyyrecord.yyinput.charAt(yyrecord.yycursor)\(dq; + re2c:header = \(dqlexer/state.java\(dq; + + [a]* @t [b]* { return t; } + */ + } + + public static void main(String []args) { + assert lex(\(dqab\e0\(dq) == 1; + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Header file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// Generated by re2c + +package headers.lexer; + +public class State { + public String yyinput; + public int yycursor; + public int yyt1; + + + public State(String str) { + yyinput = str; + yycursor = 0; + yyt1 = 0; + + } +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SKELETON PROGRAMS +.sp +With the \fB\-S, \-\-skeleton\fP option, re2c ignores all non\-re2c code and generates +a self\-contained C program that can be further compiled and executed. The +program consists of lexer code and input data. 
For each constructed DFA (block +or condition) re2c generates a standalone lexer and two files: an \fB\&.input\fP +file with strings derived from the DFA and a \fB\&.keys\fP file with expected match +results. The program runs each lexer on the corresponding \fB\&.input\fP file and +compares results with the expectations. +Skeleton programs are very useful for a number of reasons: +.INDENT 0.0 +.IP \(bu 2 +They can check correctness of various re2c optimizations (the data is +generated early in the process, before any DFA transformations have taken +place). +.IP \(bu 2 +Generating a set of input data with good coverage may be useful for both +testing and benchmarking. +.IP \(bu 2 +Generating self\-contained executable programs allows one to get minimized test +cases (the original code may be large or have a lot of dependencies). +.UNINDENT +.sp +The difficulty with generating input data is that for all but the most trivial +cases the number of possible input strings is too large (even if the string +length is limited). re2c solves this difficulty by generating sufficiently +many strings to cover almost all DFA transitions. It uses the following +algorithm. First, it constructs a skeleton of the DFA. For encodings with 1\-byte +code unit size (such as ASCII, UTF\-8 and EBCDIC) skeleton is just an exact copy +of the original DFA. For encodings with multibyte code units skeleton is a copy +of DFA with certain transitions omitted: namely, re2c takes at most 256 code +units for each disjoint continuous range that corresponds to a DFA transition. +The chosen values are evenly distributed and include range bounds. Instead of +trying to cover all possible paths in the skeleton (which is infeasible) re2c +generates sufficiently many paths to cover all skeleton transitions, and thus +trigger the corresponding conditional jumps in the lexer. 
+The algorithm implementation is limited by ~1Gb of transitions and consumes +constant amount of memory (re2c writes data to file as soon as it is generated). +.SH VISUALIZATION AND DEBUG +.sp +With the \fB\-D, \-\-emit\-dot\fP option, re2c does not generate code. Instead, +it dumps the generated DFA in DOT format. +One can convert this dump to an image of the DFA using Graphviz or another library. +Note that this option shows the final DFA after it has gone through a number of +optimizations and transformations. Earlier stages can be dumped with various debug +options, such as \fB\-\-dump\-nfa\fP, \fB\-\-dump\-dfa\-raw\fP etc. (see the full list of options). +.SH SEE ALSO +.sp +You can find more information about re2c at the official website: \fI\%http://re2c.org\fP\&. +Similar programs are flex(1), lex(1), quex(\fI\%http://quex.sourceforge.net\fP). +.SH AUTHORS +.sp +re2c was originally written by Peter Bumbulis (\fI\%peter@csg.uwaterloo.ca\fP) in 1993. +Marcus Boerger and Dan Nuffer spent several years to turn the original idea into +a production ready code generator. Since then it has been maintained and +developed by multiple volunteers, most notably, +Brian Young (\fI\%bayoung@acm.org\fP), +\fI\%Marcus Boerger\fP, +Dan Nuffer (\fI\%nuffer@users.sourceforge.net\fP), +\fI\%Ulya Trofimovich\fP (\fI\%skvadrik@gmail.com\fP), +\fI\%Serghei Iakovlev\fP, +\fI\%Sergei Trofimovich\fP, +\fI\%Petr Skocik\fP, +\fI\%ligfx\fP +and \fI\%raekye\fP\&. +.\" Generated by docutils manpage writer. +. diff --git a/bootstrap/doc/re2js.1 b/bootstrap/doc/re2js.1 new file mode 100644 index 000000000..f2f15edd2 --- /dev/null +++ b/bootstrap/doc/re2js.1 @@ -0,0 +1,3327 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. 
+.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "RE2C" 1 "" "" +.SH NAME +re2c \- generate fast lexical analyzers for C/C++, Go and Rust +.SH SYNOPSIS +.sp +Note: This manual is for JavaScript, but it refers to re2c as the general program. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +re2c [ OPTIONS ] [ WARNINGS ] INPUT +re2go [ OPTIONS ] [ WARNINGS ] INPUT +re2rust [ OPTIONS ] [ WARNINGS ] INPUT +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Input can be either a file or \fB\-\fP for stdin. +.SH INTRODUCTION +.sp +re2c works as a preprocessor. It reads the input file (which is usually a +program in the target language, but can be anything) and looks for blocks of +code enclosed in special\-form comments. The text outside of these blocks is +copied verbatim into the output file. The contents of the blocks are processed +by re2c. It translates them to code in the target language and outputs the +generated code in place of the block. 
+.sp +Here is an example of a small program that checks if a given string contains a +decimal number: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +function lex(yyinput) { + let yycursor = 0; + /*!re2c + re2c:yyfill:enable = 0; + + number = [1\-9][0\-9]*; + + number { return true; } + * { return false; } + */ +} + +if (!lex(\(dq1234\e0\(dq)) { + throw \(dqerror!\(dq +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +In the output everything between \fB/*!re2c\fP and \fB*/\fP has been replaced with +the generated code: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// Generated by re2js +// re2js $INPUT \-o $OUTPUT + +function lex(yyinput) { + let yycursor = 0; + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyinput.charCodeAt(yycursor) + yycursor += 1; + switch (yych) { + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yystate = 2 + continue yyl + default: + yystate = 1 + continue yyl + } + case 1: + { return false; } + case 2: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 2 + continue yyl + default: + yystate = 3 + continue yyl + } + case 3: + { return true; } + default: + throw \(dqinternal lexer error\(dq + } + } +} + +} + +if (!lex(\(dq1234\e0\(dq)) { + throw \(dqerror!\(dq +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SYNTAX +.sp +A re2c program consists of a sequence of \fIblocks\fP intermixed with code in the +target language. There are three main kinds of blocks: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A \fIglobal block\fP contains definitions, configurations, directives and rules. 
+re2c compiles regular expressions associated with each rule into a +deterministic finite automaton, encodes it in the form of conditional jumps +in the target language and replaces the block with the generated code. Names +and configurations defined in a global block are added to the global scope +and become visible to subsequent blocks. At the start of the program the +global scope is initialized with command\-line \fI\%options\fP\&. +The \fB:\fP part is optional: if specified, the name can be used to +refer to the block in another part of the program. +.TP +.B \fB/*!local:re2c[:] ... */\fP +A \fIlocal block\fP is like a global block, but the names and configurations in +it have local scope (they do not affect other blocks). +.TP +.B \fB/*!rules:re2c[:] ... */\fP +A \fIrules block\fP is like a local block, but it does not generate any code and +is meant to be reused in other blocks. This is a way of sharing code +(more details in the \fI\%reusable blocks\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.sp +There are also many auxiliary blocks; see section \fI\%blocks and directives\fP for a +full list of them. A block may contain the following kinds of statements: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB = ;\fP +A \fIdefinition\fP binds a name to a regular expression. Names may contain +alphanumeric characters and underscore. The \fI\%regular expressions\fP section +gives an overview of re2c syntax for regular expressions. Once defined, the +name can be used in other regular expressions and in rules. Recursion in +named definitions is not allowed, and each name should be defined before it +is used. A block inherits named definitions from the global scope. +Redefining a name that exists in the current scope is an error. +.TP +.B \fB = ;\fP +A \fIconfiguration\fP allows one to change re2c behavior and customize the +generated code. For a full list of configurations supported by re2c see the +\fI\%configurations\fP section. 
Depending on a particular configuration, the +value can be a keyword, a nonnegative integer number or a one\-line string +which should be enclosed in double or single quotes unless it consists of +alphanumeric characters. A block inherits configurations from the global +scope and may redefine them or add new ones. Configurations defined inside +of a block affect the whole block, even if they appear at the end of it. +.TP +.B \fB { }\fP +A \fIrule\fP binds a regular expression to a semantic action (a block of code in +the target language). If the regular expression matches, the associated +semantic action is executed. If multiple rules match, the longest match +takes precedence. If multiple rules match the same string, the earliest one +takes precedence. There are two special rules: the default rule \fB*\fP and +the end of input rule \fB$\fP\&. The default rule should always be defined, it +has the lowest priority regardless of its place in the block, and it matches +any code unit (not necessarily a valid character, see the +\fI\%encoding support\fP section). The end of input rule should be defined if the +corresponding method for \fI\%handling the end of input\fP is used. If +\fI\%start conditions\fP are used, rules have more complex syntax. +.TP +.B \fB!;\fP +A \fIdirective\fP is one of the special predefined statements. Each directive +has a unique purpose. For example, the \fB!use\fP directive merges a rules +block into the current one (see the \fI\%reusable blocks\fP section), and the +\fB!include\fP directive allows one to include an outer file (see the +\fI\%include files\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.SH PROGRAM INTERFACE (API) +.sp +The generated code interfaces with the outer program with the help of +\fIprimitives\fP, collectively referred to as the \fIAPI\fP\&. 
+Which primitives should be defined for a particular program depends on multiple +factors, including the complexity of regular expressions, input representation, +buffering and the use of various features. All the necessary primitives should +be defined by the user in the form of macros, functions, variables or any other +suitable form that makes the generated code syntactically and semantically +correct. re2c does not (and cannot) check the definitions, so if anything is +missing or defined incorrectly, the generated program may have compile\-time or +run\-time errors. +This manual provides examples of API definitions in the most common cases. +.sp +re2js has three API flavors that define the core set of primitives used by a +program: +.INDENT 0.0 +.TP +.B \fBSimple API\fP +This is the default API for the JavaScript backend. It consists of the +following primitives: \fBYYINPUT\fP (which should be defined as a sequence of +code units, e.g. a string) and \fBYYCURSOR\fP, \fBYYMARKER\fP, \fBYYCTXMARKER\fP, +\fBYYLIMIT\fP (which should be defined as indices in \fBYYINPUT\fP). +.nf + +.fi +.sp +.TP +.B \fBRecord API\fP +Record API is useful in cases when lexer state must be stored in an object. +It is enabled with \fB\-\-api record\fP option or \fBre2c:api = record\fP +configuration. This API consists of a variable \fByyrecord\fP (the +name can be overridden with \fBre2c:variable:yyrecord\fP) that should be +defined as an object with properties \fByyinput\fP, \fByycursor\fP, +\fByymarker\fP, \fByyctxmarker\fP, \fByylimit\fP (only the fields used by the +generated code need to be defined, and their names can be configured). +.nf + +.fi +.sp +.TP +.B \fBGeneric API\fP +This is the most flexible API. It is enabled with \fB\-\-api generic\fP option +or \fBre2c:api = generic\fP configuration. 
+It contains primitives for generic operations: +\fBYYPEEK\fP, +\fBYYSKIP\fP, +\fBYYBACKUP\fP, +\fBYYBACKUPCTX\fP, +\fBYYSTAGP\fP, +\fBYYSTAGN\fP, +\fBYYMTAGP\fP, +\fBYYMTAGN\fP, +\fBYYRESTORE\fP, +\fBYYRESTORECTX\fP, +\fBYYRESTORETAG\fP, +\fBYYSHIFT\fP, +\fBYYSHIFTSTAG\fP, +\fBYYSHIFTMTAG\fP, +\fBYYLESSTHAN\fP\&. +.UNINDENT +.sp +Here is a full list of API primitives that may be used by the generated code in +order to interface with the outer program. +.INDENT 0.0 +.TP +.B \fBYYCTYPE\fP +The type of the input characters (code units). +For ASCII, EBCDIC and UTF\-8 encodings it should be 1\-byte unsigned integer. +For UTF\-16 or UCS\-2 it should be 2\-byte unsigned integer. For UTF\-32 it +should be 4\-byte unsigned integer. +.TP +.B \fBYYCURSOR\fP +A pointer\-like l\-value that stores the current input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYCURSOR\fP should point to the +first input character. It is advanced by the generated code. +When a rule matches, \fBYYCURSOR\fP points to the position after the +last matched character. It is used only in C pointer API. +.TP +.B \fBYYLIMIT\fP +A pointer\-like r\-value that stores the end of input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYLIMIT\fP should point to the +position after the last available input character. It is not changed by the +generated code. The lexer compares \fBYYCURSOR\fP to \fBYYLIMIT\fP +in order to determine if there are enough input characters left. +\fBYYLIMIT\fP is used only in C pointer API. +.TP +.B \fBYYMARKER\fP +A pointer\-like l\-value (usually a pointer of type \fBYYCTYPE*\fP) +that stores the position of the latest matched rule. It is used to +restore the \fBYYCURSOR\fP position if the longer match fails and +the lexer needs to rollback. Initialization is not +needed. \fBYYMARKER\fP is used only in C pointer API. 
+.TP +.B \fBYYCTXMARKER\fP +A pointer\-like l\-value that stores the position of the trailing context +(usually a pointer of type \fBYYCTYPE*\fP). No initialization is needed. +It is used only in C pointer API, and only with the lookahead operator +\fB/\fP\&. +.TP +.B \fBYYFILL\fP +A generic API primitive with one argument \fBlen\fP\&. +\fBYYFILL\fP should provide at least \fBlen\fP more input characters or fail. +If \fBre2c:eof\fP is used, then \fBlen\fP is always \fB1\fP and \fBYYFILL\fP should +always return to the calling function; zero return value indicates success. +If \fBre2c:eof\fP is not used, then \fBYYFILL\fP return value is ignored and it +should not return on failure. The maximum value of \fBlen\fP is \fBYYMAXFILL\fP\&. +The definition of \fBYYFILL\fP can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYFILL:naked\fP). +.TP +.B \fBYYMAXFILL\fP +An integral constant equal to the maximum value of the argument to +\fBYYFILL\fP\&. It can be generated with \fB/*!max:re2c*/\fP directive. +.TP +.B \fBYYLESSTHAN\fP +A generic API primitive with one argument \fBlen\fP\&. +It should be defined as an r\-value of boolean type that equals \fBtrue\fP if +and only if there are less than \fBlen\fP input characters left. +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYPEEK\fP +A generic API primitive with no arguments. +It should be defined as an r\-value of type \fBYYCTYPE\fP that is equal to the +character at the current input position. The definition can be either +function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP). +.TP +.B \fBYYSKIP\fP +A generic API primitive with no arguments. +\fBYYSKIP\fP should advance the current input position by one +character. The definition can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP). 
+.TP +.B \fBYYBACKUP\fP +A generic API primitive with no arguments. +\fBYYBACKUP\fP should save the current input position, which is +later restored with \fBYYRESTORE\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORE\fP +A generic API primitive with no arguments. +\fBYYRESTORE\fP should restore the current input position to the +value saved by \fBYYBACKUP\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUPCTX\fP +A generic API primitive with zero arguments. +\fBYYBACKUPCTX\fP should save the current input position as the +position of the trailing context, which is later restored by +\fBYYRESTORECTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORECTX\fP +A generic API primitive with no arguments. +\fBYYRESTORECTX\fP should restore the trailing context position +saved with \fBYYBACKUPCTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORETAG\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYRESTORETAG\fP should restore the trailing context position +to the value of \fBtag\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSTAGP\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGP\fP should set \fBtag\fP to the current input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). 
+.TP +.B \fBYYSTAGN\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGN\fP should set \fBtag\fP to a value that represents non\-existent +input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGP\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGP\fP should append the current position to the submatch history of +\fBtag\fP (see the submatch extraction section for details). +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGN\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGN\fP should append a value that represents non\-existent input +position to the submatch history of \fBtag\fP (see the submatch +extraction section for details). +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFT\fP +A generic API primitive with one argument \fBshift\fP\&. +\fBYYSHIFT\fP should shift the current input position by +\fBshift\fP characters (the shift value may be negative). The definition +can be either function\-like or free\-form depending on the API style +(see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTSTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTSTAG\fP should shift \fBtag\fP by \fBshift\fP characters +(the shift value may be negative). +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTMTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTMTAG\fP should shift the latest value in the history +of \fBtag\fP by \fBshift\fP characters (the shift value may be negative).
+The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMAXNMATCH\fP +An integral constant equal to the maximal number of POSIX capturing groups +in a rule. It is generated with \fB/*!maxnmatch:re2c*/\fP directive. +.TP +.B \fBYYCONDTYPE\fP +The type of the condition enum. +It should be generated either with the \fB/*!types:re2c*/\fP +directive or the \fB\-t\fP \fB\-\-type\-header\fP option. +.TP +.B \fBYYGETCONDITION\fP +An API primitive with zero arguments. +It should be defined as an r\-value of type \fBYYCONDTYPE\fP that is equal to +the current condition identifier. The definition can be either function\-like +or free\-form depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYGETCONDITION:naked\fP). +.TP +.B \fBYYSETCONDITION\fP +An API primitive with one argument \fBcond\fP\&. +The meaning of \fBYYSETCONDITION\fP is to set the current condition +identifier to \fBcond\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETCONDITION@cond\fP). +.TP +.B \fBYYGETSTATE\fP +An API primitive with zero arguments. +It should be defined as an r\-value of integer type that is equal to the +current lexer state. Should be initialized to \fB\-1\fP\&. The definition can be +either function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP and \fBre2c:define:YYGETSTATE:naked\fP). +.TP +.B \fBYYSETSTATE\fP +An API primitive with one argument \fBstate\fP\&. +The meaning of \fBYYSETSTATE\fP is to set the current lexer state to +\fBstate\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETSTATE@state\fP). +.TP +.B \fBYYDEBUG\fP +A debug API primitive with two arguments. It can be used to debug the +generated code (with \fB\-d\fP \fB\-\-debug\-output\fP option). 
\fBYYDEBUG\fP should +return no value and accept two arguments: \fBstate\fP (either a DFA state +index or \fB\-1\fP) and \fBsymbol\fP (the current input symbol). +.TP +.B \fByych\fP +An l\-value of type \fBYYCTYPE\fP that stores the current input character. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByyaccept\fP +An l\-value of unsigned integral type that stores the number of the latest +matched rule. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByynmatch\fP +An l\-value of unsigned integral type that stores the number of POSIX +capturing groups in the matched rule. +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.TP +.B \fByypmatch\fP +An array of l\-values that are used to hold the tag values corresponding +to the capturing parentheses in the matching rule. Array length must be +at least \fByynmatch * 2\fP (usually \fBYYMAXNMATCH * 2\fP is a good choice). +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.UNINDENT +.SH OPTIONS +.sp +Some of the options have corresponding \fI\%configurations\fP, +others are global and cannot be changed after re2c starts reading the input file. +Debug options generally require building re2c in debug configuration. +Internal options are useful for experimenting with the algorithms used in re2c. +.INDENT 0.0 +.TP +.B \fB\-? \-\-help \-h\fP +Show help message. +.TP +.B \fB\-\-api \-\-input \fP +Specify the API used by the generated code to interface with user\-defined +code: \fBdefault\fP is the API based on pointer arithmetic (the default for +C), and \fBcustom\fP is the generic API (the default for Go and Rust). +.TP +.B \fB\-\-bit\-vectors \-b\fP +Optimize conditional jumps using bit masks. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-case\-insensitive\fP +Treat single\-quoted and double\-quoted strings as case\-insensitive.
+.TP +.B \fB\-\-case\-inverted\fP +Invert the meaning of single\-quoted and double\-quoted strings: +treat single\-quoted strings as case\-sensitive and double\-quoted strings +as case\-insensitive. +.TP +.B \fB\-\-case\-ranges\fP +Collapse consecutive cases in a switch statement into a range of the form +\fBlow ... high\fP\&. This syntax is a C/C++ language extension that is +supported by compilers like GCC, Clang and Tcc. The main advantage over +using single cases is smaller generated code and faster generation time, +although for some compilers like Tcc it also results in smaller binary size. +This option is supported only for C. +.TP +.B \fB\-\-computed\-gotos \-g\fP +Optimize conditional jumps using non\-standard \(dqcomputed goto\(dq extension +(which must be supported by the compiler). re2c generates jump tables +only in complex cases with a lot of conditional branches. Complexity +threshold can be configured with \fBcgoto:threshold\fP configuration. This +option implies \fB\-\-bit\-vectors\fP\&. It is supported only for C. +.TP +.B \fB\-\-conditions \-\-start\-conditions \-c\fP +Enable support of Flex\-like \(dqconditions\(dq: multiple interrelated lexers +within one block. This is an alternative to manually specifying different +re2c blocks connected with \fBgoto\fP or function calls. +.TP +.B \fB\-\-depfile FILE\fP +Write dependency information to \fBFILE\fP in the form of a Makefile rule +\fB : [include\-file ...]\fP\&. This allows one to +track build dependencies in the presence of \fBinclude:re2c\fP directives, +so that updating include files triggers regeneration of the output file. +This option depends on the \fB\-\-output\fP option. +.TP +.B \fB\-\-ebcdic \-\-ecb \-e\fP +Generate a lexer that reads input in EBCDIC encoding. re2c assumes that the +character range is 0 \-\- 0xFF and character size is 1 byte. +.TP +.B \fB\-\-empty\-class \fP +Define the way re2c treats empty character classes.
With \fBmatch\-empty\fP +(the default) empty class matches empty input (which is illogical, but +backwards\-compatible). With \fBmatch\-none\fP empty class always fails to match. +With \fBerror\fP empty class raises a compilation error. +.TP +.B \fB\-\-encoding\-policy \fP +Define the way re2c treats Unicode surrogates. +With \fBfail\fP re2c aborts with an error when a surrogate is encountered. +With \fBsubstitute\fP re2c silently replaces surrogates with the error code +point 0xFFFD. With \fBignore\fP (the default) re2c treats surrogates as +normal code points. The Unicode standard says that standalone surrogates +are invalid, but real\-world libraries and programs behave in different ways. +.TP +.B \fB\-\-flex\-syntax \-F\fP +Partial support for Flex syntax: in this mode named definitions don\(aqt need +the equal sign and the terminating semicolon, and when used they must be +surrounded with curly braces. Names without curly braces are treated as +double\-quoted strings. +.TP +.B \fB\-\-header \-\-type\-header \-t HEADER\fP +Generate a \fBHEADER\fP file. The contents of the file can be specified with +directives \fBheader:re2c:on\fP and \fBheader:re2c:off\fP\&. +If conditions are used the header will have a condition enum automatically +appended to it (unless there is an explicit \fBconditions:re2c\fP directive). +.TP +.B \fB\-I PATH\fP +Add \fBPATH\fP to the list of locations which are used when searching for +include files. This option is useful in combination with \fBinclude:re2c\fP +directive. re2c looks for \fBFILE\fP in the directory of the parent file and +in the include locations specified with \fB\-I\fP option. +.TP +.B \fB\-\-input\-encoding \fP +Specify the way re2c parses regular expressions. +With \fBascii\fP (the default) re2c handles input as ASCII\-encoded: any +sequence of code units is a sequence of standalone 1\-byte characters. +With \fButf8\fP re2c handles input as UTF8\-encoded and recognizes multibyte +characters. 
+.TP +.B \fB\-\-invert\-captures\fP +Invert the meaning of capturing and non\-capturing groups. By default +\fB(...)\fP is capturing and \fB(! ...)\fP is non\-capturing. With this option +\fB(! ...)\fP is capturing and \fB(...)\fP is non\-capturing. +.TP +.B \fB\-\-lang \fP +Specify the output language. Supported languages are C, Go and Rust. +The default is C for re2c, Go for re2go and Rust for re2rust. +.TP +.B \fB\-\-leftmost\-captures\fP +Enable submatch extraction with leftmost greedy capturing groups. +.TP +.B \fB\-\-location\-format \fP +Specify location format in messages. +With \fBgnu\fP locations are printed as \(aqfilename:line:column: ...\(aq. +With \fBmsvc\fP locations are printed as \(aqfilename(line,column) ...\(aq. +The default is \fBgnu\fP\&. +.TP +.B \fB\-\-loop\-switch\fP +Encode DFA in a form of a loop over a switch statement. Individual states +are switch cases. The current state is stored in a variable \fByystate\fP\&. +Transitions between states update \fByystate\fP to the case label of the +destination state and \fBcontinue\fP to the head of the loop. This option is +always enabled for Rust, as it has no \fBgoto\fP statement and cannot use the +goto/label approach which is the default for C and Go backends. +.TP +.B \fB\-\-nested\-ifs \-s\fP +Use nested \fBif\fP statements instead of \fBswitch\fP statements in conditional +jumps. This usually results in more efficient code with non\-optimizing +compilers. +.TP +.B \fB\-\-no\-debug\-info \-i\fP +Do not output line directives. This may be useful when the generated code is +stored in a version control system (to avoid huge autogenerated diffs on +small changes). This option is on by default for Rust, as it does not have +line directives. +.TP +.B \fB\-\-no\-generation\-date\fP +Suppress date output in the generated file. +.TP +.B \fB\-\-no\-version\fP +Suppress version output in the generated file. 
+.TP +.B \fB\-\-no\-unsafe\fP +Do not generate \fBunsafe\fP wrapper over \fBYYPEEK\fP (this option is specific +to Rust). For performance reasons \fBYYPEEK\fP should avoid bounds\-checking, +as the lexer already performs end\-of\-input checks in a more efficient way. +The user may choose to provide a safe \fBYYPEEK\fP definition, or a definition +that is unsafe only in release builds, in which case the \fB\-\-no\-unsafe\fP +option helps to avoid warnings about redundant \fBunsafe\fP blocks. +.TP +.B \fB\-\-output \-o OUTPUT\fP +Specify the \fBOUTPUT\fP file. +.TP +.B \fB\-\-posix\-captures \-P\fP +Enable submatch extraction with POSIX\-style capturing groups. +.TP +.B \fB\-\-reusable \-r\fP +Deprecated since version 2.2 (reusable blocks are allowed by default now). +.TP +.B \fB\-\-skeleton \-S\fP +Ignore user\-defined interface code and generate a self\-contained \(dqskeleton\(dq +program. Additionally, generate input files with strings derived from the +regular grammar and compressed match results that are used to verify +\(dqskeleton\(dq behavior on all inputs. This option is useful for finding bugs +in optimizations and code generation. This option is supported only for C. +.TP +.B \fB\-\-storable\-state \-f\fP +Generate a lexer which can store its inner state. +This is useful in push\-model lexers which are stopped by an outer program +when there is not enough input, and then resumed when more input becomes +available. In this mode users should additionally define \fBYYGETSTATE\fP +and \fBYYSETSTATE\fP primitives, and variables \fByych\fP, \fByyaccept\fP and +\fBstate\fP should be part of the stored lexer state. +.TP +.B \fB\-\-tags \-T\fP +Enable submatch extraction with tags. +.TP +.B \fB\-\-ucs2 \-\-wide\-chars \-w\fP +Generate a lexer that reads UCS2\-encoded input. re2c assumes that the +character range is 0 \-\- 0xFFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. 
+.TP +.B \fB\-\-utf8 \-\-utf\-8 \-8\fP +Generate a lexer that reads input in UTF\-8 encoding. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 1 byte. +.TP +.B \fB\-\-utf16 \-\-utf\-16 \-x\fP +Generate a lexer that reads UTF16\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf32 \-\-unicode \-u\fP +Generate a lexer that reads UTF32\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 4 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-verbose\fP +Output a short message in case of success. +.TP +.B \fB\-\-vernum \-V\fP +Show version information in \fBMMmmpp\fP format (major, minor, patch). +.TP +.B \fB\-\-version \-v\fP +Show version information. +.TP +.B \fB\-\-single\-pass \-1\fP +Deprecated. Does nothing (single pass is the default now). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-debug\-output \-d\fP +Emit \fBYYDEBUG\fP invocations in the generated code. This is useful to trace +lexer execution. +.TP +.B \fB\-\-dump\-adfa\fP +Debug option: output DFA after tunneling (in .dot format). +.TP +.B \fB\-\-dump\-cfg\fP +Debug option: output control flow graph of tag variables (in .dot format). +.TP +.B \fB\-\-dump\-closure\-stats\fP +Debug option: output statistics on the number of states in closure. +.TP +.B \fB\-\-dump\-dfa\-det\fP +Debug option: output DFA immediately after determinization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-min\fP +Debug option: output DFA after minimization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tagopt\fP +Debug option: output DFA after tag optimizations (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tree\fP +Debug option: output DFA under construction with states represented as tag +history trees (in .dot format). 
+.TP +.B \fB\-\-dump\-dfa\-raw\fP +Debug option: output DFA under construction with expanded state\-sets +(in .dot format). +.TP +.B \fB\-\-dump\-interf\fP +Debug option: output interference table produced by liveness analysis of tag +variables. +.TP +.B \fB\-\-dump\-nfa\fP +Debug option: output NFA (in .dot format). +.TP +.B \fB\-\-emit\-dot \-D\fP +Instead of normal output generate lexer graph in .dot format. +The output can be converted to an image with the help of Graphviz +(e.g. something like \fBdot \-Tpng \-odfa.png dfa.dot\fP). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-dfa\-minimization \fP +Internal option: DFA minimization algorithm used by re2c. The \fBmoore\fP +option is the Moore algorithm (it is the default). The \fBtable\fP option is +the \(dqtable filling\(dq algorithm. Both algorithms should produce the same DFA +up to states relabeling; table filling is simpler and much slower and serves +as a reference implementation. +.TP +.B \fB\-\-eager\-skip\fP +Internal option: make the generated lexer advance the input position +eagerly \-\- immediately after reading the input symbol. This changes the +default behavior when the input position is advanced lazily \-\- after +transition to the next state. +.TP +.B \fB\-\-no\-lookahead\fP +Internal option, deprecated. +It used to enable TDFA(0) algorithm. Unlike TDFA(1), TDFA(0) algorithm does +not use one\-symbol lookahead. It applies register operations to the incoming +transitions rather than the outgoing ones. Benchmarks showed that TDFA(0) +algorithm is less efficient than TDFA(1). +.TP +.B \fB\-\-no\-optimize\-tags\fP +Internal option: suppress optimization of tag variables (useful for +debugging). +.TP +.B \fB\-\-posix\-closure \fP +Internal option: specify shortest\-path algorithm used for the construction of +epsilon\-closure with POSIX disambiguation semantics: \fBgor1\fP (the default) +stands for Goldberg\-Radzik algorithm, and \fBgtop\fP stands for \(dqglobal +topological order\(dq algorithm. 
+.TP +.B \fB\-\-posix\-prectable \fP +Internal option: specify the algorithm used to compute POSIX precedence +table. The \fBcomplex\fP algorithm computes precedence table in one traversal +of tag history tree and has quadratic complexity in the number of TNFA +states; it is the default. The \fBnaive\fP algorithm has worst\-case cubic +complexity in the number of TNFA states, but it is much simpler than +\fBcomplex\fP and may be slightly faster in non\-pathological cases. +.TP +.B \fB\-\-stadfa\fP +Internal option, deprecated. +It used to enable staDFA algorithm, which differs from TDFA in that register +operations are placed in states rather than on transitions. Benchmarks +showed that staDFA algorithm is less efficient than TDFA. +.TP +.B \fB\-\-fixed\-tags \fP +Internal option: +specify whether the fixed\-tag optimization should be applied to all tags +(\fBall\fP), none of them (\fBnone\fP), or only those in toplevel concatenation +(\fBtoplevel\fP). The default is \fBall\fP\&. +\(dqFixed\(dq tags are those that are located within a fixed distance to some +other tag (called \(dqbase\(dq). In such cases only the base tag needs to be +tracked, and the value of the fixed tag can be computed as the value of the +base tag plus a static offset. For tags that are under alternative or +repetition it is also necessary to check if the base tag has a no\-match +value (in that case fixed tag should also be set to no\-match, disregarding +the offset). For tags in top\-level concatenation the check is not needed, +because they always match. +.UNINDENT +.SH WARNINGS +.sp +Warnings can be individually enabled, disabled and turned into an error. +.INDENT 0.0 +.TP +.B \fB\-W\fP +Turn on all warnings. +.TP +.B \fB\-Werror\fP +Turn warnings into errors. Note that this option alone +doesn\(aqt turn on any warnings; it only affects those warnings that have +been turned on so far or will be turned on later. +.TP +.B \fB\-W\fP +Turn on \fBwarning\fP\&.
+.TP +.B \fB\-Wno\-\fP +Turn off \fBwarning\fP\&. +.TP +.B \fB\-Werror\-\fP +Turn on \fBwarning\fP and treat it as an error (this implies \fB\-W\fP). +.TP +.B \fB\-Wno\-error\-\fP +Don\(aqt treat this particular \fBwarning\fP as an error. This doesn\(aqt turn off +the warning itself. +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-Wcondition\-order\fP +Warn if the generated program makes implicit assumptions about condition +numbering. One should use either the \fB\-\-header\fP option or the +\fBconditions:re2c\fP directive to generate a mapping of condition names to +numbers and then use the autogenerated condition names. +.TP +.B \fB\-Wempty\-character\-class\fP +Warn if a regular expression contains an empty character class. Trying to +match an empty character class makes no sense: it should always fail. +However, for backwards compatibility reasons re2c permits empty character +classes and treats them as empty strings. Use the \fB\-\-empty\-class\fP option +to change the default behavior. +.TP +.B \fB\-Wmatch\-empty\-string\fP +Warn if a rule is nullable (matches an empty string). +If the lexer runs in a loop and the empty match is unintentional, the lexer +may unexpectedly hang in an infinite loop. +.TP +.B \fB\-Wswapped\-range\fP +Warn if the lower bound of a range is greater than its upper bound. The +default behavior is to silently swap the range bounds. +.TP +.B \fB\-Wundefined\-control\-flow\fP +Warn if some input strings cause undefined control flow in the lexer (the +faulty patterns are reported). This is a dangerous and common mistake. It +can be easily fixed by adding the default rule \fB*\fP which has the lowest +priority, matches any code unit, and always consumes a single code unit. +.TP +.B \fB\-Wunreachable\-rules\fP +Warn about rules that are shadowed by other rules and will never match. +.TP +.B \fB\-Wuseless\-escape\fP +Warn if a symbol is escaped when it shouldn\(aqt be.
+By default, re2c silently ignores such escapes, but this may as well +indicate a typo or an error in the escape sequence. +.TP +.B \fB\-Wnondeterministic\-tags\fP +Warn if a tag has \fBn\fP\-th degree of nondeterminism, where \fBn\fP is greater +than 1. +.TP +.B \fB\-Wsentinel\-in\-midrule\fP +Warn if the sentinel symbol occurs in the middle of a rule \-\-\- this may +cause reads past the end of buffer, crashes or memory corruption in the +generated lexer. This warning is only applicable if the sentinel method of +checking for the end of input is used. +It is set to an error if \fBre2c:sentinel\fP configuration is used. +.UNINDENT +.SH BLOCKS AND DIRECTIVES +.sp +Below is the list of re2c directives (syntactic constructs that mark the +beginning and end of the code that should be processed by re2c). Named blocks +were added in re2c version 2.2. They are exactly the same as unnamed blocks, +except that the name can be used to reference a block in other parts of the +program. More information on each directive can be found in the related +sections. +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A global re2c block with an optional name. The block may contain named +definitions, configurations and rules in any order. Named definitions and +configurations are defined in the global scope, so they are inherited by +subsequent blocks. The code for a global block is generated at the point +where the block is specified. +.TP +.B \fB/*!local:re2c[:] ... */\fP +A local re2c block with an optional name. Unlike global blocks, definitions +and configurations inside of a local block are not added into the global +scope. In all other respects local blocks are the same as global blocks. +.TP +.B \fB/*!rules:re2c[:] ... */\fP +A reusable block with an optional name. 
Rules blocks have the same structure +as local or global blocks, but they do not produce any code and they can be +reused multiple times in other blocks with the help of a \fB!use:;\fP +directive or a \fB/*!use:re2c[:] ... */\fP block. A rules block on its +own does not add any definitions into the global scope. The code for it is +generated at the point of use. Prior to re2c version 2.2 rules blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB/*!use:re2c[:] ... */\fP +A use block that references a previously defined rules block. If the name is +specified, re2c looks for a rules block with this name. Otherwise the most +recent rules block is used (either a named or an unnamed one). A use block +can add definitions, configurations and rules of its own, which are added to +those of the referenced rules block. Prior to re2c version 2.2 use blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB!use:;\fP +An in\-block use directive that merges a previously defined rules block with +the specified name into the current block. Named definitions, configurations +and rules of the referenced block are added to the current ones. Conflicts +between overlapping rules and configurations are resolved in the usual way: +the first rule takes priority, and the latest configuration overrides the +preceding ones. One exception is the special rules \fB*\fP, \fB$\fP and \fB\fP +for which a block\-local definition always takes priority. A use directive +can be placed anywhere inside of a block, and multiple use directives are +allowed. +.TP +.B \fB/*!max:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXFILL\fP definition. +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXFILL\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXFILL \fP), or a global variable for Go +(\fBvar YYMAXFILL int = \fP).
It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXFILL\fP\&. +.TP +.B \fB/*!maxnmatch:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXNMATCH\fP definition (it requires +\fB\-P \-\-posix\-captures\fP option). +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXNMATCH\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXNMATCH \fP), or a global variable for Go +(\fBvar YYMAXNMATCH int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXNMATCH\fP\&. +.TP +.B \fB/*!stags:re2c[:[:...]] ... */\fP, \fB/*!mtags:re2c[:[:...]] ... */\fP +Directives that specify a template piece of code that is expanded for each +s\-tag/m\-tag variable generated by re2c. +An optional list of block names specifies which blocks should be included +when computing the set of tag variables (if the list is empty, all blocks +are included). +There are two optional configurations: \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{tag}\fP (or +\fB@@\fP for short) is replaced with the name of each tag variable. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different tag variables. +.TP +.B \fB/*!getstate:re2c[:[:...]] ... */\fP +A directive that generates conditional dispatch on the lexer state (it +requires \fB\-\-storable\-state\fP option). +An optional list of block names specifies which blocks should be included in +the state dispatch. The default transition goes to the start label of the +first block on the list. 
If the list is empty, all blocks are included, and +the default transition goes to the first block in the file that has a start +label. +This directive is incompatible with the \fB\-\-loop\-switch\fP option and Rust, +as it requires cross\-block transitions that are unsupported without the +\fBgoto\fP statement. +.TP +.B \fB/*!conditions:re2c[:[:...]] ... */\fP, \fB/*!types:re2c... */\fP +A directive that generates condition enumeration (it requires +\fB\-\-conditions\fP option). +An optional list of block names specifies which blocks should be included +when computing the set of conditions (if the list is empty, all blocks are +included). +By default the generated code is an enumeration \fBYYCONDTYPE\fP\&. It can be +customized with optional configurations \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{cond}\fP (or +\fB@@\fP for short) is replaced with the name of each condition, and +\fB@@{num}\fP is replaced with a numeric index of that condition. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different conditions. +.TP +.B \fB/*!include:re2c */\fP +This directive allows one to include \fB\fP, which must be a double\-quoted +file path. The contents of the file are literally substituted in place of +the directive, in the same way as \fB#include\fP works in C/C++. This +directive can be used together with the \fB\-\-depfile\fP option to generate +build system dependencies on the included files. +.TP +.B \fB!include ;\fP +This directive is the same as \fB/*!include:re2c */\fP, except that it +should be used inside of a re2c block. +.TP +.B \fB/*!header:re2c:on*/\fP +This directive marks the start of header file. Everything after it and up to +the following \fB/*!header:re2c:off*/\fP directive is processed by re2c and +written to the header file specified with \fB\-t \-\-type\-header\fP option. 
+.TP +.B \fB/*!header:re2c:off*/\fP +This directive marks the end of header file started with +\fB/*!header:re2c:on*/\fP\&. +.TP +.B \fB/*!ignore:re2c ... */\fP +A block whose contents are ignored and removed from the output file. +.TP +.B \fB%{ ... %}\fP +A global re2c block in the \fB\-\-flex\-syntax\fP mode. This is deprecated and +exists for backward compatibility. +.UNINDENT +.SH CONFIGURATIONS +.INDENT 0.0 +.TP +.B \fBre2c:api\fP, \fBre2c:flags:input\fP +Same as the \fB\-\-api\fP option. +.TP +.B \fBre2c:api:sigil\fP +Specify the marker (\(dqsigil\(dq) that is used for argument placeholders in the +API primitives. The default is \fB@@\fP\&. A placeholder starts with sigil +followed by the argument name in curly braces. For example, if sigil is set +to \fB$\fP, then placeholders will have the form \fB${name}\fP\&. Single\-argument +APIs may use shorthand notation without the name in braces. This option can +be overridden by options for individual API primitives, e.g. +\fBre2c:define:YYFILL@len\fP for \fBYYFILL\fP\&. +.TP +.B \fBre2c:api:style\fP +Specify API style. Possible values are \fBfunctions\fP (the default for C) and +\fBfree\-form\fP (the default for Go and Rust). +In \fBfunctions\fP style API primitives are generated with an argument list in +parentheses following the name of the primitive. The arguments are provided +only for autogenerated parameters (such as the number of characters passed +to \fBYYFILL\fP), but not for the general lexer context, so the primitives +behave more like macros in C/C++ or closures in Go and Rust. +In free\-form style API primitives do not have a fixed form: they should be +defined as strings containing free\-form pieces of code with interpolated +variables of the form \fB@@{var}\fP or \fB@@\fP (they correspond to arguments in +function\-like style). +This configuration may be overridden for individual API primitives, see for +example \fBre2c:define:YYFILL:naked\fP configuration for \fBYYFILL\fP\&.
+.TP +.B \fBre2c:bit\-vectors\fP, \fBre2c:flags:bit\-vectors\fP, \fBre2c:flags:b\fP +Same as the \fB\-\-bit\-vectors\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-insensitive\fP, \fBre2c:flags:case\-insensitive\fP +Same as the \fB\-\-case\-insensitive\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:case\-inverted\fP, \fBre2c:flags:case\-inverted\fP +Same as the \fB\-\-case\-inverted\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-ranges\fP, \fBre2c:flags:case\-ranges\fP +Same as the \fB\-\-case\-ranges\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos\fP, \fBre2c:flags:computed\-gotos\fP, \fBre2c:flags:g\fP +Same as the \fB\-\-computed\-gotos\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos:threshold\fP, \fBre2c:cgoto:threshold\fP +If computed \fBgoto\fP is used, this configuration specifies the complexity +threshold that triggers the generation of jump tables instead of nested +\fBif\fP statements and bitmaps. The default value is \fB9\fP\&. +.TP +.B \fBre2c:cond:goto\fP +Specifies a piece of code used for the autogenerated shortcut rules \fB:=>\fP +in conditions. The default is \fBgoto @@;\fP\&. +The \fB@@\fP placeholder is substituted with condition name (see +configurations \fBre2c:api:sigil\fP and \fBre2c:cond:goto@cond\fP). +.TP +.B \fBre2c:cond:goto@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:goto\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:divider\fP +Defines the divider for condition blocks. +The default value is \fB/* *********************************** */\fP\&. +Placeholders are substituted with condition name (see \fBre2c:api:sigil\fP and +\fBre2c:cond:divider@cond\fP).
+.TP +.B \fBre2c:cond:divider@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:divider\fP +definition. The default is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:prefix\fP, \fBre2c:condprefix\fP +Specifies the prefix used for condition labels. +The default is \fByyc_\fP\&. +.TP +.B \fBre2c:cond:enumprefix\fP, \fBre2c:condenumprefix\fP +Specifies the prefix used for condition identifiers. +The default is \fByyc\fP\&. +.TP +.B \fBre2c:debug\-output\fP, \fBre2c:flags:debug\-output\fP, \fBre2c:flags:d\fP +Same as the \fB\-\-debug\-output\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:define:YYBACKUP\fP +Defines generic API primitive \fBYYBACKUP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYBACKUPCTX\fP +Defines generic API primitive \fBYYBACKUPCTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYCONDTYPE\fP +Defines \fBYYCONDTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTYPE\fP +Defines \fBYYCTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTXMARKER\fP +Defines API primitive \fBYYCTXMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCURSOR\fP +Defines API primitive \fBYYCURSOR\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYDEBUG\fP +Defines API primitive \fBYYDEBUG\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL\fP +Defines API primitive \fBYYFILL\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL@len\fP +Specifies the sigil used for argument substitution in \fBYYFILL\fP +definition. Defaults to \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYFILL:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for \fBYYFILL\fP\&. +Zero value corresponds to free\-form API style. 
+.TP +.B \fBre2c:define:YYGETCONDITION\fP +Defines API primitive \fBYYGETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETSTATE\fP +Defines API primitive \fBYYGETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYLESSTHAN\fP +Defines generic API primitive \fBYYLESSTHAN\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYLIMIT\fP +Defines API primitive \fBYYLIMIT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMARKER\fP +Defines API primitive \fBYYMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGN\fP +Defines generic API primitive \fBYYMTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGP\fP +Defines generic API primitive \fBYYMTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYPEEK\fP +Defines generic API primitive \fBYYPEEK\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYRESTORE\fP +Defines generic API primitive \fBYYRESTORE\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORECTX\fP +Defines generic API primitive \fBYYRESTORECTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORETAG\fP +Defines generic API primitive \fBYYRESTORETAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSETCONDITION\fP +Defines API primitive \fBYYSETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETCONDITION@cond\fP +Specifies the sigil used for argument substitution in \fBYYSETCONDITION\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. 
+.TP +.B \fBre2c:define:YYSETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSETSTATE\fP +Defines API primitive \fBYYSETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETSTATE@state\fP +Specifies the sigil used for argument substitution in \fBYYSETSTATE\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYSETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSKIP\fP +Defines generic API primitive \fBYYSKIP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFT\fP +Defines generic API primitive \fBYYSHIFT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFTMTAG\fP +Defines generic API primitive \fBYYSHIFTMTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSHIFTSTAG\fP +Defines generic API primitive \fBYYSHIFTSTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSTAGN\fP +Defines generic API primitive \fBYYSTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSTAGP\fP +Defines generic API primitive \fBYYSTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:empty\-class\fP, \fBre2c:flags:empty\-class\fP +Same as the \fB\-\-empty\-class\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:encoding:ebcdic\fP, \fBre2c:flags:ecb\fP, \fBre2c:flags:e\fP +Same as the \fB\-\-ebcdic\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:ucs2\fP, \fBre2c:flags:wide\-chars\fP, \fBre2c:flags:w\fP +Same as the \fB\-\-ucs2\fP option, but can be configured on per\-block basis. 
+.TP +.B \fBre2c:encoding:utf8\fP, \fBre2c:flags:utf\-8\fP, \fBre2c:flags:8\fP +Same as the \fB\-\-utf8\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf16\fP, \fBre2c:flags:utf\-16\fP, \fBre2c:flags:x\fP +Same as the \fB\-\-utf16\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf32\fP, \fBre2c:flags:unicode\fP, \fBre2c:flags:u\fP +Same as the \fB\-\-utf32\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding\-policy\fP, \fBre2c:flags:encoding\-policy\fP +Same as the \fB\-\-encoding\-policy\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:eof\fP +Specifies the sentinel symbol used with the end\-of\-input rule \fB$\fP\&. The +default value is \fB\-1\fP (\fB$\fP rule is not used). Other possible values +include all valid code units. Only decimal numbers are recognized. +.TP +.B \fBre2c:header\fP, \fBre2c:flags:type\-header\fP, \fBre2c:flags:t\fP +Specifies the name of the generated header file relative to the directory of +the output file. Same as the \fB\-\-header\fP option except that the file path +is relative. +.TP +.B \fBre2c:indent:string\fP +Specifies the string used for indentation. The default is a single tab +character \fB\(dq\et\(dq\fP\&. Indent string should contain whitespace characters only. +To disable indentation entirely, set this configuration to an empty string. +.TP +.B \fBre2c:indent:top\fP +Specifies the minimum amount of indentation to use. The default value is +zero. The value should be a non\-negative integer number. +.TP +.B \fBre2c:invert\-captures\fP +Same as the \fB\-\-invert\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:label:prefix\fP, \fBre2c:labelprefix\fP +Specifies the prefix used for DFA state labels. The default is \fByy\fP\&. +.TP +.B \fBre2c:label:start\fP, \fBre2c:startlabel\fP +Controls the generation of a block start label. 
The default value is zero, +which means that the start label is generated only if it is used. An integer +value greater than zero forces the generation of start label even if it is +unused by the lexer. A string value also forces start label generation and +sets the label name to the specified string. This configuration applies only +to the current block (it is reset to default for the next block). +.TP +.B \fBre2c:label:yyFillLabel\fP +Specifies the prefix of \fBYYFILL\fP labels used with \fBre2c:eof\fP and in +storable state mode. +.TP +.B \fBre2c:label:yyloop\fP +Specifies the name of the label marking the start of the lexer loop with +\fB\-\-loop\-switch\fP option. The default is \fByyloop\fP\&. +.TP +.B \fBre2c:label:yyNext\fP +Specifies the name of the optional label that follows \fBYYGETSTATE\fP switch +in storable state mode (enabled with \fBre2c:state:nextlabel\fP). The default +is \fByyNext\fP\&. +.TP +.B \fBre2c:leftmost\-captures\fP +Same as the \fB\-\-leftmost\-captures\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:lookahead\fP, \fBre2c:flags:lookahead\fP +Deprecated (see the deprecated \fB\-\-no\-lookahead\fP option). +.TP +.B \fBre2c:nested\-ifs\fP, \fBre2c:flags:nested\-ifs\fP, \fBre2c:flags:s\fP +Same as the \fB\-\-nested\-ifs\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:posix\-captures\fP, \fBre2c:flags:posix\-captures\fP, \fBre2c:flags:P\fP +Same as the \fB\-\-posix\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:tags\fP, \fBre2c:flags:tags\fP, \fBre2c:flags:T\fP +Same as the \fB\-\-tags\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:tags:expression\fP +Specifies the expression used for tag variables. +By default re2c generates expressions of the form \fByyt<N>\fP\&. This might +be inconvenient, for example if tag variables are defined as fields in a +struct.
All occurrences of \fB@@{tag}\fP or \fB@@\fP are replaced with the +actual tag name. For example, \fBre2c:tags:expression = \(dqs.@@\(dq;\fP results +in expressions of the form \fBs.yyt<N>\fP in the generated code. +See also \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:tags:prefix\fP +Specifies the prefix for tag variable names. The default is \fByyt\fP\&. +.TP +.B \fBre2c:sentinel\fP +Specifies the sentinel symbol used for the end\-of\-input checks (when bounds +checks are disabled with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP is not +set). This configuration does not affect code generation: its purpose is to +verify that the sentinel is not allowed in the middle of a rule, and ensure +that the lexer won\(aqt read past the end of buffer. The default value is +\fB\-1\fP (in that case re2c assumes that the sentinel is zero, which is the +most common case). Only decimal numbers are recognized. +.TP +.B \fBre2c:state:abort\fP +If set to a positive integer value, changes the default case in +\fBYYGETSTATE\fP switch: by default it aborts the program, and an explicit +\fB\-1\fP case contains transition to the start of the block. +.TP +.B \fBre2c:state:nextlabel\fP +Controls if the \fBYYGETSTATE\fP switch is followed by an \fByyNext\fP label +(the default value is zero, which corresponds to no label). +Alternatively one can use \fBre2c:label:start\fP to generate a specific start +label, or an explicit \fBgetstate:re2c\fP directive to generate the +\fBYYGETSTATE\fP switch separately from the lexer block. +.TP +.B \fBre2c:unsafe\fP, \fBre2c:flags:unsafe\fP +Same as the \fB\-\-no\-unsafe\fP option, but can be configured on per\-block +basis. +If set to zero, it suppresses the generation of \fBunsafe\fP wrappers around +\fBYYPEEK\fP\&. The default is non\-zero (wrappers are generated). +This configuration is specific to Rust. +.TP +.B \fBre2c:variable:yyaccept\fP +Specifies the name of the \fByyaccept\fP variable (see the API primitives +section).
+.TP +.B \fBre2c:variable:yybm\fP +Specifies the name of the \fByybm\fP variable (used for bitmaps). +.TP +.B \fBre2c:variable:yybm:hex\fP, \fBre2c:yybm:hex\fP +If set to nonzero, bitmaps for the \fB\-\-bit\-vectors\fP option are generated +in hexadecimal format. The default is zero (bitmaps are in decimal format). +.TP +.B \fBre2c:variable:yych\fP +Specifies the name of the \fByych\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yych:emit\fP, \fBre2c:yych:emit\fP +If set to zero, \fByych\fP definition is not generated. +The default is non\-zero. +.TP +.B \fBre2c:variable:yych:conversion\fP, \fBre2c:yych:conversion\fP +If set to non\-zero, re2c automatically generates a conversion to \fBYYCTYPE\fP +every time \fByych\fP is read. The default is zero (no conversion). +.TP +.B \fBre2c:variable:yyctable\fP +Specifies the name of the \fByyctable\fP variable (the jump table generated +for \fBYYGETCONDITION\fP switch with \fB\-\-computed\-gotos\fP option). +.TP +.B \fBre2c:variable:yytarget\fP +Specifies the name of the \fByytarget\fP variable. +.TP +.B \fBre2c:variable:yystable\fP +Deprecated. +.TP +.B \fBre2c:variable:yystate\fP +Specifies the name of the \fByystate\fP variable (used with the +\fB\-\-loop\-switch\fP option to store the current DFA state). +.TP +.B \fBre2c:yyfill:check\fP +If set to zero, suppresses the generation of pre\-\fBYYFILL\fP check for the +number of input characters (the \fBYYLESSTHAN\fP definition in generic API and +the \fBYYLIMIT\fP\-based comparison in C pointer API). The default is non\-zero +(generate the check). +.TP +.B \fBre2c:yyfill:enable\fP +If set to zero, suppresses the generation of \fBYYFILL\fP (together +with the check). This should be used when the whole input fits into one piece +of memory (there is no need for buffering) and the end\-of\-input checks do not +rely on the \fBYYFILL\fP checks (e.g. if a sentinel character is used).
+Use warnings (\fB\-W\fP option) and \fBre2c:sentinel\fP configuration to verify +that the generated lexer cannot read past the end of input. +The default is non\-zero (\fBYYFILL\fP is enabled). +.TP +.B \fBre2c:yyfill:parameter\fP +If set to zero, suppresses the generation of parameter passed to \fBYYFILL\fP\&. +The parameter is the minimum number of characters that must be supplied. +Defaults to non\-zero (the parameter is generated). +This configuration can be overridden with \fBre2c:define:YYFILL:naked\fP or +\fBre2c:api:style\fP\&. +.UNINDENT +.SH REGULAR EXPRESSIONS +.sp +re2c uses the following syntax for regular expressions: +.INDENT 0.0 +.IP \(bu 2 +\fB\(dqfoo\(dq\fP case\-sensitive string literal +.IP \(bu 2 +\fB\(aqfoo\(aq\fP case\-insensitive string literal +.IP \(bu 2 +\fB[a\-xyz]\fP, \fB[^a\-xyz]\fP character class (possibly negated) +.IP \(bu 2 +\fB\&.\fP any character except newline +.IP \(bu 2 +\fBR \e S\fP difference of character classes \fBR\fP and \fBS\fP +.IP \(bu 2 +\fBR*\fP zero or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR+\fP one or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR?\fP optional \fBR\fP +.IP \(bu 2 +\fBR{n}\fP repetition of \fBR\fP exactly \fBn\fP times +.IP \(bu 2 +\fBR{n,}\fP repetition of \fBR\fP at least \fBn\fP times +.IP \(bu 2 +\fBR{n,m}\fP repetition of \fBR\fP from \fBn\fP to \fBm\fP times +.IP \(bu 2 +\fB(R)\fP just \fBR\fP; parentheses are used to override precedence. +If submatch extraction is enabled, \fB(R)\fP is a capturing or a +non\-capturing group depending on \fB\-\-invert\-captures\fP option. +.IP \(bu 2 +\fB(!R)\fP +If submatch extraction is enabled, \fB(!R)\fP is a non\-capturing or a +capturing group depending on \fB\-\-invert\-captures\fP option. 
+.IP \(bu 2 +\fBR S\fP concatenation: \fBR\fP followed by \fBS\fP +.IP \(bu 2 +\fBR | S\fP alternative: \fBR or S\fP +.IP \(bu 2 +\fBR / S\fP lookahead: \fBR\fP followed by \fBS\fP, but \fBS\fP is not consumed +.IP \(bu 2 +\fBname\fP the regular expression defined as \fBname\fP (or literal string +\fB\(dqname\(dq\fP in Flex compatibility mode) +.IP \(bu 2 +\fB{name}\fP the regular expression defined as \fBname\fP in Flex +compatibility mode +.IP \(bu 2 +\fB@stag\fP an \fIs\-tag\fP: saves the last input position at which \fB@stag\fP +matches in a variable named \fBstag\fP +.IP \(bu 2 +\fB#mtag\fP an \fIm\-tag\fP: saves all input positions at which \fB#mtag\fP matches +in a variable named \fBmtag\fP +.UNINDENT +.sp +Character classes and string literals may contain the following escape +sequences: \fB\ea\fP, \fB\eb\fP, \fB\ef\fP, \fB\en\fP, \fB\er\fP, \fB\et\fP, \fB\ev\fP, \fB\e\e\fP, +octal escapes \fB\eooo\fP and hexadecimal escapes \fB\exhh\fP, \fB\euhhhh\fP and +\fB\eUhhhhhhhh\fP\&. +.SH HANDLING THE END OF INPUT +.sp +One of the main problems for the lexer is to know when to stop. +There are a few terminating conditions: +.INDENT 0.0 +.IP \(bu 2 +the lexer may match some rule (including default rule \fB*\fP) and come to a +final state +.IP \(bu 2 +the lexer may fail to match any rule and come to a default state +.IP \(bu 2 +the lexer may reach the end of input +.UNINDENT +.sp +The first two conditions terminate the lexer in a \(dqnatural\(dq way: it comes to a +state with no outgoing transitions, and the matching automatically stops. The +third condition, end of input, is different: it may happen in any state, and the +lexer should be able to handle it. Checking for the end of input interrupts the +normal lexer workflow and adds conditional branches to the generated program, +therefore it is necessary to minimize the number of such checks. re2c supports a +few different methods for handling the end of input. 
Which one to use depends on +the complexity of regular expressions, the need for buffering, performance +considerations and other factors. Here is a list of methods: +.INDENT 0.0 +.IP \(bu 2 +\fBSentinel.\fP +This method eliminates the need for the end of input checks altogether. It is +simple and efficient, but limited to the case when there is a natural +\(dqsentinel\(dq character that can never occur in valid input. This character may +still occur in invalid input, but it should not be allowed by the regular +expressions, except perhaps as the last character of a rule. The sentinel is +appended at the end of input and serves as a stop signal: when the lexer reads +this character, it is either a syntax error or the end of input. In both +cases the lexer should stop. This method is used if \fBYYFILL\fP is disabled +with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP has the default value +\fB\-1\fP\&. +.nf + +.fi +.sp +.IP \(bu 2 +\fBSentinel with bounds checks.\fP +This method is generic: it allows to handle any input without restrictions on +the regular expressions. The idea is to reduce the number of end of input +checks by performing them only on certain characters. Similar to the +\(dqsentinel\(dq method, one of the characters is chosen as a \(dqsentinel\(dq and +appended at the end of input. However, there is no restriction on where the +sentinel may occur (in fact, any character can be chosen for a sentinel). +When the lexer reads this character, it additionally performs a bounds check. +If the current position is within bounds, the lexer resumes matching and +handles the sentinel as a regular character. Otherwise it invokes \fBYYFILL\fP +(unless it is disabled). If more input is supplied, the lexer will rematch the +last character and continue as if the sentinel wasn\(aqt there. Otherwise it must +be the real end of input, and the lexer stops. 
This method is used when +\fBre2c:eof\fP has non\-negative value (it should be set to the numeric value of +the sentinel). \fBYYFILL\fP is optional. +.nf + +.fi +.sp +.IP \(bu 2 +\fBBounds checks with padding.\fP +This method is generic, and it may be faster than the \(dqsentinel with bounds +checks\(dq method, but it is also more complex. The idea is to partition DFA +states into strongly connected components (SCCs) and generate a single check +per SCC for enough characters to cover the longest non\-looping path in this +SCC. This reduces the number of checks, but there is a problem with short +lexemes at the end of input, as the check requires enough characters to cover +the longest lexeme. This can be fixed by padding the input with a few fake +characters that do not form a valid lexeme suffix (so that the lexer cannot +match them). The length of padding should be \fBYYMAXFILL\fP, generated with +\fB/*!max:re2c*/\fP\&. If there is not enough input, the lexer invokes \fBYYFILL\fP +which should supply at least the required number of characters or not return. +This method is used if \fBYYFILL\fP is enabled and \fBre2c:eof\fP is \fB\-1\fP +(this is the default configuration). +.nf + +.fi +.sp +.IP \(bu 2 +\fBCustom checks.\fP +Generic API allows to override basic operations like reading a character, +which makes it possible to include the end\-of\-input checks as part of them. +This approach is error\-prone and should be used with caution. To use a custom +method, enable generic API with \fB\-\-api custom\fP or \fBre2c:api = custom;\fP and +disable default bounds checks with \fBre2c:yyfill:enable = 0;\fP or +\fBre2c:yyfill:check = 0;\fP\&. +.UNINDENT +.sp +The following subsections contain an example of each method. +.SS Sentinel +.sp +This example uses a sentinel character to handle the end of input. The program +counts space\-separated words in a null\-terminated string. 
The sentinel is null: +it is the last character of each input string, and it is not allowed in the +middle of a lexeme by any of the rules (in particular, it is not included in +character ranges where it is easy to overlook). If a null occurs in the middle +of a string, it is a syntax error and the lexer will match default rule \fB*\fP, +but it won\(aqt read past the end of input or crash (use +\fI\%\-Wsentinel\-in\-midrule\fP +warning and \fBre2c:sentinel\fP configuration to verify this). Configuration +\fBre2c:yyfill:enable = 0;\fP suppresses the generation of bounds checks and +\fBYYFILL\fP invocations. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +// expects a null\-terminated string +function lex(yyinput) { + let yycursor = 0; + let count = 0 + + loop: while (true) { + /*!re2c + re2c:yyfill:enable = 0; + + * { return \-1 } + [\ex00] { return count } + [ ]+ { continue loop } + [a\-z]+ { count += 1; continue loop } + */ + } +} + +function test(s, n) { if (lex(s) != n) throw \(dqerror!\(dq; } +test(\(dq\e0\(dq, 0) +test(\(dqone two three\e0\(dq, 3) +test(\(dqf0ur\e0\(dq, \-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Sentinel with bounds checks +.sp +This example uses sentinel with bounds checks to handle the end of input (this +method was added in version 1.2). The program counts space\-separated +single\-quoted strings. The sentinel character is null, which is specified with +\fBre2c:eof = 0;\fP configuration. As in the \fI\%sentinel\fP method, null is the last +character of each input string, but it is allowed in the middle of a rule (for +example, \fB\(aqaaa\e0aa\(aq\e0\fP is valid input, but \fB\(aqaaa\e0\fP is a syntax error). +Bounds checks are generated in each state that matches an input character, but +they are scoped to the branch that handles null. Bounds checks are of the form +\fBYYLIMIT <= YYCURSOR\fP or \fBYYLESSTHAN(1)\fP with generic API. 
If the check +condition is true, lexer has reached the end of input and should stop +(\fBYYFILL\fP is disabled with \fBre2c:yyfill:enable = 0;\fP as the input fits into +one buffer, see the \fI\%YYFILL with sentinel\fP section for an example that uses +\fBYYFILL\fP). Reaching the end of input opens three possibilities: if the lexer +is in the initial state it will match the end\-of\-input rule \fB$\fP, otherwise it +may fallback to a previously matched rule (including default rule \fB*\fP) or go +to a default state, causing +\fI\%\-Wundefined\-control\-flow\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +// expects a null\-terminated string +function lex(yyinput) { + let yycursor = 0; + let yylimit = yyinput.length \- 1 // terminating null not included + let count = 0 + + loop: while (true) { + /*!re2c + re2c:yyfill:enable = 0; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1 } + $ { return count } + [ ]+ { continue loop } + str { count += 1; continue loop } + */ + } +} + +function test(s, n) { if (lex(s) != n) throw \(dqerror!\(dq; } +test(\(dq\e0\(dq, 0) +test(\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \e0\(dq, 3) +test(\(dq\(aqunterminated\e\e\(aq\e0\(dq, \-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Bounds checks with padding +.sp +This example uses bounds checks with padding to handle the end of input (this +method is enabled by default). The program counts space\-separated single\-quoted +strings. There is a padding of \fBYYMAXFILL\fP null characters appended at the end +of input, where \fBYYMAXFILL\fP value is autogenerated with \fB/*!max:re2c*/\fP\&. It +is not necessary to use null for padding \-\-\- any characters can be used as long +as they do not form a valid lexeme suffix (in this example padding should not +contain single quotes, as they may be mistaken for a suffix of a single\-quoted +string). 
There is a \(dqstop\(dq rule that matches the first padding character (null) +and terminates the lexer (note that it checks if null is at the beginning of +padding, otherwise it is a syntax error). Bounds checks are generated only in +some states that are determined by the strongly connected components of the +underlying automaton. Checks have the form \fB(YYLIMIT \- YYCURSOR) < n\fP or +\fBYYLESSTHAN(n)\fP with generic API, where \fBn\fP is the minimum number of +characters that are needed for the lexer to proceed (it also means that the next +bounds check will occur in at most \fBn\fP characters). If the check condition is +true, the lexer has reached the end of input and will invoke \fBYYFILL(n)\fP that +should either supply at least \fBn\fP input characters or not return. In this +example \fBYYFILL\fP always fails and terminates the lexer with an error (which is +fine because the input fits into one buffer). See the \fI\%YYFILL with padding\fP +section for an example that refills the input buffer with \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +/*!max:re2c*/ + +function lex(yyinput) { + let yycursor = 0; + let yylimit = yyinput.length + let count = 0 + + loop: while (true) { + /*!re2c + re2c:define:YYFILL = \(dqreturn \-1\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // check that it is the sentinel, not some unexpected null + return (yycursor == yylimit \- YYMAXFILL + 1) ? 
count : \-1 + } + str { count += 1; continue loop } + [ ]+ { continue loop } + * { return \-1 } + */ + } +} + +function test(s, n) { + let padded_s = s + \(dq\e0\(dq.repeat(YYMAXFILL) + if (lex(padded_s) != n) throw \(dqerror!\(dq +} + +test(\(dq\(dq, 0) +test(\(dq\(aqunterminated\e\e\(aq\(dq, \-1) +test(\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq, 3) +test(\(dq\(aqunexpected \e0 null\(dq, \-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Custom checks +.sp +This example uses a custom end\-of\-input handling method based on generic API. +The program counts space\-separated single\-quoted strings. It is the same as the +\fI\%sentinel\fP example, except that the input is not null\-terminated. To cover up +for the absence of a sentinel character at the end of input, \fBYYPEEK\fP is +redefined to perform a bounds check before it reads the next input character. +This is inefficient because checks are done very often. If the check condition +fails, \fBYYPEEK\fP returns the real character, otherwise it returns a fake +sentinel character. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +// expects a string without terminating null +function lex(str) { + let cur = 0; + let lim = str.length + let count = 0 + + loop: while (true) { + /*!re2c + re2c:api = generic; + re2c:define:YYPEEK = \(dqcur < lim ? str.charCodeAt(cur) : 0\(dq; + re2c:define:YYSKIP = \(dqcur += 1\(dq; + re2c:yyfill:enable = 0; + + * { return \-1 } + [\ex00] { return count } + [ ]+ { continue loop } + [a\-z]+ { count += 1; continue loop } + */ + } +} + +function test(s, n) { if (lex(s) != n) throw \(dqerror!\(dq; } +test(\(dq\(dq, 0) +test(\(dqone two three\(dq, 3) +test(\(dqf0ur\(dq, \-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH BUFFER REFILLING +.sp +The need for buffering arises when the input cannot be mapped in memory all at +once: either it is too large, or it comes in a streaming fashion (like reading +from a socket). 
The usual technique in such cases is to allocate a fixed\-sized +memory buffer and process input in chunks that fit into the buffer. When the +current chunk is processed, it is moved out and new data is moved in. In +practice it is somewhat more complex, because lexer state consists not of a +single input position, but a set of interrelated positions: +.INDENT 0.0 +.IP \(bu 2 +cursor: the next input character to be read (\fBYYCURSOR\fP in C pointer API or +\fBYYSKIP\fP/\fBYYPEEK\fP in generic API) +.IP \(bu 2 +limit: the position after the last available input character (\fBYYLIMIT\fP in +C pointer API, implicitly handled by \fBYYLESSTHAN\fP in generic API) +.IP \(bu 2 +marker: the position of the most recent match, if any (\fBYYMARKER\fP in default +API or \fBYYBACKUP\fP/\fBYYRESTORE\fP in generic API) +.IP \(bu 2 +token: the start of the current lexeme (implicit in re2c API, as it is not +needed for the normal lexer operation and can be defined and updated by the +user) +.IP \(bu 2 +context marker: the position of the trailing context (\fBYYCTXMARKER\fP in +C pointer API or \fBYYBACKUPCTX\fP/\fBYYRESTORECTX\fP in generic API) +.IP \(bu 2 +tag variables: submatch positions (defined with \fB/*!stags:re2c*/\fP and +\fB/*!mtags:re2c*/\fP directives and +\fBYYSTAGP\fP/\fBYYSTAGN\fP/\fBYYMTAGP\fP/\fBYYMTAGN\fP in generic API) +.UNINDENT +.sp +Not all these are used in every case, but if used, they must be updated by +\fBYYFILL\fP\&. All active positions are contained in the segment between token and +cursor, therefore everything between buffer start and token can be discarded, +the segment from token and up to limit should be moved to the beginning of +buffer, and the free space at the end of buffer should be filled with new data. +In order to avoid frequent \fBYYFILL\fP calls it is best to fill in as many input +characters as possible (even though fewer characters might suffice to resume the +lexer). 
The details of \fBYYFILL\fP implementation are slightly different +depending on which EOF handling method is used: the case of EOF rule is somewhat +simpler than the case of bounds\-checking with padding. Also note that if +\fB\-f \-\-storable\-state\fP option is used, \fBYYFILL\fP has slightly different +semantics (described in the section about storable state). +.SS YYFILL with sentinel +.sp +If EOF rule is used, \fBYYFILL\fP is a function\-like primitive that accepts +no arguments and returns a value which is checked against zero. \fBYYFILL\fP +invocation is triggered by condition \fBYYLIMIT <= YYCURSOR\fP in C pointer API and +\fBYYLESSTHAN()\fP in generic API. A non\-zero return value means that \fBYYFILL\fP +has failed. A successful \fBYYFILL\fP call must supply at least one character and +adjust input positions accordingly. Limit must always be set to one after the +last input position in buffer, and the character at the limit position must be +the sentinel symbol specified by \fBre2c:eof\fP configuration. The pictures below +show the relative locations of input positions in buffer before and after +\fBYYFILL\fP call (sentinel symbol is marked with \fB#\fP, and the second picture +shows the case when there is not enough input to fill the whole buffer). +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-\-\-\-\-\-\-\-\-\-E\-> + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-\-\-\-\-\-\-\-\-\-E#\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-E (EOF) + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-E#........ 
+ buffer, marker cursor limit + token +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses EOF rule. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const fs = require(\(aqfs\(aq) + +const BUFSIZE = 4096 +const OK = 0 +const EOF = 1 +const LONG_LEXEME = 2 + +function fill(st) { + if (st.eof) return EOF + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < 1) return LONG_LEXEME + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor \-= st.token; + st.yymarker \-= st.token; + st.yylimit \-= st.token; + st.token = 0; + + // Read a new chunk of data from file and append it to \(gayyinput\(ga. + let want = BUFSIZE \- st.yylimit \- 1 // \-1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.eof = nread < want // end of file? + st.yylimit += nread + st.yyinput.writeUInt8(0, st.yylimit) // sentinel + + return OK +} + +function lex(yyrecord, count) { + loop: while (true) { + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:define:YYPEEK = \(dqreadUInt8\(dq; + re2c:define:YYFILL = \(dqfill(yyrecord) == OK\(dq; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1 } + $ { return count } + [ ]+ { continue loop } + str { count += 1; continue loop } + */ + } +} + +function main() { + let fname = \(dqinput\(dq + + // Create input file. + let content = \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq.repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. 
+ let limit = BUFSIZE \- 1 // exclude terminating null + let st = { + file: fs.openSync(fname, \(aqr\(aq), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + eof: false + } + + // Run lexer on the prepared file. + if (lex(st, 0) != 3 * BUFSIZE) { throw \(dqerror :[\(dq } + + // Cleanup. + fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS YYFILL with padding +.sp +In the default case (when EOF rule is not used) \fBYYFILL\fP is a function\-like +primitive that accepts a single argument and does not return any value. +\fBYYFILL\fP invocation is triggered by condition \fB(YYLIMIT \- YYCURSOR) < n\fP in +C pointer API and \fBYYLESSTHAN(n)\fP in generic API. The argument passed to +\fBYYFILL\fP is the minimal number of characters that must be supplied. If it +fails to do so, \fBYYFILL\fP must not return to the lexer (for that reason it is +best implemented as a macro that returns from the calling function on failure). +In case of a successful \fBYYFILL\fP invocation the limit position must be set +either to one after the last input position in buffer, or to the end of +\fBYYMAXFILL\fP padding (in case \fBYYFILL\fP has successfully read at least \fBn\fP +characters, but not enough to fill the entire buffer). The pictures below show +the relative locations of input positions in buffer before and after \fBYYFILL\fP +invocation (\fBYYMAXFILL\fP padding on the second picture is marked with \fB#\fP +symbols). 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F (EOF) + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F############### + buffer, marker cursor limit + token <\- YYMAXFILL \-> +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses bounds\-checking with padding. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const fs = require(\(aqfs\(aq) + +const BUFSIZE = 4096 +const OK = 0 +const EOF = 1 +const LONG_LEXEME = 2 +/*!max:re2c*/ + +function fill(st, need) { + if (st.eof) return EOF + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < need) return LONG_LEXEME + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor \-= st.token; + st.yylimit \-= st.token; + st.token = 0; + + // Read a new chunk of data from file and append it to \(gayyinput\(ga. 
+ let want = BUFSIZE \- st.yylimit \- 1 // \-1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.yylimit += nread + if (nread < want) { + st.eof = true // end of file + st.yyinput.write(\(dq\e0\(dq.repeat(YYMAXFILL), st.yylimit) + st.yylimit += YYMAXFILL + } + + return OK +} + +function lex(yyrecord, count) { + loop: while (true) { + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:define:YYPEEK = \(dqreadUInt8\(dq; + re2c:define:YYFILL = \(dqif (fill(yyrecord, @@) != OK) return \-1;\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // Check that it is the sentinel, not some unexpected null. + return yyrecord.token == yyrecord.yylimit \- YYMAXFILL ? count : \-1 + } + str { count += 1; continue loop } + [ ]+ { continue loop } + * { return \-1 } + */ + } +} + +function main() { + let fname = \(dqinput\(dq + + // Create input file. + let content = \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq.repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. + let limit = BUFSIZE \- 1 // exclude terminating null + let st = { + file: fs.openSync(fname, \(aqr\(aq), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + token: limit, + eof: false + } + + // Run lexer on the prepared file. + if (lex(st, 0) != 3 * BUFSIZE) { throw \(dqerror :[\(dq } + + // Cleanup. + fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH MULTIPLE BLOCKS +.sp +Sometimes it is necessary to have multiple interrelated lexers (for example, if +there is a high\-level state machine that transitions between lexer modes). This +can be implemented using multiple connected re2c blocks. Another option is to +use \fI\%start conditions\fP\&. +.sp +The implementation of connections between blocks depends on the target language. 
+In languages that have \fBgoto\fP statement (such as C/C++ and Go) one can have +all blocks in one function, each of them prefixed with a label. Transition from +one block to another is a simple \fBgoto\fP\&. +In languages that do not have \fBgoto\fP (such as Rust) it is necessary to use a +loop with a switch on a state variable, similar to the \fByystate\fP loop/switch +generated by re2c, or else wrap each block in a function and use function calls. +.sp +The example below uses multiple blocks to parse binary, octal, decimal and +hexadecimal numbers. Each base has its own block. The initial block determines +base and dispatches to other blocks. Common configurations are defined in a +separate block at the beginning of the program; they are inherited by the other +blocks. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +/*!re2c // Common re2c definitions shared between all functions. + re2c:api = record; + re2c:variable:yyrecord = st; + re2c:yyfill:enable = 0; +*/ + +function parse_u32(str) { + let st = { + yyinput: str, + yycursor: 0, + yymarker: 0 + } + /*!re2c + \(aq0b\(aq / [01] { return parse_bin(st) } + \(dq0\(dq { return parse_oct(st) } + \(dq\(dq / [1\-9] { return parse_dec(st) } + \(aq0x\(aq / [0\-9a\-fA\-F] { return parse_hex(st) } + * { return null } + */ +} + +function parse_bin(st) { + n = 0 + loop: while (true) { + /*!re2c + [01] { n = n * 2 + (st.yyinput.charCodeAt(st.yycursor \- 1) \- 48); continue loop } + * { return n } + */ + } +} + +function parse_oct(st) { + n = 0 + loop: while (true) { + /*!re2c + [0\-7] { n = n * 8 + (st.yyinput.charCodeAt(st.yycursor \- 1) \- 48); continue loop } + * { return n } + */ + } +} + +function parse_dec(st) { + n = 0 + loop: while (true) { + /*!re2c + [0\-9] { n = n * 10 + (st.yyinput.charCodeAt(st.yycursor \- 1) \- 48); continue loop } + * { return n } + */ + } +} + +function parse_hex(st) { + n = 0 + loop: while (true) { + /*!re2c + [0\-9] { n = n * 16 + 
(st.yyinput.charCodeAt(st.yycursor \- 1) \- 48); continue loop } + [a\-f] { n = n * 16 + (st.yyinput.charCodeAt(st.yycursor \- 1) \- 87); continue loop } + [A\-F] { n = n * 16 + (st.yyinput.charCodeAt(st.yycursor \- 1) \- 55); continue loop } + * { return n } + */ + } +} + +function test(s, n) { + if (parse_u32(s) != n) throw \(dqerror!\(dq +} + +test(\(dq\e0\(dq, null) +test(\(dq1234567890\e0\(dq, 1234567890) +test(\(dq0b1101\e0\(dq, 13) +test(\(dq0x7Fe\e0\(dq, 2046) +test(\(dq0644\e0\(dq, 420) +test(\(dq9999999999\e0\(dq, 9999999999) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH START CONDITIONS +.sp +Start conditions are enabled with \fB\-\-start\-conditions\fP option. They provide a +way to encode multiple interrelated automata within the same re2c block. +.sp +Each condition corresponds to a single automaton and has a unique name specified +by the user and a unique internal number defined by re2c. The numbers are used +to switch between conditions: the generated code uses \fBYYGETCONDITION\fP and +\fBYYSETCONDITION\fP primitives to get the current condition or set it to the +given number. Use \fB/*!conditions:re2c*/\fP directive or the \fB\-\-header\fP option +to generate numeric condition identifiers. Configuration +\fBre2c:cond:enumprefix\fP specifies the generated identifier prefix. +.sp +In condition mode every rule must be prefixed with a list of comma\-separated +condition names in angle brackets, or a wildcard \fB<*>\fP to denote all +conditions. The rule syntax is extended as follows: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB< cond\-list > regexp action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp => cond action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP, sets the current condition to \fBcond\fP and +executes the associated \fBaction\fP\&. 
+.TP +.B \fB< cond\-list > regexp :=> cond\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and immediately transitions to \fBcond\fP (there is +no semantic action). +.TP +.B \fB<! cond\-list > action\fP +The \fBaction\fP is prepended to semantic actions of all rules for every +condition on the \fBcond\-list\fP\&. This may be used to deduplicate common +code. +.TP +.B \fB< > action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and executes the \fBaction\fP\&. +.TP +.B \fB< > => cond action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string, sets the current condition to +\fBcond\fP and executes the \fBaction\fP\&. +.TP +.B \fB< > :=> cond\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and immediately transitions to +\fBcond\fP\&. +.UNINDENT +.UNINDENT +.UNINDENT +.sp +The code re2c generates for conditions depends on whether re2c uses goto/label +approach or loop/switch approach to encode the automata. +.sp +In languages that have \fBgoto\fP statement (such as C/C++ and Go) conditions are +naturally implemented as blocks of code prefixed with labels of the form +\fByyc_<cond>\fP, where \fBcond\fP is a condition name (label prefix can be changed +with \fBre2c:cond:prefix\fP). Transitions between conditions are implemented using +\fBgoto\fP and condition labels. Before all conditions re2c generates an initial +switch on \fBYYGETCONDITION\fP that jumps to the start state of the current condition. +The shortcut rules \fB:=>\fP bypass the initial switch and jump directly to the +specified condition (\fBre2c:cond:goto\fP can be used to change the default +behavior). The rules with semantic actions do not automatically jump to the next +condition; this should be done by the user\-defined action code.
+.sp +In languages that do not have \fBgoto\fP (such as Rust) re2c reuses the +\fByystate\fP variable to store condition numbers. Each condition gets a numeric +identifier equal to the number of its start state, and a switch between +conditions is no different than a switch between DFA states of a single +condition. There is no need for a separate initial condition switch. +(Since the same approach is used to implement storable states, +\fBYYGETCONDITION\fP/\fBYYSETCONDITION\fP are redundant if both storable states and +conditions are used). +.sp +The program below uses start conditions to parse binary, octal, decimal and +hexadecimal numbers. There is a single block where each base has its own +condition, and the initial condition is connected to all of them. User\-defined +variable \fBcond\fP stores the current condition number; it is initialized to the +number of the initial condition generated with \fB/*!conditions:re2c*/\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT \-c + +/*!conditions:re2c*/ + +function parse_u32(yyinput) { + let yycursor = 0 + let yycond = YYC_INIT + let n = 0 + + loop: while (true) { + /*!re2c + re2c:yyfill:enable = 0; + re2c:indent:top = 2; + + \(aq0b\(aq / [01] :=> BIN + \(dq0\(dq :=> OCT + \(dq\(dq / [1\-9] :=> DEC + \(aq0x\(aq / [0\-9a\-fA\-F] :=> HEX + * { return null } + + [01] { n = n * 2 + (yyinput.charCodeAt(yycursor \- 1) \- 48); continue loop } + [0\-7] { n = n * 8 + (yyinput.charCodeAt(yycursor \- 1) \- 48); continue loop } + [0\-9] { n = n * 10 + (yyinput.charCodeAt(yycursor \- 1) \- 48); continue loop } + [0\-9] { n = n * 16 + (yyinput.charCodeAt(yycursor \- 1) \- 48); continue loop } + [a\-f] { n = n * 16 + (yyinput.charCodeAt(yycursor \- 1) \- 87); continue loop } + [A\-F] { n = n * 16 + (yyinput.charCodeAt(yycursor \- 1) \- 55); continue loop } + + * { return n } + */ + } +} + +function test(s, n) { + if (parse_u32(s) != n) throw \(dqerror!\(dq +} + +test(\(dq\e0\(dq, null) 
+test(\(dq1234567890\e0\(dq, 1234567890) +test(\(dq0b1101\e0\(dq, 13) +test(\(dq0x7Fe\e0\(dq, 2046) +test(\(dq0644\e0\(dq, 420) +test(\(dq9999999999\e0\(dq, 9999999999) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH STORABLE STATE +.sp +With \fB\-\-storable\-state\fP option re2c generates a lexer that can store +its current state, return to the caller, and later resume operations exactly +where it left off. The default mode of operation in re2c is a \(dqpull\(dq model, +in which the lexer \(dqpulls\(dq more input whenever it needs it. This may be +unacceptable in cases when the input becomes available piece by piece (for +example, if the lexer is invoked by the parser, or if the lexer program +communicates via a socket protocol with some other program that must wait for a +reply from the lexer before it transmits the next message). Storable state +feature is intended exactly for such cases: it allows one to generate lexers that +work in a \(dqpush\(dq model. When the lexer needs more input, it stores its state and +returns to the caller. Later, when more input becomes available, the caller +resumes the lexer exactly where it stopped. There are a few changes necessary +compared to the \(dqpull\(dq model: +.INDENT 0.0 +.IP \(bu 2 +Define \fBYYGETSTATE()\fP and \fBYYSETSTATE(state)\fP primitives. +.IP \(bu 2 +Define \fByych\fP, \fByyaccept\fP (if used) and \fBstate\fP variables as a part of +persistent lexer state. The \fBstate\fP variable should be initialized to \fB\-1\fP\&. +.IP \(bu 2 +\fBYYFILL\fP should return to the outer program instead of trying to supply more +input. Return code should indicate that lexer needs more input. +.IP \(bu 2 +The outer program should recognize situations when lexer needs more input and +respond appropriately. +.IP \(bu 2 +Optionally use \fBgetstate:re2c\fP to generate \fBYYGETSTATE\fP switch detached +from the main lexer. This only works for languages that have \fBgoto\fP (not in +\fB\-\-loop\-switch\fP mode).
+.IP \(bu 2 +Use \fBre2c:eof\fP and the \fI\%sentinel with bounds checks\fP method to handle the +end of input. Padding\-based method may not work because it is unclear when to +append padding: the current end of input may not be the ultimate end of input, +and appending padding too early may cut off a partially read greedy lexeme. +Furthermore, due to high\-level program logic getting more input may depend on +processing the lexeme at the end of buffer (which already is blocked due to +the end\-of\-input condition). +.UNINDENT +.sp +Here is an example of a \(dqpush\(dq model lexer that simulates reading packets from a +socket. The lexer loops until it encounters the end of input and returns to the +calling function. The calling function provides more input by \(dqsending\(dq the next +packet and resumes lexing. This process stops when all the packets have been +sent, or when there is an error. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT \-f + +const fs = require(\(aqfs\(aq) + +// Use a small buffer to cover the case when a lexeme doesn\(aqt fit. +// In real world use a larger buffer. +const BUFSIZE = 10 +const DEBUG = false +const END = 0 +const READY = 1 +const WAITING = 2 +const BIG_PACKET = 3 +const BAD_PACKET = 4 + +function log() { + if (DEBUG) console.log.apply(console, arguments) +} + +function fill(st) { + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < 1) return BIG_PACKET + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor \-= st.token; + st.yymarker \-= st.token; + st.yylimit \-= st.token; + st.token = 0; + + // Read a new chunk of data from file and append it to \(gayyinput\(ga. 
+ let want = BUFSIZE \- st.yylimit \- 1 // \-1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.yylimit += nread + st.yyinput.writeUInt8(0, st.yylimit) // sentinel + + return READY +} + +function lex(yyrecord) { + loop: while (true) { + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:define:YYPEEK = \(dqreadUInt8\(dq; + re2c:define:YYFILL = \(dqreturn WAITING\(dq; + re2c:eof = 0; + + packet = [a\-z]+[;]; + + * { return BAD_PACKET } + $ { return END } + packet { yyrecord.received += 1; continue loop } + */ + } +} + +function test(packets, expect) { + // Emulate a \(dqpipe\(dq by opening the same file for reading and writing. + let fname = \(dqinput\(dq + let fw = fs.openSync(fname, \(aqw\(aq); + let fr = fs.openSync(fname, \(aqr\(aq); + + // Init lexer state. + let limit = BUFSIZE \- 1 // exclude terminating null + let st = { + file: fr, + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + yystate: \-1, + received: 0 + } + + // Main loop. The buffer contains incomplete data which appears packet by + // packet. When the lexer needs more input it saves its internal state and + // returns to the caller which should provide more input and resume lexing. 
+ let send = 0 + let status + loop: while (true) { + status = lex(st) + + if (status == END) { + log(\(dqdone: got\(dq, st.received, \(dqpackets\(dq) + break loop + } else if (status == WAITING) { + log(\(dqwaiting...\(dq); + + if (send < packets.length) { + log(\(dqsent packet\(dq, send, packets[send]) + fs.writeFileSync(fw, packets[send]) + send += 1 + } + + status = fill(st) + log(\(dqqueue:\(dq, st.yyinput.toString()) + if (status == BIG_PACKET) { + log(\(dqerror: packet too big\(dq) + break loop + } + + if (status != READY) throw \(dqexpected READY\(dq + } else { + if (status != BAD_PACKET) throw \(dqexpected BAD_PACKET\(dq + log(\(dqerror: ill\-formed packet\(dq) + break loop + } + } + + // Check results. + if (status != expect) throw \(dqunexpected status\(dq + if (status == END && st.received != send) \(dqunexpected packet count\(dq + + // Cleanup. + fs.unlinkSync(fname, function(err){ if (err) throw err; }) +} + +function main() { + test([], END) + test([\(dqzero;\(dq, \(dqone;\(dq, \(dqtwo;\(dq, \(dqthree;\(dq, \(dqfour;\(dq], END) + test([\(dqzer0;\(dq], BAD_PACKET) + test([\(dqgoooooooooogle;\(dq], BIG_PACKET) +} + +main() + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH REUSABLE BLOCKS +.sp +Reusable blocks are re2c blocks that can be reused any number of times and +combined with other re2c blocks. They are defined with +\fB/*!rules:re2c[:<name>] ... */\fP (the \fB<name>\fP is optional). A rules block +can be used in two contexts: either in a use block, or in a use directive inside +of another block. The code for a rules block is generated at every point of use. +.sp +Use blocks are defined with \fB/*!use:re2c[:<name>] ... */\fP\&. The \fB<name>\fP +is optional; if not specified, the associated rules block is the most recent one +(whether named or unnamed). A use block can add named definitions, +configurations and rules of its own.
+An important use case for use blocks is a lexer that supports multiple input +encodings: the same rules block is reused multiple times with encoding\-specific +configurations (see the example below). +.sp +In\-block use directive \fB!use:<name>;\fP can be used from inside of a re2c +block. It merges the referenced block \fB<name>\fP into the current one. If some +of the merged rules and configurations overlap with the previously defined ones, +conflicts are resolved in the usual way: the earliest rule takes priority, and +latest configuration overrides preceding ones. The exceptions are the special +rules \fB*\fP, \fB$\fP and (in condition mode) \fB<!>\fP, for which a block\-local +definition overrides any inherited ones. Use directive allows one to combine +different re2c blocks together in one block (see the example below). +.sp +Named blocks and in\-block use directive were added in re2c version 2.2. +Since that version reusable blocks are allowed by default (no special option +is needed). Before version 2.2 reuse mode was enabled with \fB\-r \-\-reusable\fP +option. Before version 1.2 reusable blocks could not be mixed with normal +blocks. +.SS Example of a \fB!use\fP directive +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +// This example shows how to combine reusable re2c blocks: two blocks +// (\(aqcolors\(aq and \(aqfish\(aq) are merged into one. The \(aqsalmon\(aq rule occurs +// in both blocks; the \(aqfish\(aq block takes priority because it is used +// earlier. Default rule * occurs in all three blocks; the local (not +// inherited) definition takes priority.
+ +const COLOR = 1 +const FISH = 2 +const DUNNO = 3 + +/*!rules:re2c:colors + * { throw \(dqah\(dq } + \(dqred\(dq | \(dqsalmon\(dq | \(dqmagenta\(dq { return COLOR } +*/ + +/*!rules:re2c:fish + * { throw \(dqoh\(dq } + \(dqhaddock\(dq | \(dqsalmon\(dq | \(dqeel\(dq { return FISH } +*/ + +function lex(yyinput) { + let yycursor = 0 + /*!re2c + re2c:yyfill:enable = 0; + + !use:fish; + !use:colors; + * { return DUNNO } // overrides inherited \(aq*\(aq rules + */ +} + +function test(s, n) { if (lex(s) != n) throw \(dqerror!\(dq; } + +test(\(dqsalmon\(dq, FISH) +test(\(dqwhat?\(dq, DUNNO) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Example of a \fB/*!use:re2c ... */\fP block +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT \-\-input\-encoding utf8 + +// This example supports multiple input encodings: UTF\-8 and UTF\-32. +// Both lexers are generated from the same rules block, and the use +// blocks add only encoding\-specific configurations. +/*!rules:re2c + re2c:yyfill:enable = 0; + re2c:define:YYPEEK = \(dqat\(dq; + + \(dq∀x ∃y\(dq { return yycursor } + * { return null } +*/ + +function lex_utf8(yyinput) { + let yycursor = 0 + /*!use:re2c + re2c:encoding:utf8 = 1; + */ +} + +function lex_utf32(yyinput) { + let yycursor = 0 + /*!use:re2c + re2c:encoding:utf32 = 1; + */ +} + +function test(f, s) { + if (f(s) != s.length) throw \(dqerror!\(dq +} + +test(lex_utf8, [0xe2, 0x88, 0x80, 0x78, 0x20, 0xe2, 0x88, 0x83, 0x79]) +test(lex_utf32, [0x2200, 0x78, 0x20, 0x2203, 0x79]) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SUBMATCH EXTRACTION +.sp +re2c has two options for submatch extraction. +.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration.
Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string. +\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars:re2c\fP and \fBmvars:re2c\fP directives. +If there is more than one way tags can be matched against the input, +ambiguity is resolved using leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration. There are two +flavours for different disambiguation policies, \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and, \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy. In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted. +The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&. re2c provides a directive \fBmaxnmatch:re2c\fP +that defines \fBYYMAXNMATCH\fP, a constant that equals to the maximum value of +\fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration. 
The only difference with \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2c generates variables \fByytl<k>\fP and +\fByytr<k>\fP for \fIk\fP\-th capturing group (the user should declare these with +\fBsvars:re2c\fP directive). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars = 1\fP for POSIX longest\-match policy. +.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there\(aqre no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags:re2c\fP or \fBmtags:re2c\fP directives. +If the lexer state is stored, tag variables should be part of it. They also +need to be updated by \fBYYFILL\fP\&.
+.sp +S\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +save input position to an s\-tag: \fBt = YYCURSOR\fP with C pointer API or a +user\-defined operation \fBYYSTAGP(t)\fP with generic API +.IP \(bu 2 +save default value to an s\-tag: \fBt = NULL\fP with C pointer API or a +user\-defined operation \fBYYSTAGN(t)\fP with generic API +.IP \(bu 2 +copy one s\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +M\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +append input position to an m\-tag: a user\-defined operation \fBYYMTAGP(t)\fP +with both default and generic API +.IP \(bu 2 +append default value to an m\-tag: a user\-defined operation \fBYYMTAGN(t)\fP +with both default and generic API +.IP \(bu 2 +copy one m\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +S\-tags can be implemented as scalar values (pointers or offsets). M\-tags need a +more complex representation, as they need to store a sequence of tag values. The +most naive and inefficient representation of an m\-tag is a list (array, vector) +of tag values; a more efficient representation is to store all m\-tags in a +prefix\-tree represented as array of nodes \fB(v, p)\fP, where \fBv\fP is tag value +and \fBp\fP is a pointer to parent node. +.sp +Here is a simple example of using s\-tags to parse semantic versions consisting +of three numeric components: major, minor, patch (the latter is optional). +See below for a more complex example that uses \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const assert = require(\(aqassert\(aq); + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqlet @@\en\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). 
+ /*!stags:re2c format = \(dqlet @@\en\(dq; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0\-9]+; + + @t1 num @t2 \(dq.\(dq @t3 num @t4 (\(dq.\(dq @t5 num)? [\ex00] { + return { + major: Number(yyinput.substring(t1, t2)), + minor: Number(yyinput.substring(t3, t4)), + patch: t5 == \-1 ? 0 : Number(yyinput.substring(t5, yycursor \- 1)) + } + } + * { return null } + */ +} + +assert.deepEqual(parse(\(dq23.34\e0\(dq), {major: 23, minor: 34, patch: 0}) +assert.deepEqual(parse(\(dq1.2.99999\e0\(dq), {major: 1, minor: 2, patch: 99999}) +assert.deepEqual(parse(\(dq1.a\e0\(dq), null) + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is a more complex example of using s\-tags with \fBYYFILL\fP to parse a file +with newline\-separated semantic versions. Tag variables are part of the lexer +state, and they are adjusted in \fBYYFILL\fP like other input positions. +Note that it is necessary for s\-tags because their values are invalidated after +shifting buffer contents. It may not be necessary in a custom implementation +where tag variables store offsets relative to the start of the input string +rather than the buffer, which may be the case with m\-tags. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const assert = require(\(aqassert\(aq); +const fs = require(\(aqfs\(aq) + +const BUFSIZE = 4096 +const OK = 0 +const EOF = 1 +const LONG_LEXEME = 2 + +function fill(st) { + if (st.eof) return EOF + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < 1) return LONG_LEXEME + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor \-= st.token; + st.yymarker \-= st.token; + st.yylimit \-= st.token; + /*!stags:re2c format = \(dqif (st.@@ != \-1) st.@@ \-= st.token\en\(dq; */ + st.token = 0; + + // Read a new chunk of data from file and append it to \(gayyinput\(ga. 
+ let want = BUFSIZE \- st.yylimit \- 1 // \-1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.eof = nread < want // end of file? + st.yylimit += nread + st.yyinput.writeUInt8(0, st.yylimit) // sentinel + + return OK +} + +function lex(st) { + let vers = [] + loop: while (true) { + st.token = st.yycursor + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqlet @@\en\(dq; */ + + /*!re2c + re2c:api = record; + re2c:variable:yyrecord = st; + re2c:define:YYPEEK = \(dqreadUInt8\(dq; + re2c:define:YYFILL = \(dqfill(st) == OK\(dq; + re2c:eof = 0; + re2c:tags = 1; + + num = [0\-9]+; + + num @t1 \(dq.\(dq @t2 num @t3 (\(dq.\(dq @t4 num)? [\en] { + vers.push({ + major: Number(st.yyinput.subarray(st.token, t1)), + minor: Number(st.yyinput.subarray(t2, t3)), + patch: t4 == \-1 ? 0 : Number(st.yyinput.subarray(t4, st.yycursor \- 1)) + }) + continue loop + } + $ { return vers } + * { return null } + */ + } +} + +function main() { + let fname = \(dqinput\(dq + + // Create input file. + let content = \(dq1.22.333\en\(dq.repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. + let limit = BUFSIZE \- 1 // exclude terminating null + let st = { + file: fs.openSync(fname, \(aqr\(aq), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(dq@@: \-1,\en\(dq; */ + eof: false + } + + // Run lexer on the prepared file. + assert.deepEqual(lex(st), Array(BUFSIZE).fill({major: 1, minor: 22, patch: 333})) + + // Cleanup. + fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using capturing groups to parse semantic versions. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const assert = require(\(aqassert\(aq); + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqlet @@\en\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(dqlet @@\en\(dq; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:captvars = 1; + + num = [0\-9]+; + + (num) \(dq.\(dq (num) (\(dq.\(dq num)? [\ex00] { + return { + major: Number(yyinput.substring(yytl1, yytr1)), + minor: Number(yyinput.substring(yytl2, yytr2)), + patch: yytl3 == \-1 ? 0 : Number(yyinput.substring(yytl3 + 1, yytr3)) + } + } + * { return null } + */ +} + +assert.deepEqual(parse(\(dq23.34\e0\(dq), {major: 23, minor: 34, patch: 0}) +assert.deepEqual(parse(\(dq1.2.99999\e0\(dq), {major: 1, minor: 2, patch: 99999}) +assert.deepEqual(parse(\(dq1.a\e0\(dq), null) + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using m\-tags to parse a version with a variable number of +components. Tag variables are stored in a trie. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const assert = require(\(aqassert\(aq) + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqlet @@\en\(dq; */ + /*!mvars:re2c format = \(dqlet @@\en\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). 
+ /*!stags:re2c format = \(dqlet @@\en\(dq; */ + /*!mtags:re2c format = \(dqlet @@ = []\en\(dq; */ + + /*!re2c + re2c:define:YYMTAGP = \(dq@@.push(yycursor)\(dq; + re2c:define:YYMTAGN = \(dq\(dq; // do nothing + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0\-9]+; + + @t1 num @t2 (\(dq.\(dq #t3 num #t4)* [\ex00] { + let vers = [Number(yyinput.substring(t1, t2))] + for (let i = 0; i < t3.length; ++i) { + vers.push(Number(yyinput.substring(t3[i], t4[i]))) + } + return vers + } + * { return null } + */ +} + +assert.deepEqual(parse(\(dq1\e0\(dq), [1]) +assert.deepEqual(parse(\(dq1.2.3.4.5.6.7\e0\(dq), [1, 2, 3, 4, 5, 6, 7]) +assert.deepEqual(parse(\(dq1.2.\e0\(dq), null) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH ENCODING SUPPORT +.sp +It is necessary to understand the difference between \fBcode points\fP and +\fBcode units\fP\&. A code point is a numeric identifier of a symbol. A code unit is +the smallest unit of storage in the encoded text. A single code point may be +represented with one or more code units. In a fixed\-length encoding all code +points are represented with the same number of code units. In a variable\-length +encoding code points may be represented with a different number of code units. +Note that the \(dqany\(dq rule \fB[^]\fP matches any code point, but not necessarily +any code unit (the only way to match any code unit regardless of the encoding +is the default rule \fB*\fP). +The generated lexer works with a stream of code units: \fByych\fP stores a code +unit, and \fBYYCTYPE\fP is the code unit type. Regular expressions, on the other +hand, are specified in terms of code points. When re2c compiles regular +expressions to automata it translates code points to code units. This is +generally not a simple mapping: in variable\-length encodings a single code point +range may get translated to a complex code unit graph. +The following encodings are supported: +.INDENT 0.0 +.IP \(bu 2 +\fBASCII\fP (enabled by default). 
It is a fixed\-length encoding with code space +\fB[0\-255]\fP and 1\-byte code points and code units. +.IP \(bu 2 +\fBEBCDIC\fP (enabled with \fB\-\-ebcdic\fP or \fBre2c:encoding:ebcdic\fP). It is a +fixed\-length encoding with code space \fB[0\-255]\fP and 1\-byte code points and +code units. +.IP \(bu 2 +\fBUCS2\fP (enabled with \fB\-\-ucs2\fP or \fBre2c:encoding:ucs2\fP). It is a +fixed\-length encoding with code space \fB[0\-0xFFFF]\fP and 2\-byte code points +and code units. +.IP \(bu 2 +\fBUTF8\fP (enabled with \fB\-\-utf8\fP or \fBre2c:encoding:utf8\fP). It is a +variable\-length Unicode encoding. Code unit size is 1 byte. Code points are +represented with 1 \-\- 4 code units. +.IP \(bu 2 +\fBUTF16\fP (enabled with \fB\-\-utf16\fP or \fBre2c:encoding:utf16\fP). It is a +variable\-length Unicode encoding. Code unit size is 2 bytes. Code points are +represented with 1 \-\- 2 code units. +.IP \(bu 2 +\fBUTF32\fP (enabled with \fB\-\-utf32\fP or \fBre2c:encoding:utf32\fP). It is a +fixed\-length Unicode encoding with code space \fB[0\-0x10FFFF]\fP and 4\-byte code +points and code units. +.UNINDENT +.sp +Include file \fBinclude/unicode_categories.re\fP provides re2c definitions for the +standard Unicode categories. +.sp +Option \fB\-\-input\-encoding\fP specifies source file encoding, which can be used to +enable Unicode literals in regular expressions. For example +\fB\-\-input\-encoding utf8\fP tells re2c that the source file is in UTF8 (it differs +from \fB\-\-utf8\fP which sets input text encoding). Option \fB\-\-encoding\-policy\fP +specifies the way re2c handles Unicode surrogates (code points in range +\fB[0xD800\-0xDFFF]\fP). +.sp +Below is an example of a lexer for UTF8 encoded Unicode identifiers. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT \-\-utf8 \-s + +/*!include:re2c \(dqunicode_categories.re\(dq */ + +function lex(yyinput) { + let yycursor = 0 + /*!re2c + re2c:yyfill:enable = 0; + + // Simplified \(dqUnicode Identifier and Pattern Syntax\(dq + // (see https://unicode.org/reports/tr31) + id_start = L | Nl | [$_]; + id_continue = id_start | Mn | Mc | Nd | Pc | [\eu200D\eu05F3]; + identifier = id_start id_continue*; + + identifier { return true } + * { return false } + */ +} + +if (!lex(\(dq_Ыдентификатор\e0\(dq)) throw \(dqerror!\(dq + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH INCLUDE FILES +.sp +re2c allows one to include other files using directive \fB/*!include:re2c FILE */\fP +or \fB!include FILE ;\fP, where \fBFILE\fP is a path to the file to be included. +The first form should be used outside of re2c blocks, and the second form allows +one to include a file in the middle of a re2c block. re2c looks for included +files in the directory of the including file and in include locations, which +can be specified with \fB\-I\fP option. +Include directives in re2c work in the same way as C/C++ \fB#include\fP: the contents +of \fBFILE\fP are copy\-pasted verbatim in place of the directive. Include files +may have further includes of their own. Use \fB\-\-depfile\fP option to track build +dependencies of the output file on include files. +re2c provides some predefined include files that can be found in the +\fBinclude/\fP subdirectory of the project. These files contain definitions that +can be useful to other projects (such as Unicode categories) and form something +like a standard library for re2c. +Below is an example of using include directive. 
+.SS Include file 1 (definitions.js) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +const INT = 1 +const FLOAT = 2 +const NAN = 3 + +/*!re2c + number = [1\-9][0\-9]*; +*/ + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Include file 2 (extra_rules.re.inc) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// floating\-point numbers +frac = [0\-9]* \(dq.\(dq [0\-9]+ | [0\-9]+ \(dq.\(dq; +exp = \(aqe\(aq [+\-]? [0\-9]+; +float = frac exp? | [0\-9]+ exp; + +float { return FLOAT } + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +/*!include:re2c \(dqdefinitions.js\(dq */ + +function lex(yyinput) { + let yycursor = 0 + /*!re2c + re2c:yyfill:enable = 0; + + * { return NAN } + number { return INT } + !include \(dqextra_rules.re.inc\(dq; + */ +} + +function test(s, n) { + if (lex(s) != n) throw \(dqerror!\(dq +} + +test(\(dq123\e0\(dq, INT) +test(\(dq123.4567\e0\(dq, FLOAT) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH HEADER FILES +.sp +re2c allows one to generate header file from the input \fB\&.re\fP file using option +\fB\-t\fP, \fB\-\-type\-header\fP or configuration \fBre2c:flags:type\-header\fP and +directives \fB/*!header:re2c:on*/\fP and \fB/*!header:re2c:off*/\fP\&. The first directive +marks the beginning of header file, and the second directive marks the end of +it. Everything between these directives is processed by re2c, and the generated +code is written to the file specified by the \fB\-t \-\-type\-header\fP option (or +\fBstdout\fP if this option was not used). Autogenerated header file may be needed +in cases when re2c is used to generate definitions of constants, variables and +structs that must be visible from other translation units. +.sp +Here is an example of generating a header file that contains definition of the +lexer state with tag variables (the number of variables depends on the regular +grammar and is unknown to the programmer). 
+.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT \-\-header lexer/state.js + +let state = require(\(aq./lexer/state.js\(aq); + +/*!header:re2c:on*/ +exports.mk_state = function(str) { + return { + yyinput: str, + /*!stags:re2c format = \(dq@@: 0,\en\(dq; */ + yycursor: 0 + } +} +/*!header:re2c:off*/ + +function lex(yyrecord) { + let t + /*!re2c + re2c:api = record; + re2c:tags = 1; + re2c:yyfill:enable = 0; + re2c:header = \(dqlexer/state.js\(dq; + + [a]* @t [b]* { return t } + */ +} + +if (lex(state.mk_state(\(dqab\e0\(dq)) != 1) { + throw \(dqerror!\(dq +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Header file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// Generated by re2c + +exports.mk_state = function(str) { + return { + yyinput: str, + yyt1: 0, + + yycursor: 0 + } +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SKELETON PROGRAMS +.sp +With the \fB\-S, \-\-skeleton\fP option, re2c ignores all non\-re2c code and generates +a self\-contained C program that can be further compiled and executed. The +program consists of lexer code and input data. For each constructed DFA (block +or condition) re2c generates a standalone lexer and two files: an \fB\&.input\fP +file with strings derived from the DFA and a \fB\&.keys\fP file with expected match +results. The program runs each lexer on the corresponding \fB\&.input\fP file and +compares results with the expectations. +Skeleton programs are very useful for a number of reasons: +.INDENT 0.0 +.IP \(bu 2 +They can check correctness of various re2c optimizations (the data is +generated early in the process, before any DFA transformations have taken +place). +.IP \(bu 2 +Generating a set of input data with good coverage may be useful for both +testing and benchmarking. +.IP \(bu 2 +Generating self\-contained executable programs allows one to get minimized test +cases (the original code may be large or have a lot of dependencies). 
+.UNINDENT +.sp +The difficulty with generating input data is that for all but the most trivial +cases the number of possible input strings is too large (even if the string +length is limited). re2c solves this difficulty by generating sufficiently +many strings to cover almost all DFA transitions. It uses the following +algorithm. First, it constructs a skeleton of the DFA. For encodings with 1\-byte +code unit size (such as ASCII, UTF\-8 and EBCDIC) skeleton is just an exact copy +of the original DFA. For encodings with multibyte code units skeleton is a copy +of DFA with certain transitions omitted: namely, re2c takes at most 256 code +units for each disjoint continuous range that corresponds to a DFA transition. +The chosen values are evenly distributed and include range bounds. Instead of +trying to cover all possible paths in the skeleton (which is infeasible) re2c +generates sufficiently many paths to cover all skeleton transitions, and thus +trigger the corresponding conditional jumps in the lexer. +The algorithm implementation is limited by ~1Gb of transitions and consumes +constant amount of memory (re2c writes data to file as soon as it is generated). +.SH VISUALIZATION AND DEBUG +.sp +With the \fB\-D, \-\-emit\-dot\fP option, re2c does not generate code. Instead, +it dumps the generated DFA in DOT format. +One can convert this dump to an image of the DFA using Graphviz or another library. +Note that this option shows the final DFA after it has gone through a number of +optimizations and transformations. Earlier stages can be dumped with various debug +options, such as \fB\-\-dump\-nfa\fP, \fB\-\-dump\-dfa\-raw\fP etc. (see the full list of options). +.SH SEE ALSO +.sp +You can find more information about re2c at the official website: \fI\%http://re2c.org\fP\&. +Similar programs are flex(1), lex(1), quex(\fI\%http://quex.sourceforge.net\fP). +.SH AUTHORS +.sp +re2c was originally written by Peter Bumbulis (\fI\%peter@csg.uwaterloo.ca\fP) in 1993. 
+Marcus Boerger and Dan Nuffer spent several years to turn the original idea into +a production ready code generator. Since then it has been maintained and +developed by multiple volunteers, most notably, +Brian Young (\fI\%bayoung@acm.org\fP), +\fI\%Marcus Boerger\fP, +Dan Nuffer (\fI\%nuffer@users.sourceforge.net\fP), +\fI\%Ulya Trofimovich\fP (\fI\%skvadrik@gmail.com\fP), +\fI\%Serghei Iakovlev\fP, +\fI\%Sergei Trofimovich\fP, +\fI\%Petr Skocik\fP, +\fI\%ligfx\fP +and \fI\%raekye\fP\&. +.\" Generated by docutils manpage writer. +. diff --git a/bootstrap/doc/re2ocaml.1 b/bootstrap/doc/re2ocaml.1 new file mode 100644 index 000000000..36970e932 --- /dev/null +++ b/bootstrap/doc/re2ocaml.1 @@ -0,0 +1,3487 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "RE2C" 1 "" "" +.SH NAME +re2c \- generate fast lexical analyzers for C/C++, Go and Rust +.SH SYNOPSIS +.sp +Note: This manual is for OCaml, but it refers to re2c as the general program. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +re2c [ OPTIONS ] [ WARNINGS ] INPUT +re2go [ OPTIONS ] [ WARNINGS ] INPUT +re2rust [ OPTIONS ] [ WARNINGS ] INPUT +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Input can be either a file or \fB\-\fP for stdin. +.SH INTRODUCTION +.sp +re2c works as a preprocessor. 
It reads the input file (which is usually a +program in the target language, but can be anything) and looks for blocks of +code enclosed in special\-form comments. The text outside of these blocks is +copied verbatim into the output file. The contents of the blocks are processed +by re2c. It translates them to code in the target language and outputs the +generated code in place of the block. +.sp +Here is an example of a small program that checks if a given string contains a +decimal number: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT \-i *) + +open String + +type state = { + yyinput: string; + mutable yycursor: int; +} + +%{ + re2c:define:YYFN = [\(dqlex;bool\(dq, \(dqyyrecord;state\(dq]; + re2c:yyfill:enable = 0; + + number = [1\-9][0\-9]*; + + number { true } + * { false } +%} + +let main () = + let st = {yyinput = \(dq1234\ex00\(dq; yycursor = 0} + in if not (lex st) then raise (Failure \(dqerror\(dq) + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +In the output everything between \fB/*!re2c\fP and \fB*/\fP has been replaced with +the generated code: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* Generated by re2ocaml *) +(* re2ocaml $INPUT \-o $OUTPUT \-i *) + +open String + +type state = { + yyinput: string; + mutable yycursor: int; +} + + +let rec yy0 (yyrecord : state) : bool = + let yych = get yyrecord.yyinput yyrecord.yycursor in + yyrecord.yycursor <\- yyrecord.yycursor + 1; + match yych with + | \(aq1\(aq..\(aq9\(aq \-> (yy2 [@tailcall]) yyrecord + | _ \-> (yy1 [@tailcall]) yyrecord + +and yy1 (yyrecord : state) : bool = + false + +and yy2 (yyrecord : state) : bool = + let yych = get yyrecord.yyinput yyrecord.yycursor in + match yych with + | \(aq0\(aq..\(aq9\(aq \-> + yyrecord.yycursor <\- yyrecord.yycursor + 1; + (yy2 [@tailcall]) yyrecord + | _ \-> (yy3 [@tailcall]) yyrecord + +and yy3 (yyrecord : state) : bool = + true + +and lex (yyrecord : state) : bool = + (yy0 [@tailcall]) yyrecord + + + +let main () = 
+ let st = {yyinput = \(dq1234\ex00\(dq; yycursor = 0} + in if not (lex st) then raise (Failure \(dqerror\(dq) + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SYNTAX +.sp +A re2c program consists of a sequence of \fIblocks\fP intermixed with code in the +target language. There are three main kinds of blocks: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A \fIglobal block\fP contains definitions, configurations, directives and rules. +re2c compiles regular expressions associated with each rule into a +deterministic finite automaton, encodes it in the form of conditional jumps +in the target language and replaces the block with the generated code. Names +and configurations defined in a global block are added to the global scope +and become visible to subsequent blocks. At the start of the program the +global scope is initialized with command\-line \fI\%options\fP\&. +The \fB:\fP part is optional: if specified, the name can be used to +refer to the block in another part of the program. +.TP +.B \fB/*!local:re2c[:] ... */\fP +A \fIlocal block\fP is like a global block, but the names and configurations in +it have local scope (they do not affect other blocks). +.TP +.B \fB/*!rules:re2c[:] ... */\fP +A \fIrules block\fP is like a local block, but it does not generate any code and +is meant to be reused in other blocks. This is a way of sharing code +(more details in the \fI\%reusable blocks\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.sp +There are also many auxiliary blocks; see section \fI\%blocks and directives\fP for a +full list of them. A block may contain the following kinds of statements: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB = ;\fP +A \fIdefinition\fP binds a name to a regular expression. Names may contain +alphanumeric characters and underscore. The \fI\%regular expressions\fP section +gives an overview of re2c syntax for regular expressions. 
Once defined, the +name can be used in other regular expressions and in rules. Recursion in +named definitions is not allowed, and each name should be defined before it +is used. A block inherits named definitions from the global scope. +Redefining a name that exists in the current scope is an error. +.TP +.B \fB = ;\fP +A \fIconfiguration\fP allows one to change re2c behavior and customize the +generated code. For a full list of configurations supported by re2c see the +\fI\%configurations\fP section. Depending on a particular configuration, the +value can be a keyword, a nonnegative integer number or a one\-line string +which should be enclosed in double or single quotes unless it consists of +alphanumeric characters. A block inherits configurations from the global +scope and may redefine them or add new ones. Configurations defined inside +of a block affect the whole block, even if they appear at the end of it. +.TP +.B \fB { }\fP +A \fIrule\fP binds a regular expression to a semantic action (a block of code in +the target language). If the regular expression matches, the associated +semantic action is executed. If multiple rules match, the longest match +takes precedence. If multiple rules match the same string, the earliest one +takes precedence. There are two special rules: the default rule \fB*\fP and +the end of input rule \fB$\fP\&. The default rule should always be defined, it +has the lowest priority regardless of its place in the block, and it matches +any code unit (not necessarily a valid character, see the +\fI\%encoding support\fP section). The end of input rule should be defined if the +corresponding method for \fI\%handling the end of input\fP is used. If +\fI\%start conditions\fP are used, rules have more complex syntax. +.TP +.B \fB!;\fP +A \fIdirective\fP is one of the special predefined statements. Each directive +has a unique purpose. 
For example, the \fB!use\fP directive merges a rules +block into the current one (see the \fI\%reusable blocks\fP section), and the +\fB!include\fP directive allows one to include an outer file (see the +\fI\%include files\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.SH PROGRAM INTERFACE (API) +.sp +The generated code interfaces with the outer program with the help of +\fIprimitives\fP, collectively referred to as the \fIAPI\fP\&. +Which primitives should be defined for a particular program depends on multiple +factors, including the complexity of regular expressions, input representation, +buffering and the use of various features. All the necessary primitives should +be defined by the user in the form of macros, functions, variables or any other +suitable form that makes the generated code syntactically and semantically +correct. re2c does not (and cannot) check the definitions, so if anything is +missing or defined incorrectly, the generated program may have compile\-time or +run\-time errors. +This manual provides examples of API definitions in the most common cases. +.sp +re2ocaml has two API flavors that define the core set of primitives used by a +program: +.INDENT 0.0 +.TP +.B \fBRecord API\fP +Record API is the default API for the OCaml backend. +This API consists of a variable \fByyrecord\fP (the name can be overridden with +\fBre2c:variable:yyrecord\fP) that should be defined as a record with fields +\fB_yyinput\fP, \fB_yycursor\fP, \fB_yymarker\fP, \fB_yyctxmarker\fP, \fB_yylimit\fP\&. +Only the fields used by the generated code need to be defined, and their +names can be configured. +.nf + +.fi +.sp +.TP +.B \fBGeneric API\fP +This is the most flexible API. It is enabled with \fB\-\-api generic\fP option +or \fBre2c:api = generic\fP configuration. 
+It contains primitives for generic operations: +\fBYYPEEK\fP, +\fBYYSKIP\fP, +\fBYYBACKUP\fP, +\fBYYBACKUPCTX\fP, +\fBYYSTAGP\fP, +\fBYYSTAGN\fP, +\fBYYMTAGP\fP, +\fBYYMTAGN\fP, +\fBYYRESTORE\fP, +\fBYYRESTORECTX\fP, +\fBYYRESTORETAG\fP, +\fBYYSHIFT\fP, +\fBYYSHIFTSTAG\fP, +\fBYYSHIFTMTAG\fP, +\fBYYLESSTHAN\fP\&. +.UNINDENT +.sp +Here is a full list of API primitives that may be used by the generated code in +order to interface with the outer program. +.INDENT 0.0 +.TP +.B \fBYYCTYPE\fP +The type of the input characters (code units). +For ASCII, EBCDIC and UTF\-8 encodings it should be 1\-byte unsigned integer. +For UTF\-16 or UCS\-2 it should be 2\-byte unsigned integer. For UTF\-32 it +should be 4\-byte unsigned integer. +.TP +.B \fBYYCURSOR\fP +A pointer\-like l\-value that stores the current input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYCURSOR\fP should point to the +first input character. It is advanced by the generated code. +When a rule matches, \fBYYCURSOR\fP points to the position after the +last matched character. It is used only in C pointer API. +.TP +.B \fBYYLIMIT\fP +A pointer\-like r\-value that stores the end of input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYLIMIT\fP should point to the +position after the last available input character. It is not changed by the +generated code. The lexer compares \fBYYCURSOR\fP to \fBYYLIMIT\fP +in order to determine if there are enough input characters left. +\fBYYLIMIT\fP is used only in C pointer API. +.TP +.B \fBYYMARKER\fP +A pointer\-like l\-value (usually a pointer of type \fBYYCTYPE*\fP) +that stores the position of the latest matched rule. It is used to +restore the \fBYYCURSOR\fP position if the longer match fails and +the lexer needs to rollback. Initialization is not +needed. \fBYYMARKER\fP is used only in C pointer API. 
+.TP +.B \fBYYCTXMARKER\fP +A pointer\-like l\-value that stores the position of the trailing context +(usually a pointer of type \fBYYCTYPE*\fP). No initialization is needed. +It is used only in C pointer API, and only with the lookahead operator +\fB/\fP\&. +.TP +.B \fBYYFILL\fP +A generic API primitive with one argument \fBlen\fP\&. +\fBYYFILL\fP should provide at least \fBlen\fP more input characters or fail. +If \fBre2c:eof\fP is used, then \fBlen\fP is always \fB1\fP and \fBYYFILL\fP should +always return to the calling function; zero return value indicates success. +If \fBre2c:eof\fP is not used, then \fBYYFILL\fP return value is ignored and it +should not return on failure. The maximum value of \fBlen\fP is \fBYYMAXFILL\fP\&. +The definition of \fBYYFILL\fP can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYFILL:naked\fP). +.TP +.B \fBYYMAXFILL\fP +An integral constant equal to the maximum value of the argument to +\fBYYFILL\fP\&. It can be generated with \fB/*!max:re2c*/\fP directive. +.TP +.B \fBYYLESSTHAN\fP +A generic API primitive with one argument \fBlen\fP\&. +It should be defined as an r\-value of boolean type that equals \fBtrue\fP if +and only if there are less than \fBlen\fP input characters left. +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYPEEK\fP +A generic API primitive with no arguments. +It should be defined as an r\-value of type \fBYYCTYPE\fP that is equal to the +character at the current input position. The definition can be either +function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP). +.TP +.B \fBYYSKIP\fP +A generic API primitive with no arguments. +\fBYYSKIP\fP should advance the current input position by one +character. The definition can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP). 
+.TP +.B \fBYYBACKUP\fP +A generic API primitive with no arguments. +\fBYYBACKUP\fP should save the current input position, which is +later restored with \fBYYRESTORE\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORE\fP +A generic API primitive with no arguments. +\fBYYRESTORE\fP should restore the current input position to the +value saved by \fBYYBACKUP\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUPCTX\fP +A generic API primitive with zero arguments. +\fBYYBACKUPCTX\fP should save the current input position as the +position of the trailing context, which is later restored by +\fBYYRESTORECTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORECTX\fP +A generic API primitive with no arguments. +\fBYYRESTORECTX\fP should restore the trailing context position +saved with \fBYYBACKUPCTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORETAG\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYRESTORETAG\fP should restore the trailing context position +to the value of \fBtag\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSTAGP\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGP\fP should set \fBtag\fP to the current input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). 
+.TP +.B \fBYYSTAGN\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGN\fP should set \fBtag\fP to a value that represents non\-existent +input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGP\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGP\fP should append the current position to the submatch history of +\fBtag\fP (see the submatch extraction section for details). +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGN\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGN\fP should append a value that represents non\-existent input +position to the submatch history of \fBtag\fP (see the submatch +extraction section for details). +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFT\fP +A generic API primitive with one argument \fBshift\fP\&. +\fBYYSHIFT\fP should shift the current input position by +\fBshift\fP characters (the shift value may be negative). The definition +can be either function\-like or free\-form depending on the API style +(see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTSTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTSTAG\fP should shift \fBtag\fP by \fBshift\fP characters +(the shift value may be negative). +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTMTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTMTAG\fP should shift the latest value in the history +of \fBtag\fP by \fBshift\fP characters (the shift value may be negative). 
+The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMAXNMATCH\fP +An integral constant equal to the maximal number of POSIX capturing groups +in a rule. It is generated with \fB/*!maxnmatch:re2c*/\fP directive. +.TP +.B \fBYYCONDTYPE\fP +The type of the condition enum. +It should be generated either with the \fB/*!types:re2c*/\fP +directive or the \fB\-t\fP \fB\-\-type\-header\fP option. +.TP +.B \fBYYGETCONDITION\fP +An API primitive with zero arguments. +It should be defined as an r\-value of type \fBYYCONDTYPE\fP that is equal to +the current condition identifier. The definition can be either function\-like +or free\-form depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYGETCONDITION:naked\fP). +.TP +.B \fBYYSETCONDITION\fP +An API primitive with one argument \fBcond\fP\&. +The meaning of \fBYYSETCONDITION\fP is to set the current condition +identifier to \fBcond\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETCONDITION@cond\fP). +.TP +.B \fBYYGETSTATE\fP +An API primitive with zero arguments. +It should be defined as an r\-value of integer type that is equal to the +current lexer state. Should be initialized to \fB\-1\fP\&. The definition can be +either function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP and \fBre2c:define:YYGETSTATE:naked\fP). +.TP +.B \fBYYSETSTATE\fP +An API primitive with one argument \fBstate\fP\&. +The meaning of \fBYYSETSTATE\fP is to set the current lexer state to +\fBstate\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETSTATE@state\fP). +.TP +.B \fBYYDEBUG\fP +A debug API primitive with two arguments. It can be used to debug the +generated code (with \fB\-d\fP \fB\-\-debug\-output\fP option). 
\fBYYDEBUG\fP should +return no value and accept two arguments: \fBstate\fP (either a DFA state +index or \fB\-1\fP) and \fBsymbol\fP (the current input symbol). +.TP +.B \fByych\fP +An l\-value of type \fBYYCTYPE\fP that stores the current input character. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByyaccept\fP +An l\-value of unsigned integral type that stores the number of the latest +matched rule. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByynmatch\fP +An l\-value of unsigned integral type that stores the number of POSIX +capturing groups in the matched rule. +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.TP +.B \fByypmatch\fP +An array of l\-values that are used to hold the tag values corresponding +to the capturing parentheses in the matching rule. Array length must be +at least \fByynmatch * 2\fP (usually \fBYYMAXNMATCH * 2\fP is a good choice). +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.UNINDENT +.SH OPTIONS +.sp +Some of the options have corresponding \fI\%configurations\fP, +others are global and cannot be changed after re2c starts reading the input file. +Debug options generally require building re2c in debug configuration. +Internal options are useful for experimenting with the algorithms used in re2c. +.INDENT 0.0 +.TP +.B \fB\-? \-\-help \-h\fP +Show help message. +.TP +.B \fB\-\-api \-\-input <default | custom>\fP +Specify the API used by the generated code to interface with user\-defined +code: \fBdefault\fP is the API based on pointer arithmetic (the default for +C), and \fBcustom\fP is the generic API (the default for Go and Rust). +.TP +.B \fB\-\-bit\-vectors \-b\fP +Optimize conditional jumps using bit masks. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-case\-insensitive\fP +Treat single\-quoted and double\-quoted strings as case\-insensitive.
+.TP +.B \fB\-\-case\-inverted\fP +Invert the meaning of single\-quoted and double\-quoted strings: +treat single\-quoted strings as case\-sensitive and double\-quoted strings +as case\-insensitive. +.TP +.B \fB\-\-case\-ranges\fP +Collapse consecutive cases in a switch statement into a range of the form +\fBlow ... high\fP\&. This syntax is a C/C++ language extension that is +supported by compilers like GCC, Clang and Tcc. The main advantage over +using single cases is smaller generated code and faster generation time, +although for some compilers like Tcc it also results in smaller binary size. +This option is supported only for C. +.TP +.B \fB\-\-computed\-gotos \-g\fP +Optimize conditional jumps using non\-standard \(dqcomputed goto\(dq extension +(which must be supported by the compiler). re2c generates jump tables +only in complex cases with a lot of conditional branches. Complexity +threshold can be configured with \fBcgoto:threshold\fP configuration. This +option implies \fB\-\-bit\-vectors\fP\&. It is supported only for C. +.TP +.B \fB\-\-conditions \-\-start\-conditions \-c\fP +Enable support of Flex\-like \(dqconditions\(dq: multiple interrelated lexers +within one block. This is an alternative to manually specifying different +re2c blocks connected with \fBgoto\fP or function calls. +.TP +.B \fB\-\-depfile FILE\fP +Write dependency information to \fBFILE\fP in the form of a Makefile rule +\fB<output\-file> : <input\-file> [include\-file ...]\fP\&. This allows one to +track build dependencies in the presence of \fBinclude:re2c\fP directives, +so that updating include files triggers regeneration of the output file. +This option depends on the \fB\-\-output\fP option. +.TP +.B \fB\-\-ebcdic \-\-ecb \-e\fP +Generate a lexer that reads input in EBCDIC encoding. re2c assumes that the +character range is 0 \-\- 0xFF and character size is 1 byte. +.TP +.B \fB\-\-empty\-class <match\-empty | match\-none | error>\fP +Define the way re2c treats empty character classes.
With \fBmatch\-empty\fP +(the default) empty class matches empty input (which is illogical, but +backwards\-compatible). With \fBmatch\-none\fP empty class always fails to match. +With \fBerror\fP empty class raises a compilation error. +.TP +.B \fB\-\-encoding\-policy \fP +Define the way re2c treats Unicode surrogates. +With \fBfail\fP re2c aborts with an error when a surrogate is encountered. +With \fBsubstitute\fP re2c silently replaces surrogates with the error code +point 0xFFFD. With \fBignore\fP (the default) re2c treats surrogates as +normal code points. The Unicode standard says that standalone surrogates +are invalid, but real\-world libraries and programs behave in different ways. +.TP +.B \fB\-\-flex\-syntax \-F\fP +Partial support for Flex syntax: in this mode named definitions don\(aqt need +the equal sign and the terminating semicolon, and when used they must be +surrounded with curly braces. Names without curly braces are treated as +double\-quoted strings. +.TP +.B \fB\-\-header \-\-type\-header \-t HEADER\fP +Generate a \fBHEADER\fP file. The contents of the file can be specified with +directives \fBheader:re2c:on\fP and \fBheader:re2c:off\fP\&. +If conditions are used the header will have a condition enum automatically +appended to it (unless there is an explicit \fBconditions:re2c\fP directive). +.TP +.B \fB\-I PATH\fP +Add \fBPATH\fP to the list of locations which are used when searching for +include files. This option is useful in combination with \fBinclude:re2c\fP +directive. re2c looks for \fBFILE\fP in the directory of the parent file and +in the include locations specified with \fB\-I\fP option. +.TP +.B \fB\-\-input\-encoding \fP +Specify the way re2c parses regular expressions. +With \fBascii\fP (the default) re2c handles input as ASCII\-encoded: any +sequence of code units is a sequence of standalone 1\-byte characters. +With \fButf8\fP re2c handles input as UTF8\-encoded and recognizes multibyte +characters. 
+.TP +.B \fB\-\-invert\-captures\fP +Invert the meaning of capturing and non\-capturing groups. By default +\fB(...)\fP is capturing and \fB(! ...)\fP is non\-capturing. With this option +\fB(! ...)\fP is capturing and \fB(...)\fP is non\-capturing. +.TP +.B \fB\-\-lang \fP +Specify the output language. Supported languages are C, Go and Rust. +The default is C for re2c, Go for re2go and Rust for re2rust. +.TP +.B \fB\-\-leftmost\-captures\fP +Enable submatch extraction with leftmost greedy capturing groups. +.TP +.B \fB\-\-location\-format \fP +Specify location format in messages. +With \fBgnu\fP locations are printed as \(aqfilename:line:column: ...\(aq. +With \fBmsvc\fP locations are printed as \(aqfilename(line,column) ...\(aq. +The default is \fBgnu\fP\&. +.TP +.B \fB\-\-loop\-switch\fP +Encode DFA in a form of a loop over a switch statement. Individual states +are switch cases. The current state is stored in a variable \fByystate\fP\&. +Transitions between states update \fByystate\fP to the case label of the +destination state and \fBcontinue\fP to the head of the loop. This option is +always enabled for Rust, as it has no \fBgoto\fP statement and cannot use the +goto/label approach which is the default for C and Go backends. +.TP +.B \fB\-\-nested\-ifs \-s\fP +Use nested \fBif\fP statements instead of \fBswitch\fP statements in conditional +jumps. This usually results in more efficient code with non\-optimizing +compilers. +.TP +.B \fB\-\-no\-debug\-info \-i\fP +Do not output line directives. This may be useful when the generated code is +stored in a version control system (to avoid huge autogenerated diffs on +small changes). This option is on by default for Rust, as it does not have +line directives. +.TP +.B \fB\-\-no\-generation\-date\fP +Suppress date output in the generated file. +.TP +.B \fB\-\-no\-version\fP +Suppress version output in the generated file. 
+.TP +.B \fB\-\-no\-unsafe\fP +Do not generate \fBunsafe\fP wrapper over \fBYYPEEK\fP (this option is specific +to Rust). For performance reasons \fBYYPEEK\fP should avoid bounds\-checking, +as the lexer already performs end\-of\-input checks in a more efficient way. +The user may choose to provide a safe \fBYYPEEK\fP definition, or a definition +that is unsafe only in release builds, in which case the \fB\-\-no\-unsafe\fP +option helps to avoid warnings about redundant \fBunsafe\fP blocks. +.TP +.B \fB\-\-output \-o OUTPUT\fP +Specify the \fBOUTPUT\fP file. +.TP +.B \fB\-\-posix\-captures \-P\fP +Enable submatch extraction with POSIX\-style capturing groups. +.TP +.B \fB\-\-reusable \-r\fP +Deprecated since version 2.2 (reusable blocks are allowed by default now). +.TP +.B \fB\-\-skeleton \-S\fP +Ignore user\-defined interface code and generate a self\-contained \(dqskeleton\(dq +program. Additionally, generate input files with strings derived from the +regular grammar and compressed match results that are used to verify +\(dqskeleton\(dq behavior on all inputs. This option is useful for finding bugs +in optimizations and code generation. This option is supported only for C. +.TP +.B \fB\-\-storable\-state \-f\fP +Generate a lexer which can store its inner state. +This is useful in push\-model lexers which are stopped by an outer program +when there is not enough input, and then resumed when more input becomes +available. In this mode users should additionally define \fBYYGETSTATE\fP +and \fBYYSETSTATE\fP primitives, and variables \fByych\fP, \fByyaccept\fP and +\fBstate\fP should be part of the stored lexer state. +.TP +.B \fB\-\-tags \-T\fP +Enable submatch extraction with tags. +.TP +.B \fB\-\-ucs2 \-\-wide\-chars \-w\fP +Generate a lexer that reads UCS2\-encoded input. re2c assumes that the +character range is 0 \-\- 0xFFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. 
+.TP +.B \fB\-\-utf8 \-\-utf\-8 \-8\fP +Generate a lexer that reads input in UTF\-8 encoding. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 1 byte. +.TP +.B \fB\-\-utf16 \-\-utf\-16 \-x\fP +Generate a lexer that reads UTF16\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf32 \-\-unicode \-u\fP +Generate a lexer that reads UTF32\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 4 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-verbose\fP +Output a short message in case of success. +.TP +.B \fB\-\-vernum \-V\fP +Show version information in \fBMMmmpp\fP format (major, minor, patch). +.TP +.B \fB\-\-version \-v\fP +Show version information. +.TP +.B \fB\-\-single\-pass \-1\fP +Deprecated. Does nothing (single pass is the default now). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-debug\-output \-d\fP +Emit \fBYYDEBUG\fP invocations in the generated code. This is useful to trace +lexer execution. +.TP +.B \fB\-\-dump\-adfa\fP +Debug option: output DFA after tunneling (in .dot format). +.TP +.B \fB\-\-dump\-cfg\fP +Debug option: output control flow graph of tag variables (in .dot format). +.TP +.B \fB\-\-dump\-closure\-stats\fP +Debug option: output statistics on the number of states in closure. +.TP +.B \fB\-\-dump\-dfa\-det\fP +Debug option: output DFA immediately after determinization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-min\fP +Debug option: output DFA after minimization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tagopt\fP +Debug option: output DFA after tag optimizations (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tree\fP +Debug option: output DFA under construction with states represented as tag +history trees (in .dot format). 
+.TP +.B \fB\-\-dump\-dfa\-raw\fP +Debug option: output DFA under construction with expanded state\-sets +(in .dot format). +.TP +.B \fB\-\-dump\-interf\fP +Debug option: output interference table produced by liveness analysis of tag +variables. +.TP +.B \fB\-\-dump\-nfa\fP +Debug option: output NFA (in .dot format). +.TP +.B \fB\-\-emit\-dot \-D\fP +Instead of normal output generate lexer graph in .dot format. +The output can be converted to an image with the help of Graphviz +(e.g. something like \fBdot \-Tpng \-odfa.png dfa.dot\fP). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-dfa\-minimization \fP +Internal option: DFA minimization algorithm used by re2c. The \fBmoore\fP +option is the Moore algorithm (it is the default). The \fBtable\fP option is +the \(dqtable filling\(dq algorithm. Both algorithms should produce the same DFA +up to states relabeling; table filling is simpler and much slower and serves +as a reference implementation. +.TP +.B \fB\-\-eager\-skip\fP +Internal option: make the generated lexer advance the input position +eagerly \-\- immediately after reading the input symbol. This changes the +default behavior when the input position is advanced lazily \-\- after +transition to the next state. +.TP +.B \fB\-\-no\-lookahead\fP +Internal option, deprecated. +It used to enable TDFA(0) algorithm. Unlike TDFA(1), TDFA(0) algorithm does +not use one\-symbol lookahead. It applies register operations to the incoming +transitions rather than the outgoing ones. Benchmarks showed that TDFA(0) +algorithm is less efficient than TDFA(1). +.TP +.B \fB\-\-no\-optimize\-tags\fP +Internal option: suppress optimization of tag variables (useful for +debugging). +.TP +.B \fB\-\-posix\-closure \fP +Internal option: specify shortest\-path algorithm used for the construction of +epsilon\-closure with POSIX disambiguation semantics: \fBgor1\fP (the default) +stands for Goldberg\-Radzik algorithm, and \fBgtop\fP stands for \(dqglobal +topological order\(dq algorithm. 
+.TP +.B \fB\-\-posix\-prectable <complex | naive>\fP +Internal option: specify the algorithm used to compute POSIX precedence +table. The \fBcomplex\fP algorithm computes precedence table in one traversal +of tag history tree and has quadratic complexity in the number of TNFA +states; it is the default. The \fBnaive\fP algorithm has worst\-case cubic +complexity in the number of TNFA states, but it is much simpler than +\fBcomplex\fP and may be slightly faster in non\-pathological cases. +.TP +.B \fB\-\-stadfa\fP +Internal option, deprecated. +It used to enable staDFA algorithm, which differs from TDFA in that register +operations are placed in states rather than on transitions. Benchmarks +showed that staDFA algorithm is less efficient than TDFA. +.TP +.B \fB\-\-fixed\-tags <none | toplevel | all>\fP +Internal option: +specify whether the fixed\-tag optimization should be applied to all tags +(\fBall\fP), none of them (\fBnone\fP), or only those in toplevel concatenation +(\fBtoplevel\fP). The default is \fBall\fP\&. +\(dqFixed\(dq tags are those that are located within a fixed distance to some +other tag (called \(dqbase\(dq). In such cases only the base tag needs to be +tracked, and the value of the fixed tag can be computed as the value of the +base tag plus a static offset. For tags that are under alternative or +repetition it is also necessary to check if the base tag has a no\-match +value (in that case fixed tag should also be set to no\-match, disregarding +the offset). For tags in top\-level concatenation the check is not needed, +because they always match. +.UNINDENT +.SH WARNINGS +.sp +Warnings can be individually enabled, disabled and turned into an error. +.INDENT 0.0 +.TP +.B \fB\-W\fP +Turn on all warnings. +.TP +.B \fB\-Werror\fP +Turn warnings into errors. Note that this option alone +doesn\(aqt turn on any warnings; it only affects those warnings that have +been turned on so far or will be turned on later. +.TP +.B \fB\-W<warning>\fP +Turn on \fBwarning\fP\&.
+.TP +.B \fB\-Wno\-<warning>\fP +Turn off \fBwarning\fP\&. +.TP +.B \fB\-Werror\-<warning>\fP +Turn on \fBwarning\fP and treat it as an error (this implies \fB\-W<warning>\fP). +.TP +.B \fB\-Wno\-error\-<warning>\fP +Don\(aqt treat this particular \fBwarning\fP as an error. This doesn\(aqt turn off +the warning itself. +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-Wcondition\-order\fP +Warn if the generated program makes implicit assumptions about condition +numbering. One should use either the \fB\-\-header\fP option or the +\fBconditions:re2c\fP directive to generate a mapping of condition names to +numbers and then use the autogenerated condition names. +.TP +.B \fB\-Wempty\-character\-class\fP +Warn if a regular expression contains an empty character class. Trying to +match an empty character class makes no sense: it should always fail. +However, for backwards compatibility reasons re2c permits empty character +classes and treats them as empty strings. Use the \fB\-\-empty\-class\fP option +to change the default behavior. +.TP +.B \fB\-Wmatch\-empty\-string\fP +Warn if a rule is nullable (matches an empty string). +If the lexer runs in a loop and the empty match is unintentional, the lexer +may unexpectedly hang in an infinite loop. +.TP +.B \fB\-Wswapped\-range\fP +Warn if the lower bound of a range is greater than its upper bound. The +default behavior is to silently swap the range bounds. +.TP +.B \fB\-Wundefined\-control\-flow\fP +Warn if some input strings cause undefined control flow in the lexer (the +faulty patterns are reported). This is a dangerous and common mistake. It +can be easily fixed by adding the default rule \fB*\fP which has the lowest +priority, matches any code unit, and always consumes a single code unit. +.TP +.B \fB\-Wunreachable\-rules\fP +Warn about rules that are shadowed by other rules and will never match. +.TP +.B \fB\-Wuseless\-escape\fP +Warn if a symbol is escaped when it shouldn\(aqt be.
+By default, re2c silently ignores such escapes, but this may as well +indicate a typo or an error in the escape sequence. +.TP +.B \fB\-Wnondeterministic\-tags\fP +Warn if a tag has \fBn\fP\-th degree of nondeterminism, where \fBn\fP is greater +than 1. +.TP +.B \fB\-Wsentinel\-in\-midrule\fP +Warn if the sentinel symbol occurs in the middle of a rule \-\-\- this may +cause reads past the end of buffer, crashes or memory corruption in the +generated lexer. This warning is only applicable if the sentinel method of +checking for the end of input is used. +It is set to an error if \fBre2c:sentinel\fP configuration is used. +.UNINDENT +.SH BLOCKS AND DIRECTIVES +.sp +Below is the list of re2c directives (syntactic constructs that mark the +beginning and end of the code that should be processed by re2c). Named blocks +were added in re2c version 2.2. They are exactly the same as unnamed blocks, +except that the name can be used to reference a block in other parts of the +program. More information on each directive can be found in the related +sections. +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A global re2c block with an optional name. The block may contain named +definitions, configurations and rules in any order. Named definitions and +configurations are defined in the global scope, so they are inherited by +subsequent blocks. The code for a global block is generated at the point +where the block is specified. +.TP +.B \fB/*!local:re2c[:] ... */\fP +A local re2c block with an optional name. Unlike global blocks, definitions +and configurations inside of a local block are not added into the global +scope. In all other respects local blocks are the same as global blocks. +.TP +.B \fB/*!rules:re2c[:] ... */\fP +A reusable block with an optional name. 
Rules blocks have the same structure +as local or global blocks, but they do not produce any code and they can be +reused multiple times in other blocks with the help of a \fB!use:<name>;\fP +directive or a \fB/*!use:re2c[:<name>] ... */\fP block. A rules block on its +own does not add any definitions into the global scope. The code for it is +generated at the point of use. Prior to re2c version 2.2 rules blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB/*!use:re2c[:<name>] ... */\fP +A use block that references a previously defined rules block. If the name is +specified, re2c looks for a rules block with this name. Otherwise the most +recent rules block is used (either a named or an unnamed one). A use block +can add definitions, configurations and rules of its own, which are added to +those of the referenced rules block. Prior to re2c version 2.2 use blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB!use:<name>;\fP +An in\-block use directive that merges a previously defined rules block with +the specified name into the current block. Named definitions, configurations +and rules of the referenced block are added to the current ones. Conflicts +between overlapping rules and configurations are resolved in the usual way: +the first rule takes priority, and the latest configuration overrides the +preceding ones. One exception is the special rules \fB*\fP, \fB$\fP and \fB<!>\fP +for which a block\-local definition always takes priority. A use directive +can be placed anywhere inside of a block, and multiple use directives are +allowed. +.TP +.B \fB/*!max:re2c[:<name1>[:<name2>...]] ... */\fP +A directive that generates \fBYYMAXFILL\fP definition. +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXFILL\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXFILL <n>\fP), or a global variable for Go +(\fBvar YYMAXFILL int = <n>\fP).
It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXFILL\fP\&. +.TP +.B \fB/*!maxnmatch:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXNMATCH\fP definition (it requires +\fB\-P \-\-posix\-captures\fP option). +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXNMATCH\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXNMATCH \fP), or a global variable for Go +(\fBvar YYMAXNMATCH int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXNMATCH\fP\&. +.TP +.B \fB/*!stags:re2c[:[:...]] ... */\fP, \fB/*!mtags:re2c[:[:...]] ... */\fP +Directives that specify a template piece of code that is expanded for each +s\-tag/m\-tag variable generated by re2c. +An optional list of block names specifies which blocks should be included +when computing the set of tag variables (if the list is empty, all blocks +are included). +There are two optional configurations: \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{tag}\fP (or +\fB@@\fP for short) is replaced with the name of each tag variable. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different tag variables. +.TP +.B \fB/*!getstate:re2c[:[:...]] ... */\fP +A directive that generates conditional dispatch on the lexer state (it +requires \fB\-\-storable\-state\fP option). +An optional list of block names specifies which blocks should be included in +the state dispatch. The default transition goes to the start label of the +first block on the list. 
If the list is empty, all blocks are included, and +the default transition goes to the first block in the file that has a start +label. +This directive is incompatible with the \fB\-\-loop\-switch\fP option and Rust, +as it requires cross\-block transitions that are unsupported without the +\fBgoto\fP statement. +.TP +.B \fB/*!conditions:re2c[:[:...]] ... */\fP, \fB/*!types:re2c... */\fP +A directive that generates condition enumeration (it requires +\fB\-\-conditions\fP option). +An optional list of block names specifies which blocks should be included +when computing the set of conditions (if the list is empty, all blocks are +included). +By default the generated code is an enumeration \fBYYCONDTYPE\fP\&. It can be +customized with optional configurations \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{cond}\fP (or +\fB@@\fP for short) is replaced with the name of each condition, and +\fB@@{num}\fP is replaced with a numeric index of that condition. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different conditions. +.TP +.B \fB/*!include:re2c */\fP +This directive allows one to include \fB\fP, which must be a double\-quoted +file path. The contents of the file are literally substituted in place of +the directive, in the same way as \fB#include\fP works in C/C++. This +directive can be used together with the \fB\-\-depfile\fP option to generate +build system dependencies on the included files. +.TP +.B \fB!include ;\fP +This directive is the same as \fB/*!include:re2c */\fP, except that it +should be used inside of a re2c block. +.TP +.B \fB/*!header:re2c:on*/\fP +This directive marks the start of header file. Everything after it and up to +the following \fB/*!header:re2c:off*/\fP directive is processed by re2c and +written to the header file specified with \fB\-t \-\-type\-header\fP option. 
+.TP +.B \fB/*!header:re2c:off*/\fP +This directive marks the end of header file started with +\fB/*!header:re2c:on*/\fP\&. +.TP +.B \fB/*!ignore:re2c ... */\fP +A block whose contents are ignored and removed from the output file. +.TP +.B \fB%{ ... %}\fP +A global re2c block in the \fB\-\-flex\-syntax\fP mode. This is deprecated and +exists for backward compatibility. +.UNINDENT +.SH CONFIGURATIONS +.INDENT 0.0 +.TP +.B \fBre2c:api\fP, \fBre2c:flags:input\fP +Same as the \fB\-\-api\fP option. +.TP +.B \fBre2c:api:sigil\fP +Specify the marker (\(dqsigil\(dq) that is used for argument placeholders in the +API primitives. The default is \fB@@\fP\&. A placeholder starts with sigil +followed by the argument name in curly braces. For example, if sigil is set +to \fB$\fP, then placeholders will have the form \fB${name}\fP\&. Single\-argument +APIs may use shorthand notation without the name in braces. This option can +be overridden by options for individual API primitives, e.g. +\fBre2c:define:YYFILL@len\fP for \fBYYFILL\fP\&. +.TP +.B \fBre2c:api:style\fP +Specify API style. Possible values are \fBfunctions\fP (the default for C) and +\fBfree\-form\fP (the default for Go and Rust). +In \fBfunctions\fP style API primitives are generated with an argument list in +parentheses following the name of the primitive. The arguments are provided +only for autogenerated parameters (such as the number of characters passed +to \fBYYFILL\fP), but not for the general lexer context, so the primitives +behave more like macros in C/C++ or closures in Go and Rust. +In free\-form style API primitives do not have a fixed form: they should be +defined as strings containing free\-form pieces of code with interpolated +variables of the form \fB@@{var}\fP or \fB@@\fP (they correspond to arguments in +function\-like style). +This configuration may be overridden for individual API primitives, see for +example \fBre2c:define:YYFILL:naked\fP configuration for \fBYYFILL\fP\&.
+.TP +.B \fBre2c:bit\-vectors\fP, \fBre2c:flags:bit\-vectors\fP, \fBre2c:flags:b\fP +Same as the \fB\-\-bit\-vectors\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-insensitive\fP, \fBre2c:flags:case\-insensitive\fP +Same as the \fB\-\-case\-insensitive\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:case\-inverted\fP, \fBre2c:flags:case\-inverted\fP +Same as the \fB\-\-case\-inverted\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-ranges\fP, \fBre2c:flags:case\-ranges\fP +Same as the \fB\-\-case\-ranges\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos\fP, \fBre2c:flags:computed\-gotos\fP, \fBre2c:flags:g\fP +Same as the \fB\-\-computed\-gotos\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos:threshold\fP, \fBre2c:cgoto:threshold\fP +If computed \fBgoto\fP is used, this configuration specifies the complexity +threshold that triggers the generation of jump tables instead of nested +\fBif\fP statements and bitmaps. The default value is \fB9\fP\&. +.TP +.B \fBre2c:cond:goto\fP +Specifies a piece of code used for the autogenerated shortcut rules \fB:=>\fP +in conditions. The default is \fBgoto @@;\fP\&. +The \fB@@\fP placeholder is substituted with condition name (see +configurations \fBre2c:api:sigil\fP and \fBre2c:cond:goto@cond\fP). +.TP +.B \fBre2c:cond:goto@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:goto\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:divider\fP +Defines the divider for condition blocks. +The default value is \fB/* *********************************** */\fP\&. +Placeholders are substituted with condition name (see \fBre2c:api:sigil\fP and +\fBre2c:cond:divider@cond\fP).
+.TP +.B \fBre2c:cond:divider@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:divider\fP +definition. The default is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:prefix\fP, \fBre2c:condprefix\fP +Specifies the prefix used for condition labels. +The default is \fByyc_\fP\&. +.TP +.B \fBre2c:cond:enumprefix\fP, \fBre2c:condenumprefix\fP +Specifies the prefix used for condition identifiers. +The default is \fByyc\fP\&. +.TP +.B \fBre2c:debug\-output\fP, \fBre2c:flags:debug\-output\fP, \fBre2c:flags:d\fP +Same as the \fB\-\-debug\-output\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:define:YYBACKUP\fP +Defines generic API primitive \fBYYBACKUP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYBACKUPCTX\fP +Defines generic API primitive \fBYYBACKUPCTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYCONDTYPE\fP +Defines \fBYYCONDTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTYPE\fP +Defines \fBYYCTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTXMARKER\fP +Defines API primitive \fBYYCTXMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCURSOR\fP +Defines API primitive \fBYYCURSOR\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYDEBUG\fP +Defines API primitive \fBYYDEBUG\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL\fP +Defines API primitive \fBYYFILL\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL@len\fP +Specifies the sigil used for argument substitution in \fBYYFILL\fP +definition. Defaults to \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYFILL:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for \fBYYFILL\fP\&. +Zero value corresponds to free\-form API style. 
+.TP +.B \fBre2c:define:YYGETCONDITION\fP +Defines API primitive \fBYYGETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETSTATE\fP +Defines API primitive \fBYYGETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYLESSTHAN\fP +Defines generic API primitive \fBYYLESSTHAN\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYLIMIT\fP +Defines API primitive \fBYYLIMIT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMARKER\fP +Defines API primitive \fBYYMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGN\fP +Defines generic API primitive \fBYYMTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGP\fP +Defines generic API primitive \fBYYMTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYPEEK\fP +Defines generic API primitive \fBYYPEEK\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYRESTORE\fP +Defines generic API primitive \fBYYRESTORE\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORECTX\fP +Defines generic API primitive \fBYYRESTORECTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORETAG\fP +Defines generic API primitive \fBYYRESTORETAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSETCONDITION\fP +Defines API primitive \fBYYSETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETCONDITION@cond\fP +Specifies the sigil used for argument substitution in \fBYYSETCONDITION\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. 
+.TP +.B \fBre2c:define:YYSETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSETSTATE\fP +Defines API primitive \fBYYSETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETSTATE@state\fP +Specifies the sigil used for argument substitution in \fBYYSETSTATE\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYSETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSKIP\fP +Defines generic API primitive \fBYYSKIP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFT\fP +Defines generic API primitive \fBYYSHIFT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFTMTAG\fP +Defines generic API primitive \fBYYSHIFTMTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSHIFTSTAG\fP +Defines generic API primitive \fBYYSHIFTSTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSTAGN\fP +Defines generic API primitive \fBYYSTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSTAGP\fP +Defines generic API primitive \fBYYSTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:empty\-class\fP, \fBre2c:flags:empty\-class\fP +Same as the \fB\-\-empty\-class\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:encoding:ebcdic\fP, \fBre2c:flags:ecb\fP, \fBre2c:flags:e\fP +Same as the \fB\-\-ebcdic\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:ucs2\fP, \fBre2c:flags:wide\-chars\fP, \fBre2c:flags:w\fP +Same as the \fB\-\-ucs2\fP option, but can be configured on per\-block basis. 
+.TP +.B \fBre2c:encoding:utf8\fP, \fBre2c:flags:utf\-8\fP, \fBre2c:flags:8\fP +Same as the \fB\-\-utf8\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf16\fP, \fBre2c:flags:utf\-16\fP, \fBre2c:flags:x\fP +Same as the \fB\-\-utf16\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf32\fP, \fBre2c:flags:unicode\fP, \fBre2c:flags:u\fP +Same as the \fB\-\-utf32\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding\-policy\fP, \fBre2c:flags:encoding\-policy\fP +Same as the \fB\-\-encoding\-policy\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:eof\fP +Specifies the sentinel symbol used with the end\-of\-input rule \fB$\fP\&. The +default value is \fB\-1\fP (\fB$\fP rule is not used). Other possible values +include all valid code units. Only decimal numbers are recognized. +.TP +.B \fBre2c:header\fP, \fBre2c:flags:type\-header\fP, \fBre2c:flags:t\fP +Specifies the name of the generated header file relative to the directory of +the output file. Same as the \fB\-\-header\fP option except that the file path +is relative. +.TP +.B \fBre2c:indent:string\fP +Specifies the string used for indentation. The default is a single tab +character \fB\(dq\et\(dq\fP\&. Indent string should contain whitespace characters only. +To disable indentation entirely, set this configuration to an empty string. +.TP +.B \fBre2c:indent:top\fP +Specifies the minimum amount of indentation to use. The default value is +zero. The value should be a non\-negative integer number. +.TP +.B \fBre2c:invert\-captures\fP +Same as the \fB\-\-invert\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:label:prefix\fP, \fBre2c:labelprefix\fP +Specifies the prefix used for DFA state labels. The default is \fByy\fP\&. +.TP +.B \fBre2c:label:start\fP, \fBre2c:startlabel\fP +Controls the generation of a block start label. 
The default value is zero, +which means that the start label is generated only if it is used. An integer +value greater than zero forces the generation of start label even if it is +unused by the lexer. A string value also forces start label generation and +sets the label name to the specified string. This configuration applies only +to the current block (it is reset to default for the next block). +.TP +.B \fBre2c:label:yyFillLabel\fP +Specifies the prefix of \fBYYFILL\fP labels used with \fBre2c:eof\fP and in +storable state mode. +.TP +.B \fBre2c:label:yyloop\fP +Specifies the name of the label marking the start of the lexer loop with +\fB\-\-loop\-switch\fP option. The default is \fByyloop\fP\&. +.TP +.B \fBre2c:label:yyNext\fP +Specifies the name of the optional label that follows \fBYYGETSTATE\fP switch +in storable state mode (enabled with \fBre2c:state:nextlabel\fP). The default +is \fByyNext\fP\&. +.TP +.B \fBre2c:leftmost\-captures\fP +Same as the \fB\-\-leftmost\-captures\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:lookahead\fP, \fBre2c:flags:lookahead\fP +Deprecated (see the deprecated \fB\-\-no\-lookahead\fP option). +.TP +.B \fBre2c:nested\-ifs\fP, \fBre2c:flags:nested\-ifs\fP, \fBre2c:flags:s\fP +Same as the \fB\-\-nested\-ifs\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:posix\-captures\fP, \fBre2c:flags:posix\-captures\fP, \fBre2c:flags:P\fP +Same as the \fB\-\-posix\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:tags\fP, \fBre2c:flags:tags\fP, \fBre2c:flags:T\fP +Same as the \fB\-\-tags\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:tags:expression\fP +Specifies the expression used for tag variables. +By default re2c generates expressions of the form \fByyt\fP\&. This might +be inconvenient, for example if tag variables are defined as fields in a +struct. 
All occurrences of \fB@@{tag}\fP or \fB@@\fP are replaced with the +actual tag name. For example, \fBre2c:tags:expression = \(dqs.@@\(dq;\fP results +in expressions of the form \fBs.yyt\fP in the generated code. +See also \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:tags:prefix\fP +Specifies the prefix for tag variable names. The default is \fByyt\fP\&. +.TP +.B \fBre2c:sentinel\fP +Specifies the sentinel symbol used for the end\-of\-input checks (when bounds +checks are disabled with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP is not +set). This configuration does not affect code generation: its purpose is to +verify that the sentinel is not allowed in the middle of a rule, and ensure +that the lexer won\(aqt read past the end of buffer. The default value is +\fB\-1\fP (in that case re2c assumes that the sentinel is zero, which is the +most common case). Only decimal numbers are recognized. +.TP +.B \fBre2c:state:abort\fP +If set to a positive integer value, changes the default case in +\fBYYGETSTATE\fP switch: by default it aborts the program, and an explicit +\fB\-1\fP case contains transition to the start of the block. +.TP +.B \fBre2c:state:nextlabel\fP +Controls if the \fBYYGETSTATE\fP switch is followed by an \fByyNext\fP label +(the default value is zero, which corresponds to no label). +Alternatively one can use \fBre2c:label:start\fP to generate a specific start +label, or an explicit \fBgetstate:re2c\fP directive to generate the +\fBYYGETSTATE\fP switch separately from the lexer block. +.TP +.B \fBre2c:unsafe\fP, \fBre2c:flags:unsafe\fP +Same as the \fB\-\-no\-unsafe\fP option, but can be configured on per\-block +basis. +If set to zero, it suppresses the generation of \fBunsafe\fP wrappers around +\fBYYPEEK\fP\&. The default is non\-zero (wrappers are generated). +This configuration is specific to Rust. +.TP +.B \fBre2c:variable:yyaccept\fP +Specifies the name of the \fByyaccept\fP variable (see the API primitives +section). 
+.TP +.B \fBre2c:variable:yybm\fP +Specifies the name of the \fByybm\fP variable (used for bitmaps). +.TP +.B \fBre2c:variable:yybm:hex\fP, \fBre2c:yybm:hex\fP +If set to nonzero, bitmaps for the \fB\-\-bit\-vectors\fP option are generated +in hexadecimal format. The default is zero (bitmaps are in decimal format). +.TP +.B \fBre2c:variable:yych\fP +Specifies the name of the \fByych\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yych:emit\fP, \fBre2c:yych:emit\fP +If set to zero, \fByych\fP definition is not generated. +The default is non\-zero. +.TP +.B \fBre2c:variable:yych:conversion\fP, \fBre2c:yych:conversion\fP +If set to non\-zero, re2c automatically generates a conversion to \fBYYCTYPE\fP +every time \fByych\fP is read. The default is zero (no conversion). +.TP +.B \fBre2c:variable:yyctable\fP +Specifies the name of the \fByyctable\fP variable (the jump table generated +for \fBYYGETCONDITION\fP switch with \fB\-\-computed\-gotos\fP option). +.TP +.B \fBre2c:variable:yytarget\fP +Specifies the name of the \fByytarget\fP variable. +.TP +.B \fBre2c:variable:yystable\fP +Deprecated. +.TP +.B \fBre2c:variable:yystate\fP +Specifies the name of the \fByystate\fP variable (used with the +\fB\-\-loop\-switch\fP option to store the current DFA state). +.TP +.B \fBre2c:yyfill:check\fP +If set to zero, suppresses the generation of pre\-\fBYYFILL\fP check for the +number of input characters (the \fBYYLESSTHAN\fP definition in generic API and +the \fBYYLIMIT\fP\-based comparison in C pointer API). The default is non\-zero +(generate the check). +.TP +.B \fBre2c:yyfill:enable\fP +If set to zero, suppresses the generation of \fBYYFILL\fP (together +with the check). This should be used when the whole input fits into one piece +of memory (there is no need for buffering) and the end\-of\-input checks do not +rely on the \fBYYFILL\fP checks (e.g. if a sentinel character is used). 
+Use warnings (\fB\-W\fP option) and \fBre2c:sentinel\fP configuration to verify +that the generated lexer cannot read past the end of input. +The default is non\-zero (\fBYYFILL\fP is enabled). +.TP +.B \fBre2c:yyfill:parameter\fP +If set to zero, suppresses the generation of parameter passed to \fBYYFILL\fP\&. +The parameter is the minimum number of characters that must be supplied. +Defaults to non\-zero (the parameter is generated). +This configuration can be overridden with \fBre2c:define:YYFILL:naked\fP or +\fBre2c:api:style\fP\&. +.UNINDENT +.SH REGULAR EXPRESSIONS +.sp +re2c uses the following syntax for regular expressions: +.INDENT 0.0 +.IP \(bu 2 +\fB\(dqfoo\(dq\fP case\-sensitive string literal +.IP \(bu 2 +\fB\(aqfoo\(aq\fP case\-insensitive string literal +.IP \(bu 2 +\fB[a\-xyz]\fP, \fB[^a\-xyz]\fP character class (possibly negated) +.IP \(bu 2 +\fB\&.\fP any character except newline +.IP \(bu 2 +\fBR \e S\fP difference of character classes \fBR\fP and \fBS\fP +.IP \(bu 2 +\fBR*\fP zero or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR+\fP one or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR?\fP optional \fBR\fP +.IP \(bu 2 +\fBR{n}\fP repetition of \fBR\fP exactly \fBn\fP times +.IP \(bu 2 +\fBR{n,}\fP repetition of \fBR\fP at least \fBn\fP times +.IP \(bu 2 +\fBR{n,m}\fP repetition of \fBR\fP from \fBn\fP to \fBm\fP times +.IP \(bu 2 +\fB(R)\fP just \fBR\fP; parentheses are used to override precedence. +If submatch extraction is enabled, \fB(R)\fP is a capturing or a +non\-capturing group depending on \fB\-\-invert\-captures\fP option. +.IP \(bu 2 +\fB(!R)\fP +If submatch extraction is enabled, \fB(!R)\fP is a non\-capturing or a +capturing group depending on \fB\-\-invert\-captures\fP option. 
+.IP \(bu 2 +\fBR S\fP concatenation: \fBR\fP followed by \fBS\fP +.IP \(bu 2 +\fBR | S\fP alternative: \fBR or S\fP +.IP \(bu 2 +\fBR / S\fP lookahead: \fBR\fP followed by \fBS\fP, but \fBS\fP is not consumed +.IP \(bu 2 +\fBname\fP the regular expression defined as \fBname\fP (or literal string +\fB\(dqname\(dq\fP in Flex compatibility mode) +.IP \(bu 2 +\fB{name}\fP the regular expression defined as \fBname\fP in Flex +compatibility mode +.IP \(bu 2 +\fB@stag\fP an \fIs\-tag\fP: saves the last input position at which \fB@stag\fP +matches in a variable named \fBstag\fP +.IP \(bu 2 +\fB#mtag\fP an \fIm\-tag\fP: saves all input positions at which \fB#mtag\fP matches +in a variable named \fBmtag\fP +.UNINDENT +.sp +Character classes and string literals may contain the following escape +sequences: \fB\ea\fP, \fB\eb\fP, \fB\ef\fP, \fB\en\fP, \fB\er\fP, \fB\et\fP, \fB\ev\fP, \fB\e\e\fP, +octal escapes \fB\eooo\fP and hexadecimal escapes \fB\exhh\fP, \fB\euhhhh\fP and +\fB\eUhhhhhhhh\fP\&. +.SH HANDLING THE END OF INPUT +.sp +One of the main problems for the lexer is to know when to stop. +There are a few terminating conditions: +.INDENT 0.0 +.IP \(bu 2 +the lexer may match some rule (including default rule \fB*\fP) and come to a +final state +.IP \(bu 2 +the lexer may fail to match any rule and come to a default state +.IP \(bu 2 +the lexer may reach the end of input +.UNINDENT +.sp +The first two conditions terminate the lexer in a \(dqnatural\(dq way: it comes to a +state with no outgoing transitions, and the matching automatically stops. The +third condition, end of input, is different: it may happen in any state, and the +lexer should be able to handle it. Checking for the end of input interrupts the +normal lexer workflow and adds conditional branches to the generated program, +therefore it is necessary to minimize the number of such checks. re2c supports a +few different methods for handling the end of input. 
Which one to use depends on +the complexity of regular expressions, the need for buffering, performance +considerations and other factors. Here is a list of methods: +.INDENT 0.0 +.IP \(bu 2 +\fBSentinel.\fP +This method eliminates the need for the end of input checks altogether. It is +simple and efficient, but limited to the case when there is a natural +\(dqsentinel\(dq character that can never occur in valid input. This character may +still occur in invalid input, but it should not be allowed by the regular +expressions, except perhaps as the last character of a rule. The sentinel is +appended at the end of input and serves as a stop signal: when the lexer reads +this character, it is either a syntax error or the end of input. In both +cases the lexer should stop. This method is used if \fBYYFILL\fP is disabled +with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP has the default value +\fB\-1\fP\&. +.nf + +.fi +.sp +.IP \(bu 2 +\fBSentinel with bounds checks.\fP +This method is generic: it allows to handle any input without restrictions on +the regular expressions. The idea is to reduce the number of end of input +checks by performing them only on certain characters. Similar to the +\(dqsentinel\(dq method, one of the characters is chosen as a \(dqsentinel\(dq and +appended at the end of input. However, there is no restriction on where the +sentinel may occur (in fact, any character can be chosen for a sentinel). +When the lexer reads this character, it additionally performs a bounds check. +If the current position is within bounds, the lexer resumes matching and +handles the sentinel as a regular character. Otherwise it invokes \fBYYFILL\fP +(unless it is disabled). If more input is supplied, the lexer will rematch the +last character and continue as if the sentinel wasn\(aqt there. Otherwise it must +be the real end of input, and the lexer stops. 
This method is used when +\fBre2c:eof\fP has non\-negative value (it should be set to the numeric value of +the sentinel). \fBYYFILL\fP is optional. +.nf + +.fi +.sp +.IP \(bu 2 +\fBBounds checks with padding.\fP +This method is generic, and it may be faster than the \(dqsentinel with bounds +checks\(dq method, but it is also more complex. The idea is to partition DFA +states into strongly connected components (SCCs) and generate a single check +per SCC for enough characters to cover the longest non\-looping path in this +SCC. This reduces the number of checks, but there is a problem with short +lexemes at the end of input, as the check requires enough characters to cover +the longest lexeme. This can be fixed by padding the input with a few fake +characters that do not form a valid lexeme suffix (so that the lexer cannot +match them). The length of padding should be \fBYYMAXFILL\fP, generated with +\fB/*!max:re2c*/\fP\&. If there is not enough input, the lexer invokes \fBYYFILL\fP +which should supply at least the required number of characters or not return. +This method is used if \fBYYFILL\fP is enabled and \fBre2c:eof\fP is \fB\-1\fP +(this is the default configuration). +.nf + +.fi +.sp +.IP \(bu 2 +\fBCustom checks.\fP +Generic API allows to override basic operations like reading a character, +which makes it possible to include the end\-of\-input checks as part of them. +This approach is error\-prone and should be used with caution. To use a custom +method, enable generic API with \fB\-\-api custom\fP or \fBre2c:api = custom;\fP and +disable default bounds checks with \fBre2c:yyfill:enable = 0;\fP or +\fBre2c:yyfill:check = 0;\fP\&. +.UNINDENT +.sp +The following subsections contain an example of each method. +.SS Sentinel +.sp +This example uses a sentinel character to handle the end of input. The program +counts space\-separated words in a null\-terminated string. 
The sentinel is null: +it is the last character of each input string, and it is not allowed in the +middle of a lexeme by any of the rules (in particular, it is not included in +character ranges where it is easy to overlook). If a null occurs in the middle +of a string, it is a syntax error and the lexer will match default rule \fB*\fP, +but it won\(aqt read past the end of input or crash (use +\fI\%\-Wsentinel\-in\-midrule\fP +warning and \fBre2c:sentinel\fP configuration to verify this). Configuration +\fBre2c:yyfill:enable = 0;\fP suppresses the generation of bounds checks and +\fBYYFILL\fP invocations. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) + +open String + +type state = { + yyinput: string; + mutable yycursor: int; +} + +(* expect a null\-terminated string *) +%{ + re2c:define:YYFN = [\(dqlex;int\(dq, \(dqyyrecord;state\(dq, \(dqcount;int\(dq]; + re2c:yyfill:enable = 0; + + * { \-1 } + [\ex00] { count } + [a\-z]+ { lex yyrecord (count + 1) } + [ ]+ { lex yyrecord count } +%} + +let test(yyinput, count) = + let st = {yyinput = yyinput; yycursor = 0} + in if not (lex st 0 = count) then raise (Failure \(dqerror\(dq) + +let main () = + test(\(dq\ex00\(dq, 0); + test(\(dqone two three\ex00\(dq, 3); + test(\(dqf0ur\ex00\(dq, \-1) + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Sentinel with bounds checks +.sp +This example uses sentinel with bounds checks to handle the end of input (this +method was added in version 1.2). The program counts space\-separated +single\-quoted strings. The sentinel character is null, which is specified with +\fBre2c:eof = 0;\fP configuration. As in the \fI\%sentinel\fP method, null is the last +character of each input string, but it is allowed in the middle of a rule (for +example, \fB\(aqaaa\e0aa\(aq\e0\fP is valid input, but \fB\(aqaaa\e0\fP is a syntax error). 
+Bounds checks are generated in each state that matches an input character, but +they are scoped to the branch that handles null. Bounds checks are of the form +\fBYYLIMIT <= YYCURSOR\fP or \fBYYLESSTHAN(1)\fP with generic API. If the check +condition is true, lexer has reached the end of input and should stop +(\fBYYFILL\fP is disabled with \fBre2c:yyfill:enable = 0;\fP as the input fits into +one buffer, see the \fI\%YYFILL with sentinel\fP section for an example that uses +\fBYYFILL\fP). Reaching the end of input opens three possibilities: if the lexer +is in the initial state it will match the end\-of\-input rule \fB$\fP, otherwise it +may fallback to a previously matched rule (including default rule \fB*\fP) or go +to a default state, causing +\fI\%\-Wundefined\-control\-flow\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) + +open String + +type state = { + yyinput: string; + mutable yycursor: int; + mutable yymarker: int; + yylimit: int; +} + +(* expect a null\-terminated string *) +%{ + re2c:define:YYFN = [\(dqlex;int\(dq, \(dqyyrecord;state\(dq, \(dqcount;int\(dq]; + re2c:yyfill:enable = 0; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { \-1 } + $ { count } + str { lex yyrecord (count + 1) } + [ ]+ { lex yyrecord count } +%} + +let test(str, count) = + let st = { + yyinput = str; + yycursor = 0; + yymarker = 0; + yylimit = length str \- 1; (* terminating null not included *) + } + in if not (lex st 0 = count) then raise (Failure \(dqerror\(dq) + +let main () = + test(\(dq\ex00\(dq, 0); + test(\(dq\(aqqu\ex00tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \ex00\(dq, 3); + test(\(dq\(aqunterminated\e\e\(aq\ex00\(dq, \-1) + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Bounds checks with padding +.sp +This example uses bounds checks with padding to handle the end of input (this +method is enabled by default). The program counts space\-separated single\-quoted +strings. 
There is a padding of \fBYYMAXFILL\fP null characters appended at the end +of input, where \fBYYMAXFILL\fP value is autogenerated with \fB/*!max:re2c*/\fP\&. It +is not necessary to use null for padding \-\-\- any characters can be used as long +as they do not form a valid lexeme suffix (in this example padding should not +contain single quotes, as they may be mistaken for a suffix of a single\-quoted +string). There is a \(dqstop\(dq rule that matches the first padding character (null) +and terminates the lexer (note that it checks if null is at the beginning of +padding, otherwise it is a syntax error). Bounds checks are generated only in +some states that are determined by the strongly connected components of the +underlying automaton. Checks have the form \fB(YYLIMIT \- YYCURSOR) < n\fP or +\fBYYLESSTHAN(n)\fP with generic API, where \fBn\fP is the minimum number of +characters that are needed for the lexer to proceed (it also means that the next +bounds check will occur in at most \fBn\fP characters). If the check condition is +true, the lexer has reached the end of input and will invoke \fBYYFILL(n)\fP that +should either supply at least \fBn\fP input characters or not return. In this +example \fBYYFILL\fP always fails and terminates the lexer with an error (which is +fine because the input fits into one buffer). See the \fI\%YYFILL with padding\fP +section for an example that refills the input buffer with \fBYYFILL\fP\&. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) + +open String + +exception Fill + +type state = { + yyinput: string; + mutable yycursor: int; + yylimit: int; +} + +%{max %} +%{ + re2c:define:YYFN = [\(dqlex;int\(dq, \(dqyyrecord;state\(dq, \(dqcount;int\(dq]; + re2c:define:YYFILL = \(dqraise Fill;\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + (* check that it is the sentinel, not some unexpected null *) + if yyrecord.yycursor = length yyrecord.yyinput \- yymaxfill + 1 then count else \-1 + } + str { lex yyrecord (count + 1) } + [ ]+ { lex yyrecord count } + * { \-1 } +%} + +let test(str, count) = + let buf = cat str (make yymaxfill \(aq\ex00\(aq) in + let st = {yyinput = buf; yycursor = 0; yylimit = length buf} in + let result = try lex st 0 with Fill \-> \-1 in + if not (result = count) then raise (Failure \(dqerror\(dq) + +let main () = + test(\(dq\(dq, 0); + test(\(dq\(aqunterminated\e\e\(aq\(dq, \-1); + test(\(dq\(aqqu\ex00tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq, 3); + test(\(dq\(aqunexpected \ex00 null\(dq, \-1) + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Custom checks +.sp +This example uses a custom end\-of\-input handling method based on generic API. +The program counts space\-separated words. It is the same as the +\fI\%sentinel\fP example, except that the input is not null\-terminated. To cover up +for the absence of a sentinel character at the end of input, \fBYYPEEK\fP is +redefined to perform a bounds check before it reads the next input character. +This is inefficient because checks are done very often. If the check condition +fails, \fBYYPEEK\fP returns the real character, otherwise it returns a fake +sentinel character. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) + +type state = { + str: string; + mutable cur: int; + lim: int; +} + +(* expect a string without terminating null *) +%{ + re2c:api = generic; + re2c:define:YYFN = [\(dqlex;int\(dq, \(dqst;state\(dq, \(dqcount;int\(dq]; + re2c:define:YYPEEK = \(dqif st.cur < st.lim then st.str.[st.cur] else \(aq\e\ex00\(aq\(dq; + re2c:define:YYSKIP = \(dqst.cur <\- st.cur + 1;\(dq; + re2c:yyfill:enable = 0; + + * { \-1 } + [\ex00] { count } + [a\-z]+ { lex st (count + 1) } + [ ]+ { lex st count } +%} + +let test(str, count) = + let st = {str = str; cur = 0; lim = String.length str} + in if not (lex st 0 = count) then raise (Failure \(dqerror\(dq) + +let main () = + test(\(dq\(dq, 0); + test(\(dqone two three\(dq, 3); + test(\(dqf0ur\(dq, \-1) + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH BUFFER REFILLING +.sp +The need for buffering arises when the input cannot be mapped in memory all at +once: either it is too large, or it comes in a streaming fashion (like reading +from a socket). The usual technique in such cases is to allocate a fixed\-sized +memory buffer and process input in chunks that fit into the buffer. When the +current chunk is processed, it is moved out and new data is moved in. 
In +practice it is somewhat more complex, because lexer state consists not of a +single input position, but a set of interrelated positions: +.INDENT 0.0 +.IP \(bu 2 +cursor: the next input character to be read (\fBYYCURSOR\fP in C pointer API or +\fBYYSKIP\fP/\fBYYPEEK\fP in generic API) +.IP \(bu 2 +limit: the position after the last available input character (\fBYYLIMIT\fP in +C pointer API, implicitly handled by \fBYYLESSTHAN\fP in generic API) +.IP \(bu 2 +marker: the position of the most recent match, if any (\fBYYMARKER\fP in default +API or \fBYYBACKUP\fP/\fBYYRESTORE\fP in generic API) +.IP \(bu 2 +token: the start of the current lexeme (implicit in re2c API, as it is not +needed for the normal lexer operation and can be defined and updated by the +user) +.IP \(bu 2 +context marker: the position of the trailing context (\fBYYCTXMARKER\fP in +C pointer API or \fBYYBACKUPCTX\fP/\fBYYRESTORECTX\fP in generic API) +.IP \(bu 2 +tag variables: submatch positions (defined with \fB/*!stags:re2c*/\fP and +\fB/*!mtags:re2c*/\fP directives and +\fBYYSTAGP\fP/\fBYYSTAGN\fP/\fBYYMTAGP\fP/\fBYYMTAGN\fP in generic API) +.UNINDENT +.sp +Not all these are used in every case, but if used, they must be updated by +\fBYYFILL\fP\&. All active positions are contained in the segment between token and +cursor, therefore everything between buffer start and token can be discarded, +the segment from token and up to limit should be moved to the beginning of +buffer, and the free space at the end of buffer should be filled with new data. +In order to avoid frequent \fBYYFILL\fP calls it is best to fill in as many input +characters as possible (even though fewer characters might suffice to resume the +lexer). The details of \fBYYFILL\fP implementation are slightly different +depending on which EOF handling method is used: the case of EOF rule is somewhat +simpler than the case of bounds\-checking with padding. 
Also note that if +\fB\-f \-\-storable\-state\fP option is used, \fBYYFILL\fP has slightly different +semantics (described in the section about storable state). +.SS YYFILL with sentinel +.sp +If EOF rule is used, \fBYYFILL\fP is a function\-like primitive that accepts +no arguments and returns a value which is checked against zero. \fBYYFILL\fP +invocation is triggered by condition \fBYYLIMIT <= YYCURSOR\fP in C pointer API and +\fBYYLESSTHAN()\fP in generic API. A non\-zero return value means that \fBYYFILL\fP +has failed. A successful \fBYYFILL\fP call must supply at least one character and +adjust input positions accordingly. Limit must always be set to one after the +last input position in buffer, and the character at the limit position must be +the sentinel symbol specified by \fBre2c:eof\fP configuration. The pictures below +show the relative locations of input positions in buffer before and after +\fBYYFILL\fP call (sentinel symbol is marked with \fB#\fP, and the second picture +shows the case when there is not enough input to fill the whole buffer). +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-\-\-\-\-\-\-\-\-\-E\-> + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-\-\-\-\-\-\-\-\-\-E#\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-E (EOF) + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-E#........ + buffer, marker cursor limit + token +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses EOF rule. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) + +open Bytes + +let bufsize = 4096 + +type state = { + file: in_channel; + yyinput: bytes; + mutable yycursor: int; + mutable yymarker: int; + mutable yylimit: int; + mutable token: int; + mutable eof: bool; +} + +type status = Ok | Eof | LongLexeme + +let fill(st: state) : status = + if st.eof then Eof else + + (* Error: lexeme too long. In real life could reallocate a larger buffer. *) + if st.token < 1 then LongLexeme else ( + + (* Shift buffer contents (discard everything up to the current token). *) + blit st.yyinput st.token st.yyinput 0 (st.yylimit \- st.token); + st.yycursor <\- st.yycursor \- st.token; + st.yymarker <\- st.yymarker \- st.token; + st.yylimit <\- st.yylimit \- st.token; + st.token <\- 0; + + (* Fill free space at the end of buffer with new data from file. *) + let n = input st.file st.yyinput st.yylimit (bufsize \- st.yylimit \- 1) in (* \-1 for sentinel *) + st.yylimit <\- st.yylimit + n; + if n = 0 then + st.eof <\- true; (* end of file *) + set st.yyinput st.yylimit \(aq\ex00\(aq; (* append sentinel *) + + Ok) + +%{ + re2c:define:YYFN = [\(dqlex;int\(dq, \(dqyyrecord;state\(dq, \(dqcount;int\(dq]; + re2c:define:YYFILL = \(dqfill yyrecord = Ok\(dq; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { \-1 } + $ { count } + str { lex_loop yyrecord (count + 1) } + [ ]+ { lex_loop yyrecord count } +%} + +and lex_loop st count = + st.token <\- st.yycursor; + lex st count + +let main () = + let fname = \(dqinput\(dq in + + (* Prepare input file. *) + Out_channel.with_open_bin fname + (fun oc \-> for i = 1 to bufsize do + output_string oc \(dq\(aqqu\ex00tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq + done); + + (* Run lexer on the prepared file. 
*) + In_channel.with_open_bin fname + (fun ic \-> + let yylimit = bufsize \- 1 in + let st = { + file = ic; + yyinput = create bufsize; + yycursor = yylimit; + yymarker = yylimit; + yylimit = yylimit; + token = yylimit; + eof = false; + } in if not (lex_loop st 0 = 3 * bufsize) then + raise (Failure \(dqerror\(dq)); + + (* Cleanup. *) + Sys.remove fname + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS YYFILL with padding +.sp +In the default case (when EOF rule is not used) \fBYYFILL\fP is a function\-like +primitive that accepts a single argument and does not return any value. +\fBYYFILL\fP invocation is triggered by condition \fB(YYLIMIT \- YYCURSOR) < n\fP in +C pointer API and \fBYYLESSTHAN(n)\fP in generic API. The argument passed to +\fBYYFILL\fP is the minimal number of characters that must be supplied. If it +fails to do so, \fBYYFILL\fP must not return to the lexer (for that reason it is +best implemented as a macro that returns from the calling function on failure). +In case of a successful \fBYYFILL\fP invocation the limit position must be set +either to one after the last input position in buffer, or to the end of +\fBYYMAXFILL\fP padding (in case \fBYYFILL\fP has successfully read at least \fBn\fP +characters, but not enough to fill the entire buffer). The pictures below show +the relative locations of input positions in buffer before and after \fBYYFILL\fP +invocation (\fBYYMAXFILL\fP padding on the second picture is marked with \fB#\fP +symbols). 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F (EOF) + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F############### + buffer, marker cursor limit + token <\- YYMAXFILL \-> +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses bounds\-checking with padding. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) + +open Bytes + +%{max %} +let bufsize = 4096 + +exception Fill + +type state = { + file: in_channel; + yyinput: bytes; + mutable yycursor: int; + mutable yymarker: int; + mutable yylimit: int; + mutable token: int; + mutable eof: bool; +} + +type status = Ok | Eof | LongLexeme + +let fill (st: state) (need: int) : status = + if st.eof then Eof else + + (* Error: lexeme too long. In real life could reallocate a larger buffer. *) + if st.token < need then LongLexeme else ( + + (* Shift buffer contents (discard everything up to the current token). *) + blit st.yyinput st.token st.yyinput 0 (st.yylimit \- st.token); + st.yycursor <\- st.yycursor \- st.token; + st.yymarker <\- st.yymarker \- st.token; + st.yylimit <\- st.yylimit \- st.token; + st.token <\- 0; + + (* Fill free space at the end of buffer with new data from file. 
*) + let n = input st.file st.yyinput st.yylimit (bufsize \- st.yylimit \- 1) in (* \-1 for sentinel *) + st.yylimit <\- st.yylimit + n; + + (* If read zero characters, this is end of input => add zero padding + so that the lexer can access characters at the end of buffer. *) + if n = 0 then + st.eof <\- true; (* end of file *) + for i = 0 to (yymaxfill \- 1) do + set st.yyinput (st.yylimit + i) \(aq\ex00\(aq; + st.yylimit <\- st.yylimit + yymaxfill + done; + + Ok) + +%{ + re2c:define:YYFN = [\(dqlex;int\(dq, \(dqyyrecord;state\(dq, \(dqcount;int\(dq]; + re2c:define:YYFILL = \(dqif not (fill yyrecord @@ = Ok) then raise Fill;\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + (* check that it is the sentinel, not some unexpected null *) + if yyrecord.token = yyrecord.yylimit \- yymaxfill then count else \-1 + } + str { lex_loop yyrecord (count + 1) } + [ ]+ { lex_loop yyrecord count } + * { \-1 } +%} + +and lex_loop st count = + st.token <\- st.yycursor; + try lex st count with Fill \-> \-1 + +let main () = + let fname = \(dqinput\(dq in + + (* Prepare input file. *) + Out_channel.with_open_bin fname + (fun oc \-> for i = 1 to bufsize do + output_string oc \(dq\(aqqu\ex00tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq + done); + + (* Run lexer on the prepared file. *) + In_channel.with_open_bin fname + (fun ic \-> + let yylimit = bufsize \- yymaxfill in + let st = { + file = ic; + yyinput = create bufsize; + yycursor = yylimit; + yymarker = yylimit; + yylimit = yylimit; + token = yylimit; + eof = false; + } in if not (lex_loop st 0 = 3 * bufsize) then + raise (Failure \(dqerror\(dq)); + + (* Cleanup. *) + Sys.remove fname + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH MULTIPLE BLOCKS +.sp +Sometimes it is necessary to have multiple interrelated lexers (for example, if +there is a high\-level state machine that transitions between lexer modes). This +can be implemented using multiple connected re2c blocks. 
Another option is to +use \fI\%start conditions\fP\&. +.sp +The implementation of connections between blocks depends on the target language. +In languages that have \fBgoto\fP statement (such as C/C++ and Go) one can have +all blocks in one function, each of them prefixed with a label. Transition from +one block to another is a simple \fBgoto\fP\&. +In languages that do not have \fBgoto\fP (such as Rust) it is necessary to use a +loop with a switch on a state variable, similar to the \fByystate\fP loop/switch +generated by re2c, or else wrap each block in a function and use function calls. +.sp +The example below uses multiple blocks to parse binary, octal, decimal and +hexadecimal numbers. Each base has its own block. The initial block determines +base and dispatches to other blocks. Common configurations are defined in a +separate block at the beginning of the program; they are inherited by the other +blocks. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT \-i *) + +open Int64 +open Option +open String + +type state = { + yyinput: string; + mutable yycursor: int; + mutable yymarker: int; +} + +let add (num: int option) (dgt: int) (base: int) : int option = + match num with + | None \-> None + | Some n \-> + let n\(aq = add (mul (of_int n) (of_int base)) (of_int dgt) + in if n\(aq > (of_int32 Int32.max_int) then None else Some (to_int n\(aq) + +%{ + re2c:variable:yyrecord = \(dqst\(dq; + re2c:yyfill:enable = 0; +%} + +%{local + re2c:define:YYFN = [\(dqparse_bin;int option\(dq, \(dqst;state\(dq, \(dqnum;int option\(dq]; + [01] { parse_bin st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 48) 2) } + * { num } +%} + +%{local + re2c:define:YYFN = [\(dqparse_oct;int option\(dq, \(dqst;state\(dq, \(dqnum;int option\(dq]; + [0\-7] { parse_oct st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 48) 8) } + * { num } +%} + +%{local + re2c:define:YYFN = [\(dqparse_dec;int option\(dq, \(dqst;state\(dq, \(dqnum;int option\(dq]; + [0\-9] { 
parse_dec st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 48) 10) } + * { num } +%} + +%{local + re2c:define:YYFN = [\(dqparse_hex;int option\(dq, \(dqst;state\(dq, \(dqnum;int option\(dq]; + [0\-9] { parse_hex st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 48) 16) } + [a\-f] { parse_hex st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 87) 16) } + [A\-F] { parse_hex st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 55) 16) } + * { num } +%} + +%{local + re2c:define:YYFN = [\(dqparse;int option\(dq, \(dqst;state\(dq]; + \(aq0b\(aq / [01] { parse_bin st (Some 0) } + \(dq0\(dq { parse_oct st (Some 0) } + \(dq\(dq / [1\-9] { parse_dec st (Some 0) } + \(aq0x\(aq / [0\-9a\-fA\-F] { parse_hex st (Some 0) } + * { None } +%} + +let test (yyinput: string) (result: int option) = + let st = {yyinput = yyinput; yycursor = 0; yymarker = 0} in + if not (parse st = result) then raise (Failure \(dqerror\(dq) + +let main () = + test \(dq\ex00\(dq None; + test \(dq1234567890\ex00\(dq (Some 1234567890); + test \(dq0b1101\ex00\(dq (Some 13); + test \(dq0x7Fe\ex00\(dq (Some 2046); + test \(dq0644\ex00\(dq (Some 420); + test \(dq9999999999\ex00\(dq None + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH START CONDITIONS +.sp +Start conditions are enabled with \fB\-\-start\-conditions\fP option. They provide a +way to encode multiple interrelated automata within the same re2c block. +.sp +Each condition corresponds to a single automaton and has a unique name specified +by the user and a unique internal number defined by re2c. The numbers are used +to switch between conditions: the generated code uses \fBYYGETCONDITION\fP and +\fBYYSETCONDITION\fP primitives to get the current condition or set it to the +given number. Use \fB/*!conditions:re2c*/\fP directive or the \fB\-\-header\fP option +to generate numeric condition identifiers. Configuration +\fBre2c:cond:enumprefix\fP specifies the generated identifier prefix. 
+.sp +In condition mode every rule must be prefixed with a list of comma\-separated +condition names in angle brackets, or a wildcard \fB<*>\fP to denote all +conditions. The rule syntax is extended as follows: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB< cond\-list > regexp action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp => cond action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP, sets the current condition to \fBcond\fP and +executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp :=> cond\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and immediately transitions to \fBcond\fP (there is +no semantic action). +.TP +.B \fB<! cond\-list > action\fP +The \fBaction\fP is prepended to semantic actions of all rules for every +condition on the \fBcond\-list\fP\&. This may be used to deduplicate common +code. +.TP +.B \fB< > action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and executes the \fBaction\fP\&. +.TP +.B \fB< > => cond action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string, sets the current condition to +\fBcond\fP and executes the \fBaction\fP\&. +.TP +.B \fB< > :=> cond\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and immediately transitions to +\fBcond\fP\&. +.UNINDENT +.UNINDENT +.UNINDENT +.sp +The code re2c generates for conditions depends on whether re2c uses goto/label +approach or loop/switch approach to encode the automata.
+.sp +In languages that have \fBgoto\fP statement (such as C/C++ and Go) conditions are +naturally implemented as blocks of code prefixed with labels of the form +\fByyc_<cond>\fP, where \fBcond\fP is a condition name (label prefix can be changed +with \fBre2c:cond:prefix\fP). Transitions between conditions are implemented using +\fBgoto\fP and condition labels. Before all conditions re2c generates an initial +switch on \fBYYGETCONDITION\fP that jumps to the start state of the current condition. +The shortcut rules \fB:=>\fP bypass the initial switch and jump directly to the +specified condition (\fBre2c:cond:goto\fP can be used to change the default +behavior). The rules with semantic actions do not automatically jump to the next +condition; this should be done by the user\-defined action code. +.sp +In languages that do not have \fBgoto\fP (such as Rust) re2c reuses the +\fByystate\fP variable to store condition numbers. Each condition gets a numeric +identifier equal to the number of its start state, and a switch between +conditions is no different than a switch between DFA states of a single +condition. There is no need for a separate initial condition switch. +(Since the same approach is used to implement storable states, +\fBYYGETCONDITION\fP/\fBYYSETCONDITION\fP are redundant if both storable states and +conditions are used). +.sp +The program below uses start conditions to parse binary, octal, decimal and +hexadecimal numbers. There is a single block where each base has its own +condition, and the initial condition is connected to all of them. User\-defined +variable \fBcond\fP stores the current condition number; it is initialized to the +number of the initial condition generated with \fB/*!conditions:re2c*/\fP\&.
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT \-ci *) + +open Int64 +open Option +open String + +%{conditions %} + +type state = { + yyinput: string; + mutable yycursor: int; + mutable yymarker: int; + mutable yycond: yycondtype; +} + +let add (num: int option) (dgt: int) (base: int) : int option = + match num with + | None \-> None + | Some n \-> + let n\(aq = add (mul (of_int n) (of_int base)) (of_int dgt) + in if n\(aq > (of_int32 Int32.max_int) then None else Some (to_int n\(aq) + +%{ + re2c:define:YYFN = [\(dqparse;int option\(dq, \(dqst;state\(dq, \(dqnum;int option\(dq]; + re2c:variable:yyrecord = \(dqst\(dq; + re2c:yyfill:enable = 0; + + <init> \(aq0b\(aq / [01] :=> bin + <init> \(dq0\(dq :=> oct + <init> \(dq\(dq / [1\-9] :=> dec + <init> \(aq0x\(aq / [0\-9a\-fA\-F] :=> hex + <init> * { None } + + <bin> [01] { yyfnbin st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 48) 2) } + <oct> [0\-7] { yyfnoct st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 48) 8) } + <dec> [0\-9] { yyfndec st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 48) 10) } + <hex> [0\-9] { yyfnhex st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 48) 16) } + <hex> [a\-f] { yyfnhex st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 87) 16) } + <hex> [A\-F] { yyfnhex st (add num (Char.code st.yyinput.[st.yycursor \- 1] \- 55) 16) } + + <bin, oct, dec, hex> * { num } +%} + +let test (yyinput: string) (result: int option) = + let st = {yyinput = yyinput; yycursor = 0; yymarker = 0; yycond = YYC_init} in + if not (parse st (Some 0) = result) then raise (Failure \(dqerror\(dq) + +let main () = + test \(dq\ex00\(dq None; + test \(dq1234567890\ex00\(dq (Some 1234567890); + test \(dq0b1101\ex00\(dq (Some 13); + test \(dq0x7Fe\ex00\(dq (Some 2046); + test \(dq0644\ex00\(dq (Some 420); + test \(dq9999999999\ex00\(dq None + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH STORABLE STATE +.sp +With \fB\-\-storable\-state\fP option re2c generates a lexer that can store +its current state, return to the caller, and later
resume operations exactly +where it left off. The default mode of operation in re2c is a \(dqpull\(dq model, +in which the lexer \(dqpulls\(dq more input whenever it needs it. This may be +unacceptable in cases when the input becomes available piece by piece (for +example, if the lexer is invoked by the parser, or if the lexer program +communicates via a socket protocol with some other program that must wait for a +reply from the lexer before it transmits the next message). Storable state +feature is intended exactly for such cases: it allows one to generate lexers that +work in a \(dqpush\(dq model. When the lexer needs more input, it stores its state and +returns to the caller. Later, when more input becomes available, the caller +resumes the lexer exactly where it stopped. There are a few changes necessary +compared to the \(dqpull\(dq model: +.INDENT 0.0 +.IP \(bu 2 +Define \fBYYGETSTATE()\fP and \fBYYSETSTATE(state)\fP primitives. +.IP \(bu 2 +Define \fByych\fP, \fByyaccept\fP (if used) and \fBstate\fP variables as a part of +persistent lexer state. The \fBstate\fP variable should be initialized to \fB\-1\fP\&. +.IP \(bu 2 +\fBYYFILL\fP should return to the outer program instead of trying to supply more +input. Return code should indicate that lexer needs more input. +.IP \(bu 2 +The outer program should recognize situations when lexer needs more input and +respond appropriately. +.IP \(bu 2 +Optionally use \fBgetstate:re2c\fP to generate \fBYYGETSTATE\fP switch detached +from the main lexer. This only works for languages that have \fBgoto\fP (not in +\fB\-\-loop\-switch\fP mode). +.IP \(bu 2 +Use \fBre2c:eof\fP and the \fI\%sentinel with bounds checks\fP method to handle the +end of input. Padding\-based method may not work because it is unclear when to +append padding: the current end of input may not be the ultimate end of input, +and appending padding too early may cut off a partially read greedy lexeme.
+Furthermore, due to high\-level program logic getting more input may depend on +processing the lexeme at the end of buffer (which already is blocked due to +the end\-of\-input condition). +.UNINDENT +.sp +Here is an example of a \(dqpush\(dq model lexer that simulates reading packets from a +socket. The lexer loops until it encounters the end of input and returns to the +calling function. The calling function provides more input by \(dqsending\(dq the next +packet and resumes lexing. This process stops when all the packets have been +sent, or when there is an error. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT \-fi *) + +open Bytes + +(* Use a small buffer to cover the case when a lexeme doesn\(aqt fit. + In real world use a larger buffer. *) +let bufsize = 10 + +let debug = false +let log format = (if debug then Printf.eprintf else Printf.ifprintf stderr) format + +type state = { + file: in_channel; + yyinput: bytes; + mutable yycursor: int; + mutable yymarker: int; + mutable yylimit: int; + mutable token: int; + mutable yystate: int; + mutable recv: int; +} + +type status = End | Ready | Waiting | BadPacket | BigPacket + +let fill(st: state) : status = + (* Error: lexeme too long. In real life could reallocate a larger buffer. *) + if st.token < 1 then BigPacket else ( + + (* Shift buffer contents (discard everything up to the current token). *) + blit st.yyinput st.token st.yyinput 0 (st.yylimit \- st.token); + st.yycursor <\- st.yycursor \- st.token; + st.yymarker <\- st.yymarker \- st.token; + st.yylimit <\- st.yylimit \- st.token; + st.token <\- 0; + + (* Fill free space at the end of buffer with new data from file. 
*) + let n = In_channel.input st.file st.yyinput st.yylimit (bufsize \- st.yylimit \- 1) in + st.yylimit <\- st.yylimit + n; + set st.yyinput st.yylimit \(aq\ex00\(aq; (* append sentinel *) + + Ready) + +%{ + re2c:define:YYFN = [\(dqlex;status\(dq, \(dqyyrecord;state\(dq]; + re2c:define:YYFILL = \(dqWaiting\(dq; + re2c:eof = 0; + + packet = [a\-z]+[;]; + + * { BadPacket } + $ { End } + packet { yyrecord.recv <\- yyrecord.recv + 1; lex_loop yyrecord } +%} + +and lex_loop st = + st.token <\- st.yycursor; + lex st + +let test (packets: string list) (sts: status) = + let fname = \(dqpipe\(dq in + + let oc = Out_channel.open_bin fname in + let ic = In_channel.open_bin fname in + + let yylimit = bufsize \- 1 in + let st = { + file = ic; + (* Sentinel (at \(gayylimit\(ga offset) is set to null, which triggers YYFILL. *) + yyinput = create bufsize; + yycursor = yylimit; + yymarker = yylimit; + yylimit = yylimit; + token = yylimit; + yystate = \-1; + recv = 0; + } in + + let rec loop packets = match lex_loop st with + | End \-> + log \(dqdone: got %d packets\en\(dq st.recv; + End + | Waiting \-> + log \(dqwaiting...\en\(dq; + let packets\(aq = match packets with + | [] \-> [] + | p :: ps \-> + log \(dqsent packet \(aq%s\(aq\en\(dq p; + Out_channel.output_string oc p; + Out_channel.flush oc; (* without \(gaflush\(ga write happens too late *) + ps + in (match fill st with + | BigPacket \-> + log \(dqerror: packet too big\en\(dq; + BigPacket + | Ready \-> loop packets\(aq + | _ \-> raise (Failure \(dqunexpected status after fill\(dq)) + | BadPacket \-> + log \(dqerror: ill\-formed packet\en\(dq; + BadPacket + | _ \-> raise (Failure \(dqunexpected status\(dq) + + in if not (loop packets = sts) then + raise (Failure \(dqerror\(dq); + + In_channel.close ic; + Out_channel.close oc; + Sys.remove fname + +let main () = + test [] End; + test [\(dqzero;\(dq; \(dqone;\(dq; \(dqtwo;\(dq; \(dqthree;\(dq; \(dqfour;\(dq] End; + test [\(dqzer0;\(dq] BadPacket; + test 
[\(dqgoooooooooogle;\(dq] BigPacket + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH REUSABLE BLOCKS +.sp +Reusable blocks are re2c blocks that can be reused any number of times and +combined with other re2c blocks. They are defined with +\fB/*!rules:re2c[:<name>] ... */\fP (the \fB<name>\fP is optional). A rules block +can be used in two contexts: either in a use block, or in a use directive inside +of another block. The code for a rules block is generated at every point of use. +.sp +Use blocks are defined with \fB/*!use:re2c[:<name>] ... */\fP\&. The \fB<name>\fP +is optional; if not specified, the associated rules block is the most recent one +(whether named or unnamed). A use block can add named definitions, +configurations and rules of its own. +An important use case for use blocks is a lexer that supports multiple input +encodings: the same rules block is reused multiple times with encoding\-specific +configurations (see the example below). +.sp +In\-block use directive \fB!use:<name>;\fP can be used from inside of a re2c +block. It merges the referenced block \fB<name>\fP into the current one. If some +of the merged rules and configurations overlap with the previously defined ones, +conflicts are resolved in the usual way: the earliest rule takes priority, and +latest configuration overrides preceding ones. One exception are the special +rules \fB*\fP, \fB$\fP and (in condition mode) \fB<!>\fP, for which a block\-local +definition overrides any inherited ones. Use directive allows one to combine +different re2c blocks together in one block (see the example below). +.sp +Named blocks and in\-block use directive were added in re2c version 2.2. +Since that version reusable blocks are allowed by default (no special option +is needed). Before version 2.2 reuse mode was enabled with \fB\-r \-\-reusable\fP +option. Before version 1.2 reusable blocks could not be mixed with normal +blocks.
+.SS Example of a \fB!use\fP directive +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) +(* This example shows how to combine reusable re2c blocks: two blocks + (\(aqcolors\(aq and \(aqfish\(aq) are merged into one. The \(aqsalmon\(aq rule occurs + in both blocks; the \(aqfish\(aq block takes priority because it is used + earlier. Default rule * occurs in all three blocks; the local (not + inherited) definition takes priority. *) + +open String + +type answer = Color | Fish | Dunno + +type state = { + yyinput: string; + mutable yycursor: int; + mutable yymarker: int; +} + +%{rules:colors + * { raise (Failure \(dqah\(dq); } + \(dqred\(dq | \(dqsalmon\(dq | \(dqmagenta\(dq { Color } +%} + +%{rules:fish + * { raise (Failure \(dqoh\(dq); } + \(dqhaddock\(dq | \(dqsalmon\(dq | \(dqeel\(dq { Fish } +%} + +%{ + re2c:define:YYFN = [\(dqlex;answer\(dq, \(dqyyrecord;state\(dq]; + re2c:yyfill:enable = 0; + + !use:fish; + !use:colors; + * { Dunno } // overrides inherited \(aq*\(aq rules +%} + +let test(str, ans) = + let st = {yyinput = str; yycursor = 0; yymarker = 0} + in if not (lex st = ans) then raise (Failure \(dqerror\(dq) + +let main () = + test(\(dqsalmon\(dq, Fish); + test(\(dqwhat?\(dq, Dunno) + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Example of a \fB/*!use:re2c ... */\fP block +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT \-\-input\-encoding utf8 *) +(* This example supports multiple input encodings: UTF\-8 and UTF\-32. + Both lexers are generated from the same rules block, and the use + blocks add only encoding\-specific configurations. 
*) + +open Array + +type \(aqa state = { + yyinput: \(aqa array; + mutable yycursor: int; + mutable yymarker: int; +} + +%{rules + re2c:yyfill:enable = 0; + + \(dq∀x ∃y\(dq { Some yyrecord.yycursor } + * { None } +%} + +%{use + re2c:define:YYFN = [\(dqlex8;int option\(dq, \(dqyyrecord;char state\(dq]; + re2c:encoding:utf8 = 1; +%} + +%{use + re2c:define:YYFN = [\(dqlex32;int option\(dq, \(dqyyrecord;int state\(dq]; + re2c:encoding:utf32 = 1; +%} + +let main() = + let st8 = { + yyinput = [|\(aq\exe2\(aq; \(aq\ex88\(aq; \(aq\ex80\(aq; \(aq\ex78\(aq; \(aq\ex20\(aq; \(aq\exe2\(aq; \(aq\ex88\(aq; \(aq\ex83\(aq; \(aq\ex79\(aq|]; + yycursor = 0; + yymarker = 0; + } in if not (lex8 st8 = Some (Array.length st8.yyinput)) then raise (Failure \(dqerror\(dq); + + let st32 = { + yycursor = 0; + yymarker = 0; + yyinput = [|0x2200; 0x78; 0x20; 0x2203; 0x79|]; + } in if not (lex32 st32 = Some (Array.length st32.yyinput)) then raise (Failure \(dqerror\(dq); + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SUBMATCH EXTRACTION +.sp +re2c has two options for submatch extraction. +.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration. Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string. +\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars:re2c\fP and \fBmvars:re2c\fP directives.
+If there is more than one way tags can be matched against the input, +ambiguity is resolved using leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration. There are two +flavours for different disambiguation policies: \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy. In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted. +The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&. re2c provides a directive \fBmaxnmatch:re2c\fP +that defines \fBYYMAXNMATCH\fP, a constant equal to the maximum value of +\fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration. The only difference with \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2c generates variables \fByytl<k>\fP and +\fByytr<k>\fP for \fIk\fP\-th capturing group (the user should declare these with +\fBsvars:re2c\fP directive). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars = 1\fP for POSIX longest\-match policy.
+.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there\(aqre no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags:re2c\fP or \fBmtags:re2c\fP directives. +If the lexer state is stored, tag variables should be part of it. They also +need to be updated by \fBYYFILL\fP\&. +.sp +S\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +save input position to an s\-tag: \fBt = YYCURSOR\fP with C pointer API or a +user\-defined operation \fBYYSTAGP(t)\fP with generic API +.IP \(bu 2 +save default value to an s\-tag: \fBt = NULL\fP with C pointer API or a +user\-defined operation \fBYYSTAGN(t)\fP with generic API +.IP \(bu 2 +copy one s\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +M\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +append input position to an m\-tag: a user\-defined operation \fBYYMTAGP(t)\fP +with both default and generic API +.IP \(bu 2 +append default value to an m\-tag: a user\-defined operation \fBYYMTAGN(t)\fP +with both default and generic API +.IP \(bu 2 +copy one m\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +S\-tags can be implemented as scalar values (pointers or offsets). M\-tags need a +more complex representation, as they need to store a sequence of tag values. 
The +most naive and inefficient representation of an m\-tag is a list (array, vector) +of tag values; a more efficient representation is to store all m\-tags in a +prefix\-tree represented as array of nodes \fB(v, p)\fP, where \fBv\fP is tag value +and \fBp\fP is a pointer to parent node. +.sp +Here is a simple example of using s\-tags to parse semantic versions consisting +of three numeric components: major, minor, patch (the latter is optional). +See below for a more complex example that uses \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) + +open String + +type state = { + yyinput: string; + mutable yycursor: int; + mutable yymarker: int; + (* Final tag variables available in semantic action. *) + %{svars format = \(dq\en\etmutable @@{tag}: int;\(dq; %} + (* Intermediate tag variables used by the lexer (must be autogenerated). *) + %{stags format = \(dq\en\etmutable @@{tag}: int;\(dq; %} +} + +type semver = { + major: int; + minor: int; + patch: int; +} + +let s2n (str: string) (i1: int) (i2: int) : int = + let rec f s i j n = + if i >= j then n else f s (i + 1) j (n * 10 + Char.code s.[i] \- 48) + in f str i1 i2 0 + +%{local + re2c:define:YYFN = [\(dqparse;semver option\(dq, \(dqst;state\(dq]; + re2c:variable:yyrecord = \(dqst\(dq; + re2c:tags = 1; + re2c:yyfill:enable = 0; + + num = [0\-9]+; + + @t1 num @t2 \(dq.\(dq @t3 num @t4 (\(dq.\(dq @t5 num)? 
[\ex00] { + Some { + major = s2n st.yyinput st.t1 st.t2; + minor = s2n st.yyinput st.t3 st.t4; + patch = if st.t5 = \-1 then 0 else s2n st.yyinput st.t5 (st.yycursor \- 1) + } + } + * { None } +%} + +let test (str: string) (result: semver option) = + let st = { + yyinput = str; + yycursor = 0; + yymarker = 0; + %{svars format = \(dq\en\et\et@@{tag} = \-1;\(dq; %} + %{stags format = \(dq\en\et\et@@{tag} = \-1;\(dq; %} + } + in if not (parse st = result) then raise (Failure \(dqerror\(dq) + +let main () = + test \(dq23.34\ex00\(dq (Some {major = 23; minor = 34; patch = 0}); + test \(dq1.2.99999\ex00\(dq (Some {major = 1; minor = 2; patch = 99999}); + test \(dq1.a\ex00\(dq None + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is a more complex example of using s\-tags with \fBYYFILL\fP to parse a file +with newline\-separated semantic versions. Tag variables are part of the lexer +state, and they are adjusted in \fBYYFILL\fP like other input positions. +Note that it is necessary for s\-tags because their values are invalidated after +shifting buffer contents. It may not be necessary in a custom implementation +where tag variables store offsets relative to the start of the input string +rather than the buffer, which may be the case with m\-tags. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) + +open Bytes + +let bufsize = 4096 + +type state = { + file: in_channel; + yyinput: bytes; + mutable yycursor: int; + mutable yymarker: int; + mutable yylimit: int; + mutable token: int; + mutable eof: bool; + (* Final tag variables available in semantic action. *) + %{svars format = \(dq\en\etmutable @@{tag}: int;\(dq; %} + (* Intermediate tag variables used by the lexer (must be autogenerated). 
*) + %{stags format = \(dq\en\etmutable @@{tag}: int;\(dq; %} +} + +type status = Ok | Eof | LongLexeme + +type semver = { + major: int; + minor: int; + patch: int; +} + +let s2n (str: bytes) (i1: int) (i2: int) : int = + let rec f s i j n = + if i >= j then n else f s (i + 1) j (n * 10 + Char.code (get s i) \- 48) + in f str i1 i2 0 + +let fill(st: state) : status = + if st.eof then Eof else + + (* Error: lexeme too long. In real life could reallocate a larger buffer. *) + if st.token < 1 then LongLexeme else ( + + (* Shift buffer contents (discard everything up to the current token). *) + blit st.yyinput st.token st.yyinput 0 (st.yylimit \- st.token); + st.yycursor <\- st.yycursor \- st.token; + st.yymarker <\- st.yymarker \- st.token; + st.yylimit <\- st.yylimit \- st.token; + %{stags format = \(dq\en\etst.@@ <\- if st.@@ = \-1 then \-1 else st.@@ \- st.token;\(dq; %} + st.token <\- 0; + + (* Fill free space at the end of buffer with new data from file. *) + let n = input st.file st.yyinput st.yylimit (bufsize \- st.yylimit \- 1) in (* \-1 for sentinel *) + st.yylimit <\- st.yylimit + n; + if n = 0 then + st.eof <\- true; (* end of file *) + set st.yyinput st.yylimit \(aq\ex00\(aq; (* append sentinel *) + + Ok) + +%{ + re2c:define:YYFN = [\(dqlex;(semver list) option\(dq, \(dqst;state\(dq, \(dqvers;semver list\(dq]; + re2c:define:YYFILL = \(dqfill st = Ok\(dq; + re2c:variable:yyrecord = \(dqst\(dq; + re2c:tags = 1; + re2c:eof = 0; + + num = [0\-9]+; + + @t1 num @t2 \(dq.\(dq @t3 num @t4 (\(dq.\(dq @t5 num)? [\en] { + let ver = { + major = s2n st.yyinput st.t1 st.t2; + minor = s2n st.yyinput st.t3 st.t4; + patch = if st.t5 = \-1 then 0 else s2n st.yyinput st.t5 (st.yycursor \- 1) + } in lex_loop st (ver :: vers) + } + $ { Some (List.rev vers) } + * { None } +%} + +and lex_loop st vers = + st.token <\- st.yycursor; + lex st vers + +let main () = + let fname = \(dqinput\(dq in + + (* Prepare input file. 
*) + Out_channel.with_open_bin fname + (fun oc \-> for i = 1 to bufsize do + output_string oc \(dq1.22.333\en\(dq + done); + + (* Construct the expected result to compare against. *) + let expect = Some (List.init bufsize + (fun _ \-> {major = 1; minor = 22; patch = 333;})) in + + (* Run lexer on the prepared file. *) + In_channel.with_open_bin fname + (fun ic \-> + let yylimit = bufsize \- 1 in + let st = { + file = ic; + yyinput = create bufsize; + yycursor = yylimit; + yymarker = yylimit; + yylimit = yylimit; + token = yylimit; + eof = false; + %{svars format = \(dq\en\et\et@@{tag} = \-1;\(dq; %} + %{stags format = \(dq\en\et\et@@{tag} = \-1;\(dq; %} + } in if (lex_loop st [] <> expect) then + raise (Failure \(dqerror\(dq)); + + (* Cleanup. *) + Sys.remove fname + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using capturing groups to parse semantic versions. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) + +open String + +type state = { + yyinput: string; + mutable yycursor: int; + mutable yymarker: int; + (* Final tag variables available in semantic action. *) + %{svars format = \(dq\en\etmutable @@{tag}: int;\(dq; %} + (* Intermediate tag variables used by the lexer (must be autogenerated). *) + %{stags format = \(dq\en\etmutable @@{tag}: int;\(dq; %} +} + +type semver = { + major: int; + minor: int; + patch: int; +} + +let s2n (str: string) (i1: int) (i2: int) : int = + let rec f s i j n = + if i >= j then n else f s (i + 1) j (n * 10 + Char.code s.[i] \- 48) + in f str i1 i2 0 + +%{local + re2c:define:YYFN = [\(dqparse;semver option\(dq, \(dqst;state\(dq]; + re2c:variable:yyrecord = \(dqst\(dq; + re2c:captvars = 1; + re2c:yyfill:enable = 0; + + num = [0\-9]+; + + (num) \(dq.\(dq (num) (\(dq.\(dq num)? 
[\ex00] { + Some { + major = s2n st.yyinput st.yytl1 st.yytr1; + minor = s2n st.yyinput st.yytl2 st.yytr2; + patch = if st.yytl3 = \-1 then 0 else s2n st.yyinput (st.yytl3 + 1) st.yytr3 + } + } + * { None } +%} + +let test (str: string) (result: semver option) = + let st = { + yyinput = str; + yycursor = 0; + yymarker = 0; + %{svars format = \(dq\en\et\et@@{tag} = \-1;\(dq; %} + %{stags format = \(dq\en\et\et@@{tag} = \-1;\(dq; %} + } + in if not (parse st = result) then raise (Failure \(dqerror\(dq) + +let main () = + test \(dq23.34\ex00\(dq (Some {major = 23; minor = 34; patch = 0}); + test \(dq1.2.99999\ex00\(dq (Some {major = 1; minor = 2; patch = 99999}); + test \(dq1.a\ex00\(dq None + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using m\-tags to parse a version with a variable number of +components. Tag variables are stored in a trie. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT *) + +open String + +type state = { + yyinput: string; + mutable yycursor: int; + mutable yymarker: int; + (* Final tag variables available in semantic action. *) + %{svars format = \(dq\en\etmutable @@{tag}: int;\(dq; %} + %{mvars format = \(dq\en\etmutable @@{tag}: int list;\(dq; %} + (* Intermediate tag variables used by the lexer (must be autogenerated). 
*) + %{stags format = \(dq\en\etmutable @@{tag}: int;\(dq; %} + %{mtags format = \(dq\en\etmutable @@{tag}: int list;\(dq; %} +} + +let s2n (str: string) (i1: int) (i2: int) : int = + let rec f s i j n = + if i >= j then n else f s (i + 1) j (n * 10 + Char.code s.[i] \- 48) + in f str i1 i2 0 + +%{local + re2c:define:YYFN = [\(dqparse;(int list) option\(dq, \(dqst;state\(dq]; + re2c:define:YYMTAGP = \(dq@@ <\- st.yycursor :: @@;\(dq; + re2c:define:YYMTAGN = \(dq\(dq; // alternatively could add \(ga\-1\(ga to the list + re2c:variable:yyrecord = \(dqst\(dq; + re2c:tags = 1; + re2c:yyfill:enable = 0; + + num = [0\-9]+; + + @t1 num @t2 (\(dq.\(dq #t3 num #t4)* [\ex00] { + let x = s2n st.yyinput st.t1 st.t2 in + let xs = List.rev (List.map2 (fun x y \-> s2n st.yyinput x y) st.t3 st.t4) in + Some (x :: xs) + } + * { None } +%} + +let test (str: string) (result: (int list) option) = + let st = { + yyinput = str; + yycursor = 0; + yymarker = 0; + %{svars format = \(dq\en\et\et@@{tag} = \-1;\(dq; %} + %{mvars format = \(dq\en\et\et@@{tag} = [];\(dq; %} + %{stags format = \(dq\en\et\et@@{tag} = \-1;\(dq; %} + %{mtags format = \(dq\en\et\et@@{tag} = [];\(dq; %} + } + in if not (parse st = result) then raise (Failure \(dqerror\(dq) + +let main () = + test \(dq1\ex00\(dq (Some [1]); + test \(dq1.2.3.4.5.6.7\ex00\(dq (Some [1; 2; 3; 4; 5; 6; 7;]); + test \(dq1.2.\ex00\(dq None + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH ENCODING SUPPORT +.sp +It is necessary to understand the difference between \fBcode points\fP and +\fBcode units\fP\&. A code point is a numeric identifier of a symbol. A code unit is +the smallest unit of storage in the encoded text. A single code point may be +represented with one or more code units. In a fixed\-length encoding all code +points are represented with the same number of code units. In a variable\-length +encoding code points may be represented with a different number of code units. 
+Note that the \(dqany\(dq rule \fB[^]\fP matches any code point, but not necessarily +any code unit (the only way to match any code unit regardless of the encoding +is the default rule \fB*\fP). +The generated lexer works with a stream of code units: \fByych\fP stores a code +unit, and \fBYYCTYPE\fP is the code unit type. Regular expressions, on the other +hand, are specified in terms of code points. When re2c compiles regular +expressions to automata it translates code points to code units. This is +generally not a simple mapping: in variable\-length encodings a single code point +range may get translated to a complex code unit graph. +The following encodings are supported: +.INDENT 0.0 +.IP \(bu 2 +\fBASCII\fP (enabled by default). It is a fixed\-length encoding with code space +\fB[0\-255]\fP and 1\-byte code points and code units. +.IP \(bu 2 +\fBEBCDIC\fP (enabled with \fB\-\-ebcdic\fP or \fBre2c:encoding:ebcdic\fP). It is a +fixed\-length encoding with code space \fB[0\-255]\fP and 1\-byte code points and +code units. +.IP \(bu 2 +\fBUCS2\fP (enabled with \fB\-\-ucs2\fP or \fBre2c:encoding:ucs2\fP). It is a +fixed\-length encoding with code space \fB[0\-0xFFFF]\fP and 2\-byte code points +and code units. +.IP \(bu 2 +\fBUTF8\fP (enabled with \fB\-\-utf8\fP or \fBre2c:encoding:utf8\fP). It is a +variable\-length Unicode encoding. Code unit size is 1 byte. Code points are +represented with 1 \-\- 4 code units. +.IP \(bu 2 +\fBUTF16\fP (enabled with \fB\-\-utf16\fP or \fBre2c:encoding:utf16\fP). It is a +variable\-length Unicode encoding. Code unit size is 2 bytes. Code points are +represented with 1 \-\- 2 code units. +.IP \(bu 2 +\fBUTF32\fP (enabled with \fB\-\-utf32\fP or \fBre2c:encoding:utf32\fP). It is a +fixed\-length Unicode encoding with code space \fB[0\-0x10FFFF]\fP and 4\-byte code +points and code units. +.UNINDENT +.sp +Include file \fBinclude/unicode_categories.re\fP provides re2c definitions for the +standard Unicode categories. 
+.sp +Option \fB\-\-input\-encoding\fP specifies source file encoding, which can be used to +enable Unicode literals in regular expressions. For example +\fB\-\-input\-encoding utf8\fP tells re2c that the source file is in UTF8 (it differs +from \fB\-\-utf8\fP which sets input text encoding). Option \fB\-\-encoding\-policy\fP +specifies the way re2c handles Unicode surrogates (code points in range +\fB[0xD800\-0xDFFF]\fP). +.sp +Below is an example of a lexer for UTF8 encoded Unicode identifiers. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT \-\-utf8 \-i *) + +open String + +%{include \(dqunicode_categories.re\(dq %} + +type state = { + yyinput: string; + mutable yycursor: int; + mutable yymarker: int; + mutable yyaccept: int; +} + +%{ + re2c:define:YYFN = [\(dqlex;bool\(dq, \(dqyyrecord;state\(dq]; + re2c:yyfill:enable = 0; + + // Simplified \(dqUnicode Identifier and Pattern Syntax\(dq + // (see https://unicode.org/reports/tr31) + id_start = L | Nl | [$_]; + id_continue = id_start | Mn | Mc | Nd | Pc | [\eu200D\eu05F3]; + identifier = id_start id_continue*; + + identifier { true } + * { false } +%} + +let main () = + let st = { + yyinput = \(dq_Ыдентификатор\ex00\(dq; + yycursor = 0; + yymarker = 0; + yyaccept = 0; + } + in if not (lex st) then raise (Failure \(dqerror\(dq) + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH INCLUDE FILES +.sp +re2c allows one to include other files using directive \fB/*!include:re2c FILE */\fP +or \fB!include FILE ;\fP, where \fBFILE\fP is a path to the file to be included. +The first form should be used outside of re2c blocks, and the second form allows +one to include a file in the middle of a re2c block. re2c looks for included +files in the directory of the including file and in include locations, which +can be specified with \fB\-I\fP option. 
+Include directives in re2c work in the same way as C/C++ \fB#include\fP: the contents +of \fBFILE\fP are copy\-pasted verbatim in place of the directive. Include files +may have further includes of their own. Use \fB\-\-depfile\fP option to track build +dependencies of the output file on include files. +re2c provides some predefined include files that can be found in the +\fBinclude/\fP subdirectory of the project. These files contain definitions that +can be useful to other projects (such as Unicode categories) and form something +like a standard library for re2c. +Below is an example of using include directive. +.SS Include file 1 (definitions.ml) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +type number = Int | Float | NaN + +%{ + number = [1\-9][0\-9]*; +%} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Include file 2 (extra_rules.re.inc) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// floating\-point numbers +frac = [0\-9]* \(dq.\(dq [0\-9]+ | [0\-9]+ \(dq.\(dq; +exp = \(aqe\(aq [+\-]? [0\-9]+; +float = frac exp? 
| [0\-9]+ exp; + +float { Float } + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT \-i *) + +open String + +%{include \(dqdefinitions.ml\(dq %} + +type state = { + yyinput: string; + mutable yycursor: int; + mutable yymarker: int; + mutable yyaccept: int; +} + +%{ + re2c:define:YYFN = [\(dqlex;number\(dq, \(dqyyrecord;state\(dq]; + re2c:yyfill:enable = 0; + + * { NaN } + number { Int } + !include \(dqextra_rules.re.inc\(dq; +%} + +let test(str, num) = + let st = {yyinput = str; yycursor = 0; yymarker = 0; yyaccept = 0} + in if not (lex st = num) then raise (Failure \(dqerror\(dq) + +let main () = + test(\(dq123\ex00\(dq, Int); + test(\(dq123.4567\ex00\(dq, Float) + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH HEADER FILES +.sp +re2c allows one to generate header file from the input \fB\&.re\fP file using option +\fB\-t\fP, \fB\-\-type\-header\fP or configuration \fBre2c:flags:type\-header\fP and +directives \fB/*!header:re2c:on*/\fP and \fB/*!header:re2c:off*/\fP\&. The first directive +marks the beginning of header file, and the second directive marks the end of +it. Everything between these directives is processed by re2c, and the generated +code is written to the file specified by the \fB\-t \-\-type\-header\fP option (or +\fBstdout\fP if this option was not used). Autogenerated header file may be needed +in cases when re2c is used to generate definitions of constants, variables and +structs that must be visible from other translation units. +.sp +Here is an example of generating a header file that contains definition of the +lexer state with tag variables (the number of variables depends on the regular +grammar and is unknown to the programmer).
+.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* re2ocaml $INPUT \-o $OUTPUT \-\-header lexer/state.ml \-i *) + +open State +open String + +%{header:on %} +type state = { + yyinput: string; + mutable yycursor: int; + mutable tag: int; + %{stags format = \(dqmutable @@: int;\(dq; %} +} +%{header:off %} + +%{ + re2c:define:YYFN = [\(dqlex;int\(dq, \(dqyyrecord;State.state\(dq]; + re2c:tags = 1; + re2c:yyfill:enable = 0; + re2c:header = \(dqlexer/state.ml\(dq; + + [a]* @tag [b]* { yyrecord.tag } +%} + +let main () = + let st = { + yyinput = \(dqab\ex00\(dq; + yycursor = 0; + tag = 0; + %{stags format = \(dq\en\et@@ = 0;\(dq; %} + } + in if not (lex st = 1) then raise (Failure \(dqerror\(dq) + +let _ = main () + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Header file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +(* Generated by re2c *) + +type state = { + yyinput: string; + mutable yycursor: int; + mutable tag: int; + +mutable yyt1: int; +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SKELETON PROGRAMS +.sp +With the \fB\-S, \-\-skeleton\fP option, re2c ignores all non\-re2c code and generates +a self\-contained C program that can be further compiled and executed. The +program consists of lexer code and input data. For each constructed DFA (block +or condition) re2c generates a standalone lexer and two files: an \fB\&.input\fP +file with strings derived from the DFA and a \fB\&.keys\fP file with expected match +results. The program runs each lexer on the corresponding \fB\&.input\fP file and +compares results with the expectations. +Skeleton programs are very useful for a number of reasons: +.INDENT 0.0 +.IP \(bu 2 +They can check correctness of various re2c optimizations (the data is +generated early in the process, before any DFA transformations have taken +place). +.IP \(bu 2 +Generating a set of input data with good coverage may be useful for both +testing and benchmarking. 
+.IP \(bu 2 +Generating self\-contained executable programs allows one to get minimized test +cases (the original code may be large or have a lot of dependencies). +.UNINDENT +.sp +The difficulty with generating input data is that for all but the most trivial +cases the number of possible input strings is too large (even if the string +length is limited). re2c solves this difficulty by generating sufficiently +many strings to cover almost all DFA transitions. It uses the following +algorithm. First, it constructs a skeleton of the DFA. For encodings with 1\-byte +code unit size (such as ASCII, UTF\-8 and EBCDIC) skeleton is just an exact copy +of the original DFA. For encodings with multibyte code units skeleton is a copy +of DFA with certain transitions omitted: namely, re2c takes at most 256 code +units for each disjoint continuous range that corresponds to a DFA transition. +The chosen values are evenly distributed and include range bounds. Instead of +trying to cover all possible paths in the skeleton (which is infeasible) re2c +generates sufficiently many paths to cover all skeleton transitions, and thus +trigger the corresponding conditional jumps in the lexer. +The algorithm implementation is limited by ~1Gb of transitions and consumes +constant amount of memory (re2c writes data to file as soon as it is generated). +.SH VISUALIZATION AND DEBUG +.sp +With the \fB\-D, \-\-emit\-dot\fP option, re2c does not generate code. Instead, +it dumps the generated DFA in DOT format. +One can convert this dump to an image of the DFA using Graphviz or another library. +Note that this option shows the final DFA after it has gone through a number of +optimizations and transformations. Earlier stages can be dumped with various debug +options, such as \fB\-\-dump\-nfa\fP, \fB\-\-dump\-dfa\-raw\fP etc. (see the full list of options). +.SH SEE ALSO +.sp +You can find more information about re2c at the official website: \fI\%http://re2c.org\fP\&. 
+Similar programs are flex(1), lex(1), quex(\fI\%http://quex.sourceforge.net\fP). +.SH AUTHORS +.sp +re2c was originally written by Peter Bumbulis (\fI\%peter@csg.uwaterloo.ca\fP) in 1993. +Marcus Boerger and Dan Nuffer spent several years to turn the original idea into +a production ready code generator. Since then it has been maintained and +developed by multiple volunteers, most notably, +Brian Young (\fI\%bayoung@acm.org\fP), +\fI\%Marcus Boerger\fP, +Dan Nuffer (\fI\%nuffer@users.sourceforge.net\fP), +\fI\%Ulya Trofimovich\fP (\fI\%skvadrik@gmail.com\fP), +\fI\%Serghei Iakovlev\fP, +\fI\%Sergei Trofimovich\fP, +\fI\%Petr Skocik\fP, +\fI\%ligfx\fP +and \fI\%raekye\fP\&. +.\" Generated by docutils manpage writer. +. diff --git a/bootstrap/doc/re2rust.1 b/bootstrap/doc/re2rust.1 index a77e627a1..ab99a837d 100644 --- a/bootstrap/doc/re2rust.1 +++ b/bootstrap/doc/re2rust.1 @@ -250,8 +250,8 @@ program: .TP .B \fBSimple API\fP (\fIadded in version 4.0\fP) -This is a basic API that can be enabled with option \fB\-\-api simple\fP or -configuration \fBre2c:api = simple\fP\&. It consists of the following +This is a basic API that can be enabled with \fB\-\-api simple\fP option or +\fBre2c:api = simple\fP configuration. It consists of the following primitives: \fBYYINPUT\fP (which should be defined as a sequence of code units, e.g. a string) and \fBYYCURSOR\fP, \fBYYMARKER\fP, \fBYYCTXMARKER\fP, \fBYYLIMIT\fP (which should be defined as indices in \fBYYINPUT\fP). @@ -263,8 +263,8 @@ units, e.g. a string) and \fBYYCURSOR\fP, \fBYYMARKER\fP, \fBYYCTXMARKER\fP, .B \fBRecord API\fP (\fIadded in version 4.0\fP) Record API is useful in cases when lexer state must be stored in a struct. -It is enabled with option \fB\-\-api record\fP or configuration -\fBre2c:api = record\fP\&. This API consists of a variable \fByyrecord\fP (the +It is enabled with \fB\-\-api record\fP option or \fBre2c:api = record\fP +configuration. 
This API consists of a variable \fByyrecord\fP (the name can be overridden with \fBre2c:variable:yyrecord\fP) that should be defined as a struct with fields \fByyinput\fP, \fByycursor\fP, \fByymarker\fP, \fByyctxmarker\fP, \fByylimit\fP (only the fields used by the generated code @@ -275,9 +275,7 @@ need to be defined, and their names can be configured). .sp .TP .B \fBGeneric API\fP -(\fIadded in version 0.14\fP) -This is the default API for the Rust backend. It is enabled with -\fB\-\-api generic\fP option or \fBre2c:api = generic\fP configuration. +This is the most flexible API and the default API for the Rust backend. This API contains primitives for generic operations: \fBYYPEEK\fP, \fBYYSKIP\fP, @@ -2804,53 +2802,64 @@ fn main() { .SH SUBMATCH EXTRACTION .sp re2c has two options for submatch extraction. -.sp -The first option is \fB\-T \-\-tags\fP\&. With this option one can use standalone tags -of the form \fB@stag\fP and \fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary -used\-defined names. Tags can be used anywhere inside of a regular expression; -semantically they are just position markers. Tags of the form \fB@stag\fP are -called s\-tags: they denote a single submatch value (the last input position -where this tag matched). Tags of the form \fB#mtag\fP are called m\-tags: they -denote multiple submatch values (the whole history of repetitions of this tag). -All tags should be defined by the user as variables with the corresponding -names. With standalone tags re2c uses leftmost greedy disambiguation: submatch -positions correspond to the leftmost matching path through the regular -expression. -.sp -The second option is \fB\-P \-\-posix\-captures\fP: it enables POSIX\-compliant -capturing groups. In this mode parentheses in regular expressions denote the -beginning and the end of capturing groups; the whole regular expression is group -number zero. 
The number of groups for the matching rule is stored in a variable -\fByynmatch\fP, and submatch results are stored in \fByypmatch\fP array. Both -\fByynmatch\fP and \fByypmatch\fP should be defined by the user, and \fByypmatch\fP -size must be at least \fB[yynmatch * 2]\fP\&. re2c provides a directive -\fB/*!maxnmatch:re2c*/\fP that defines \fBYYMAXNMATCH\fP: a constant equal to the -maximal value of \fByynmatch\fP among all rules. Note that re2c implements -POSIX\-compliant disambiguation: each subexpression matches as long as possible, -and subexpressions that start earlier in regular expression have priority over -those starting later. Capturing groups are translated into s\-tags under the -hood, therefore we use the word \(dqtag\(dq to describe them as well. -.sp -With both \fB\-P \-\-posix\-captures\fP and \fBT \-\-tags\fP options re2c uses efficient -submatch extraction algorithm described in the -\fI\%Tagged Deterministic Finite Automata with Lookahead\fP -paper. The overhead on submatch extraction in the generated lexer grows with the -number of tags \-\-\- if this number is moderate, the overhead is barely -noticeable. In the lexer tags are implemented using a number of tag variables -generated by re2c. There is no one\-to\-one correspondence between tag variables -and tags: a single variable may be reused for different tags, and one tag may -require multiple variables to hold all its ambiguous values. Eventually -ambiguity is resolved, and only one final variable per tag survives. When a rule -matches, all its tags are set to the values of the corresponding tag variables. -The exact number of tag variables is unknown to the user; this number is -determined by re2c. However, tag variables should be defined by the user as a -part of the lexer state and updated by \fBYYFILL\fP, therefore re2c provides -directives \fB/*!stags:re2c*/\fP and \fB/*!mtags:re2c*/\fP that can be used to -declare, initialize and manipulate tag variables. 
These directives have two -optional configurations: \fBformat = \(dq@@\(dq;\fP (specifies the template where \fB@@\fP -is substituted with the name of each tag variable), and \fBseparator = \(dq\(dq;\fP -(specifies the piece of code used to join the generated pieces for different -tag variables). +.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration. Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string. +\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars:re2c\fP and \fBmvars:re2c\fP directives. +If there is more than one way tags can be matched against the input, +ambiguity is resolved using leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration. There are two +flavours for different disambiguation policies: \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy. In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted.
+The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&. re2c provides a directive \fBmaxnmatch:re2c\fP +that defines \fBYYMAXNMATCH\fP, a constant equal to the maximum value of +\fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration. The only difference with \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2c generates variables \fByytl\fP and +\fByytr\fP for \fIk\fP\-th capturing group (the user should declare these with +\fBsvars:re2c\fP directive). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars\fP for POSIX longest\-match policy. +.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there\(aqre no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags:re2c\fP or \fBmtags:re2c\fP directives.
+If the lexer state is stored, tag variables should be part of it. They also +need to be updated by \fBYYFILL\fP\&. .sp S\-tags support the following operations: .INDENT 0.0 @@ -3102,7 +3111,7 @@ fn main() { .UNINDENT .UNINDENT .sp -Here is an example of using POSIX capturing groups to parse semantic versions. +Here is an example of using capturing groups to parse semantic versions. .INDENT 0.0 .INDENT 3.5 .sp @@ -3110,9 +3119,6 @@ Here is an example of using POSIX capturing groups to parse semantic versions. .ft C // re2rust $INPUT \-o $OUTPUT -// Maximum number of capturing groups among all rules. -/*!maxnmatch:re2c*/ - #[derive(Debug, PartialEq)] struct SemVer(u32, u32, u32); // version: (major, minor, patch) @@ -3129,32 +3135,25 @@ fn parse(yyinput: &[u8]) \-> Option { let (mut yycursor, mut yymarker) = (0, 0); - // Allocate memory for capturing parentheses (twice the number of groups). - let yynmatch: usize; - let mut yypmatch = [0; YYMAXNMATCH*2]; + // Final tag variables available in semantic action. + /*!stags:re2c format = \(aqlet mut @@ = NONE;\(aq; */ // Intermediate tag variables used by the lexer (must be autogenerated). - /*!stags:re2c format = \(aqlet mut @@ = NONE;\(aq; */ + /*!svars:re2c format = \(aq#[allow(unused_mut)]\enlet mut @@;\en\(aq; */ /*!re2c re2c:api = default; re2c:define:YYCTYPE = u8; re2c:yyfill:enable = 0; - re2c:posix\-captures = 1; + re2c:captvars = 1; num = [0\-9]+; (num) \(dq.\(dq (num) (\(dq.\(dq num)? [\ex00] { - // \(gayynmatch\(ga is the number of capturing groups - assert_eq!(yynmatch, 4); - - // Even \(gayypmatch\(ga values are for opening parentheses, odd values - // are for closing parentheses, the first group is the whole match. 
- let major = s2n(&yyinput[yypmatch[2]..yypmatch[3]]); - let minor = s2n(&yyinput[yypmatch[4]..yypmatch[5]]); - let patch = if yypmatch[6] == NONE {0} - else {s2n(&yyinput[yypmatch[6] + 1..yypmatch[7]])}; - + assert!(yytl0 == 0 && yytr0 == yyinput.len()); + let major = s2n(&yyinput[yytl1..yytr1]); + let minor = s2n(&yyinput[yytl2..yytr2]); + let patch = if yytl3 == NONE {0} else {s2n(&yyinput[yytl3 + 1..yytr3])}; return Some(SemVer(major, minor, patch)); } * { return None; } diff --git a/bootstrap/doc/re2v.1 b/bootstrap/doc/re2v.1 new file mode 100644 index 000000000..5506ba0b7 --- /dev/null +++ b/bootstrap/doc/re2v.1 @@ -0,0 +1,3488 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "RE2C" 1 "" "" +.SH NAME +re2c \- generate fast lexical analyzers for C/C++, Go and Rust +.SH SYNOPSIS +.sp +Note: This manual is for V, but it refers to re2c as the general program. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +re2c [ OPTIONS ] [ WARNINGS ] INPUT +re2go [ OPTIONS ] [ WARNINGS ] INPUT +re2rust [ OPTIONS ] [ WARNINGS ] INPUT +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Input can be either a file or \fB\-\fP for stdin. +.SH INTRODUCTION +.sp +re2c works as a preprocessor. 
It reads the input file (which is usually a +program in the target language, but can be anything) and looks for blocks of +code enclosed in special\-form comments. The text outside of these blocks is +copied verbatim into the output file. The contents of the blocks are processed +by re2c. It translates them to code in the target language and outputs the +generated code in place of the block. +.sp +Here is an example of a small program that checks if a given string contains a +decimal number: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT \-i + +fn lex(yyinput string) { + mut yycursor := 0 + /*!re2c + re2c:yyfill:enable = 0; + + number = [1\-9][0\-9]*; + + number { return } + * { panic(\(dqerror!\(dq) } + */ +} + +fn main() { + lex(\(dq1234\ex00\(dq) +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +In the output everything between \fB/*!re2c\fP and \fB*/\fP has been replaced with +the generated code: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// Code generated by re2v, DO NOT EDIT. +// re2v $INPUT \-o $OUTPUT \-i + +fn lex(yyinput string) { + mut yycursor := 0 + + mut yych := 0 + yych = yyinput[yycursor] + match yych { + 0x31...0x39 { unsafe { goto yy2 } } + else { unsafe { goto yy1 } } + } +yy1: + yycursor += 1 + panic(\(dqerror!\(dq) +yy2: + yycursor += 1 + yych = yyinput[yycursor] + match yych { + 0x30...0x39 { unsafe { goto yy2 } } + else { unsafe { goto yy3 } } + } +yy3: + return + +} + +fn main() { + lex(\(dq1234\ex00\(dq) +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SYNTAX +.sp +A re2c program consists of a sequence of \fIblocks\fP intermixed with code in the +target language. There are three main kinds of blocks: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A \fIglobal block\fP contains definitions, configurations, directives and rules. 
+re2c compiles regular expressions associated with each rule into a +deterministic finite automaton, encodes it in the form of conditional jumps +in the target language and replaces the block with the generated code. Names +and configurations defined in a global block are added to the global scope +and become visible to subsequent blocks. At the start of the program the +global scope is initialized with command\-line \fI\%options\fP\&. +The \fB:\fP part is optional: if specified, the name can be used to +refer to the block in another part of the program. +.TP +.B \fB/*!local:re2c[:] ... */\fP +A \fIlocal block\fP is like a global block, but the names and configurations in +it have local scope (they do not affect other blocks). +.TP +.B \fB/*!rules:re2c[:] ... */\fP +A \fIrules block\fP is like a local block, but it does not generate any code and +is meant to be reused in other blocks. This is a way of sharing code +(more details in the \fI\%reusable blocks\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.sp +There are also many auxiliary blocks; see section \fI\%blocks and directives\fP for a +full list of them. A block may contain the following kinds of statements: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB = ;\fP +A \fIdefinition\fP binds a name to a regular expression. Names may contain +alphanumeric characters and underscore. The \fI\%regular expressions\fP section +gives an overview of re2c syntax for regular expressions. Once defined, the +name can be used in other regular expressions and in rules. Recursion in +named definitions is not allowed, and each name should be defined before it +is used. A block inherits named definitions from the global scope. +Redefining a name that exists in the current scope is an error. +.TP +.B \fB = ;\fP +A \fIconfiguration\fP allows one to change re2c behavior and customize the +generated code. For a full list of configurations supported by re2c see the +\fI\%configurations\fP section. 
Depending on a particular configuration, the +value can be a keyword, a nonnegative integer number or a one\-line string +which should be enclosed in double or single quotes unless it consists of +alphanumeric characters. A block inherits configurations from the global +scope and may redefine them or add new ones. Configurations defined inside +of a block affect the whole block, even if they appear at the end of it. +.TP +.B \fB { }\fP +A \fIrule\fP binds a regular expression to a semantic action (a block of code in +the target language). If the regular expression matches, the associated +semantic action is executed. If multiple rules match, the longest match +takes precedence. If multiple rules match the same string, the earliest one +takes precedence. There are two special rules: the default rule \fB*\fP and +the end of input rule \fB$\fP\&. The default rule should always be defined, it +has the lowest priority regardless of its place in the block, and it matches +any code unit (not necessarily a valid character, see the +\fI\%encoding support\fP section). The end of input rule should be defined if the +corresponding method for \fI\%handling the end of input\fP is used. If +\fI\%start conditions\fP are used, rules have more complex syntax. +.TP +.B \fB!;\fP +A \fIdirective\fP is one of the special predefined statements. Each directive +has a unique purpose. For example, the \fB!use\fP directive merges a rules +block into the current one (see the \fI\%reusable blocks\fP section), and the +\fB!include\fP directive allows one to include an outer file (see the +\fI\%include files\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.SH PROGRAM INTERFACE (API) +.sp +The generated code interfaces with the outer program with the help of +\fIprimitives\fP, collectively referred to as the \fIAPI\fP\&. 
+Which primitives should be defined for a particular program depends on multiple +factors, including the complexity of regular expressions, input representation, +buffering and the use of various features. All the necessary primitives should +be defined by the user in the form of macros, functions, variables or any other +suitable form that makes the generated code syntactically and semantically +correct. re2c does not (and cannot) check the definitions, so if anything is +missing or defined incorrectly, the generated program may have compile\-time or +run\-time errors. +This manual provides examples of API definitions in the most common cases. +.sp +re2v has three API flavors that define the core set of primitives used by a +program: +.INDENT 0.0 +.TP +.B \fBSimple API\fP +This is the default API for the V backend. It consists of the following +primitives: \fBYYINPUT\fP (which should be defined as a sequence of code +units, e.g. a string) and \fBYYCURSOR\fP, \fBYYMARKER\fP, \fBYYCTXMARKER\fP, +\fBYYLIMIT\fP (which should be defined as indices in \fBYYINPUT\fP). +.nf + +.fi +.sp +.TP +.B \fBRecord API\fP +Record API is useful in cases when lexer state must be stored in a struct. +It is enabled with \fB\-\-api record\fP option or \fBre2c:api = record\fP +configuration. This API consists of a variable \fByyrecord\fP (the +name can be overridden with \fBre2c:variable:yyrecord\fP) that should be +defined as a struct with fields \fByyinput\fP, \fByycursor\fP, \fByymarker\fP, +\fByyctxmarker\fP, \fByylimit\fP (only the fields used by the generated code +need to be defined, and their names can be configured). +.nf + +.fi +.sp +.TP +.B \fBGeneric API\fP +This is the most flexible API. It is enabled with \fB\-\-api generic\fP option +or \fBre2c:api = generic\fP configuration. 
+It contains primitives for generic operations: +\fBYYPEEK\fP, +\fBYYSKIP\fP, +\fBYYBACKUP\fP, +\fBYYBACKUPCTX\fP, +\fBYYSTAGP\fP, +\fBYYSTAGN\fP, +\fBYYMTAGP\fP, +\fBYYMTAGN\fP, +\fBYYRESTORE\fP, +\fBYYRESTORECTX\fP, +\fBYYRESTORETAG\fP, +\fBYYSHIFT\fP, +\fBYYSHIFTSTAG\fP, +\fBYYSHIFTMTAG\fP, +\fBYYLESSTHAN\fP\&. +.UNINDENT +.sp +Here is a full list of API primitives that may be used by the generated code in +order to interface with the outer program. +.INDENT 0.0 +.TP +.B \fBYYCTYPE\fP +The type of the input characters (code units). +For ASCII, EBCDIC and UTF\-8 encodings it should be 1\-byte unsigned integer. +For UTF\-16 or UCS\-2 it should be 2\-byte unsigned integer. For UTF\-32 it +should be 4\-byte unsigned integer. +.TP +.B \fBYYCURSOR\fP +A pointer\-like l\-value that stores the current input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYCURSOR\fP should point to the +first input character. It is advanced by the generated code. +When a rule matches, \fBYYCURSOR\fP points to the position after the +last matched character. It is used only in C pointer API. +.TP +.B \fBYYLIMIT\fP +A pointer\-like r\-value that stores the end of input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYLIMIT\fP should point to the +position after the last available input character. It is not changed by the +generated code. The lexer compares \fBYYCURSOR\fP to \fBYYLIMIT\fP +in order to determine if there are enough input characters left. +\fBYYLIMIT\fP is used only in C pointer API. +.TP +.B \fBYYMARKER\fP +A pointer\-like l\-value (usually a pointer of type \fBYYCTYPE*\fP) +that stores the position of the latest matched rule. It is used to +restore the \fBYYCURSOR\fP position if the longer match fails and +the lexer needs to rollback. Initialization is not +needed. \fBYYMARKER\fP is used only in C pointer API. 
+.TP +.B \fBYYCTXMARKER\fP +A pointer\-like l\-value that stores the position of the trailing context +(usually a pointer of type \fBYYCTYPE*\fP). No initialization is needed. +It is used only in C pointer API, and only with the lookahead operator +\fB/\fP\&. +.TP +.B \fBYYFILL\fP +A generic API primitive with one argument \fBlen\fP\&. +\fBYYFILL\fP should provide at least \fBlen\fP more input characters or fail. +If \fBre2c:eof\fP is used, then \fBlen\fP is always \fB1\fP and \fBYYFILL\fP should +always return to the calling function; zero return value indicates success. +If \fBre2c:eof\fP is not used, then \fBYYFILL\fP return value is ignored and it +should not return on failure. The maximum value of \fBlen\fP is \fBYYMAXFILL\fP\&. +The definition of \fBYYFILL\fP can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYFILL:naked\fP). +.TP +.B \fBYYMAXFILL\fP +An integral constant equal to the maximum value of the argument to +\fBYYFILL\fP\&. It can be generated with \fB/*!max:re2c*/\fP directive. +.TP +.B \fBYYLESSTHAN\fP +A generic API primitive with one argument \fBlen\fP\&. +It should be defined as an r\-value of boolean type that equals \fBtrue\fP if +and only if there are less than \fBlen\fP input characters left. +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYPEEK\fP +A generic API primitive with no arguments. +It should be defined as an r\-value of type \fBYYCTYPE\fP that is equal to the +character at the current input position. The definition can be either +function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP). +.TP +.B \fBYYSKIP\fP +A generic API primitive with no arguments. +\fBYYSKIP\fP should advance the current input position by one +character. The definition can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP). 
+.TP +.B \fBYYBACKUP\fP +A generic API primitive with no arguments. +\fBYYBACKUP\fP should save the current input position, which is +later restored with \fBYYRESTORE\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORE\fP +A generic API primitive with no arguments. +\fBYYRESTORE\fP should restore the current input position to the +value saved by \fBYYBACKUP\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUPCTX\fP +A generic API primitive with zero arguments. +\fBYYBACKUPCTX\fP should save the current input position as the +position of the trailing context, which is later restored by +\fBYYRESTORECTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORECTX\fP +A generic API primitive with no arguments. +\fBYYRESTORECTX\fP should restore the trailing context position +saved with \fBYYBACKUPCTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORETAG\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYRESTORETAG\fP should restore the trailing context position +to the value of \fBtag\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSTAGP\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGP\fP should set \fBtag\fP to the current input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). 
+.TP +.B \fBYYSTAGN\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGN\fP should set \fBtag\fP to a value that represents non\-existent +input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGP\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGP\fP should append the current position to the submatch history of +\fBtag\fP (see the submatch extraction section for details). +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGN\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGN\fP should append a value that represents non\-existent input +position to the submatch history of \fBtag\fP (see the submatch +extraction section for details). +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFT\fP +A generic API primitive with one argument \fBshift\fP\&. +\fBYYSHIFT\fP should shift the current input position by +\fBshift\fP characters (the shift value may be negative). The definition +can be either function\-like or free\-form depending on the API style +(see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTSTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTSTAG\fP should shift \fBtag\fP by \fBshift\fP characters +(the shift value may be negative). +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTMTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTMTAG\fP should shift the latest value in the history +of \fBtag\fP by \fBshift\fP characters (the shift value may be negative).
+The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMAXNMATCH\fP +An integral constant equal to the maximal number of POSIX capturing groups +in a rule. It is generated with \fB/*!maxnmatch:re2c*/\fP directive. +.TP +.B \fBYYCONDTYPE\fP +The type of the condition enum. +It should be generated either with the \fB/*!types:re2c*/\fP +directive or the \fB\-t\fP \fB\-\-type\-header\fP option. +.TP +.B \fBYYGETCONDITION\fP +An API primitive with zero arguments. +It should be defined as an r\-value of type \fBYYCONDTYPE\fP that is equal to +the current condition identifier. The definition can be either function\-like +or free\-form depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYGETCONDITION:naked\fP). +.TP +.B \fBYYSETCONDITION\fP +An API primitive with one argument \fBcond\fP\&. +The meaning of \fBYYSETCONDITION\fP is to set the current condition +identifier to \fBcond\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETCONDITION@cond\fP). +.TP +.B \fBYYGETSTATE\fP +An API primitive with zero arguments. +It should be defined as an r\-value of integer type that is equal to the +current lexer state. Should be initialized to \fB\-1\fP\&. The definition can be +either function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP and \fBre2c:define:YYGETSTATE:naked\fP). +.TP +.B \fBYYSETSTATE\fP +An API primitive with one argument \fBstate\fP\&. +The meaning of \fBYYSETSTATE\fP is to set the current lexer state to +\fBstate\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETSTATE@state\fP). +.TP +.B \fBYYDEBUG\fP +A debug API primitive with two arguments. It can be used to debug the +generated code (with \fB\-d\fP \fB\-\-debug\-output\fP option). 
\fBYYDEBUG\fP should +return no value and accept two arguments: \fBstate\fP (either a DFA state +index or \fB\-1\fP) and \fBsymbol\fP (the current input symbol). +.TP +.B \fByych\fP +An l\-value of type \fBYYCTYPE\fP that stores the current input character. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByyaccept\fP +An l\-value of unsigned integral type that stores the number of the latest +matched rule. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByynmatch\fP +An l\-value of unsigned integral type that stores the number of POSIX +capturing groups in the matched rule. +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.TP +.B \fByypmatch\fP +An array of l\-values that are used to hold the tag values corresponding +to the capturing parentheses in the matching rule. Array length must be +at least \fByynmatch * 2\fP (usually \fBYYMAXNMATCH * 2\fP is a good choice). +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.UNINDENT +.SH OPTIONS +.sp +Some of the options have corresponding \fI\%configurations\fP, +others are global and cannot be changed after re2c starts reading the input file. +Debug options generally require building re2c in debug configuration. +Internal options are useful for experimenting with the algorithms used in re2c. +.INDENT 0.0 +.TP +.B \fB\-? \-\-help \-h\fP +Show help message. +.TP +.B \fB\-\-api \-\-input \fP +Specify the API used by the generated code to interface with user\-defined +code: \fBdefault\fP is the API based on pointer arithmetic (the default for +C), and \fBcustom\fP is the generic API (the default for Go and Rust). +.TP +.B \fB\-\-bit\-vectors \-b\fP +Optimize conditional jumps using bit masks. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-case\-insensitive\fP +Treat single\-quoted and double\-quoted strings as case\-insensitive.
+.TP +.B \fB\-\-case\-inverted\fP +Invert the meaning of single\-quoted and double\-quoted strings: +treat single\-quoted strings as case\-sensitive and double\-quoted strings +as case\-insensitive. +.TP +.B \fB\-\-case\-ranges\fP +Collapse consecutive cases in a switch statement into a range of the form +\fBlow ... high\fP\&. This syntax is a C/C++ language extension that is +supported by compilers like GCC, Clang and Tcc. The main advantage over +using single cases is smaller generated code and faster generation time, +although for some compilers like Tcc it also results in smaller binary size. +This option is supported only for C. +.TP +.B \fB\-\-computed\-gotos \-g\fP +Optimize conditional jumps using non\-standard \(dqcomputed goto\(dq extension +(which must be supported by the compiler). re2c generates jump tables +only in complex cases with a lot of conditional branches. Complexity +threshold can be configured with \fBcgoto:threshold\fP configuration. This +option implies \fB\-\-bit\-vectors\fP\&. It is supported only for C. +.TP +.B \fB\-\-conditions \-\-start\-conditions \-c\fP +Enable support of Flex\-like \(dqconditions\(dq: multiple interrelated lexers +within one block. This is an alternative to manually specifying different +re2c blocks connected with \fBgoto\fP or function calls. +.TP +.B \fB\-\-depfile FILE\fP +Write dependency information to \fBFILE\fP in the form of a Makefile rule +\fB : [include\-file ...]\fP\&. This allows one to +track build dependencies in the presence of \fBinclude:re2c\fP directives, +so that updating include files triggers regeneration of the output file. +This option depends on the \fB\-\-output\fP option. +.TP +.B \fB\-\-ebcdic \-\-ecb \-e\fP +Generate a lexer that reads input in EBCDIC encoding. re2c assumes that the +character range is 0 \-\- 0xFF and character size is 1 byte. +.TP +.B \fB\-\-empty\-class \fP +Define the way re2c treats empty character classes.
With \fBmatch\-empty\fP +(the default) empty class matches empty input (which is illogical, but +backwards\-compatible). With \fBmatch\-none\fP empty class always fails to match. +With \fBerror\fP empty class raises a compilation error. +.TP +.B \fB\-\-encoding\-policy \fP +Define the way re2c treats Unicode surrogates. +With \fBfail\fP re2c aborts with an error when a surrogate is encountered. +With \fBsubstitute\fP re2c silently replaces surrogates with the error code +point 0xFFFD. With \fBignore\fP (the default) re2c treats surrogates as +normal code points. The Unicode standard says that standalone surrogates +are invalid, but real\-world libraries and programs behave in different ways. +.TP +.B \fB\-\-flex\-syntax \-F\fP +Partial support for Flex syntax: in this mode named definitions don\(aqt need +the equal sign and the terminating semicolon, and when used they must be +surrounded with curly braces. Names without curly braces are treated as +double\-quoted strings. +.TP +.B \fB\-\-header \-\-type\-header \-t HEADER\fP +Generate a \fBHEADER\fP file. The contents of the file can be specified with +directives \fBheader:re2c:on\fP and \fBheader:re2c:off\fP\&. +If conditions are used the header will have a condition enum automatically +appended to it (unless there is an explicit \fBconditions:re2c\fP directive). +.TP +.B \fB\-I PATH\fP +Add \fBPATH\fP to the list of locations which are used when searching for +include files. This option is useful in combination with \fBinclude:re2c\fP +directive. re2c looks for \fBFILE\fP in the directory of the parent file and +in the include locations specified with \fB\-I\fP option. +.TP +.B \fB\-\-input\-encoding \fP +Specify the way re2c parses regular expressions. +With \fBascii\fP (the default) re2c handles input as ASCII\-encoded: any +sequence of code units is a sequence of standalone 1\-byte characters. +With \fButf8\fP re2c handles input as UTF8\-encoded and recognizes multibyte +characters. 
+.TP +.B \fB\-\-invert\-captures\fP +Invert the meaning of capturing and non\-capturing groups. By default +\fB(...)\fP is capturing and \fB(! ...)\fP is non\-capturing. With this option +\fB(! ...)\fP is capturing and \fB(...)\fP is non\-capturing. +.TP +.B \fB\-\-lang \fP +Specify the output language. Supported languages are C, Go and Rust. +The default is C for re2c, Go for re2go and Rust for re2rust. +.TP +.B \fB\-\-leftmost\-captures\fP +Enable submatch extraction with leftmost greedy capturing groups. +.TP +.B \fB\-\-location\-format \fP +Specify location format in messages. +With \fBgnu\fP locations are printed as \(aqfilename:line:column: ...\(aq. +With \fBmsvc\fP locations are printed as \(aqfilename(line,column) ...\(aq. +The default is \fBgnu\fP\&. +.TP +.B \fB\-\-loop\-switch\fP +Encode DFA in a form of a loop over a switch statement. Individual states +are switch cases. The current state is stored in a variable \fByystate\fP\&. +Transitions between states update \fByystate\fP to the case label of the +destination state and \fBcontinue\fP to the head of the loop. This option is +always enabled for Rust, as it has no \fBgoto\fP statement and cannot use the +goto/label approach which is the default for C and Go backends. +.TP +.B \fB\-\-nested\-ifs \-s\fP +Use nested \fBif\fP statements instead of \fBswitch\fP statements in conditional +jumps. This usually results in more efficient code with non\-optimizing +compilers. +.TP +.B \fB\-\-no\-debug\-info \-i\fP +Do not output line directives. This may be useful when the generated code is +stored in a version control system (to avoid huge autogenerated diffs on +small changes). This option is on by default for Rust, as it does not have +line directives. +.TP +.B \fB\-\-no\-generation\-date\fP +Suppress date output in the generated file. +.TP +.B \fB\-\-no\-version\fP +Suppress version output in the generated file. 
+.TP +.B \fB\-\-no\-unsafe\fP +Do not generate \fBunsafe\fP wrapper over \fBYYPEEK\fP (this option is specific +to Rust). For performance reasons \fBYYPEEK\fP should avoid bounds\-checking, +as the lexer already performs end\-of\-input checks in a more efficient way. +The user may choose to provide a safe \fBYYPEEK\fP definition, or a definition +that is unsafe only in release builds, in which case the \fB\-\-no\-unsafe\fP +option helps to avoid warnings about redundant \fBunsafe\fP blocks. +.TP +.B \fB\-\-output \-o OUTPUT\fP +Specify the \fBOUTPUT\fP file. +.TP +.B \fB\-\-posix\-captures \-P\fP +Enable submatch extraction with POSIX\-style capturing groups. +.TP +.B \fB\-\-reusable \-r\fP +Deprecated since version 2.2 (reusable blocks are allowed by default now). +.TP +.B \fB\-\-skeleton \-S\fP +Ignore user\-defined interface code and generate a self\-contained \(dqskeleton\(dq +program. Additionally, generate input files with strings derived from the +regular grammar and compressed match results that are used to verify +\(dqskeleton\(dq behavior on all inputs. This option is useful for finding bugs +in optimizations and code generation. This option is supported only for C. +.TP +.B \fB\-\-storable\-state \-f\fP +Generate a lexer which can store its inner state. +This is useful in push\-model lexers which are stopped by an outer program +when there is not enough input, and then resumed when more input becomes +available. In this mode users should additionally define \fBYYGETSTATE\fP +and \fBYYSETSTATE\fP primitives, and variables \fByych\fP, \fByyaccept\fP and +\fBstate\fP should be part of the stored lexer state. +.TP +.B \fB\-\-tags \-T\fP +Enable submatch extraction with tags. +.TP +.B \fB\-\-ucs2 \-\-wide\-chars \-w\fP +Generate a lexer that reads UCS2\-encoded input. re2c assumes that the +character range is 0 \-\- 0xFFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. 
+.TP +.B \fB\-\-utf8 \-\-utf\-8 \-8\fP +Generate a lexer that reads input in UTF\-8 encoding. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 1 byte. +.TP +.B \fB\-\-utf16 \-\-utf\-16 \-x\fP +Generate a lexer that reads UTF16\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf32 \-\-unicode \-u\fP +Generate a lexer that reads UTF32\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 4 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-verbose\fP +Output a short message in case of success. +.TP +.B \fB\-\-vernum \-V\fP +Show version information in \fBMMmmpp\fP format (major, minor, patch). +.TP +.B \fB\-\-version \-v\fP +Show version information. +.TP +.B \fB\-\-single\-pass \-1\fP +Deprecated. Does nothing (single pass is the default now). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-debug\-output \-d\fP +Emit \fBYYDEBUG\fP invocations in the generated code. This is useful to trace +lexer execution. +.TP +.B \fB\-\-dump\-adfa\fP +Debug option: output DFA after tunneling (in .dot format). +.TP +.B \fB\-\-dump\-cfg\fP +Debug option: output control flow graph of tag variables (in .dot format). +.TP +.B \fB\-\-dump\-closure\-stats\fP +Debug option: output statistics on the number of states in closure. +.TP +.B \fB\-\-dump\-dfa\-det\fP +Debug option: output DFA immediately after determinization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-min\fP +Debug option: output DFA after minimization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tagopt\fP +Debug option: output DFA after tag optimizations (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tree\fP +Debug option: output DFA under construction with states represented as tag +history trees (in .dot format). 
+.TP +.B \fB\-\-dump\-dfa\-raw\fP +Debug option: output DFA under construction with expanded state\-sets +(in .dot format). +.TP +.B \fB\-\-dump\-interf\fP +Debug option: output interference table produced by liveness analysis of tag +variables. +.TP +.B \fB\-\-dump\-nfa\fP +Debug option: output NFA (in .dot format). +.TP +.B \fB\-\-emit\-dot \-D\fP +Instead of normal output generate lexer graph in .dot format. +The output can be converted to an image with the help of Graphviz +(e.g. something like \fBdot \-Tpng \-odfa.png dfa.dot\fP). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-dfa\-minimization \fP +Internal option: DFA minimization algorithm used by re2c. The \fBmoore\fP +option is the Moore algorithm (it is the default). The \fBtable\fP option is +the \(dqtable filling\(dq algorithm. Both algorithms should produce the same DFA +up to states relabeling; table filling is simpler and much slower and serves +as a reference implementation. +.TP +.B \fB\-\-eager\-skip\fP +Internal option: make the generated lexer advance the input position +eagerly \-\- immediately after reading the input symbol. This changes the +default behavior when the input position is advanced lazily \-\- after +transition to the next state. +.TP +.B \fB\-\-no\-lookahead\fP +Internal option, deprecated. +It used to enable TDFA(0) algorithm. Unlike TDFA(1), TDFA(0) algorithm does +not use one\-symbol lookahead. It applies register operations to the incoming +transitions rather than the outgoing ones. Benchmarks showed that TDFA(0) +algorithm is less efficient than TDFA(1). +.TP +.B \fB\-\-no\-optimize\-tags\fP +Internal option: suppress optimization of tag variables (useful for +debugging). +.TP +.B \fB\-\-posix\-closure \fP +Internal option: specify shortest\-path algorithm used for the construction of +epsilon\-closure with POSIX disambiguation semantics: \fBgor1\fP (the default) +stands for Goldberg\-Radzik algorithm, and \fBgtop\fP stands for \(dqglobal +topological order\(dq algorithm. 
+.TP +.B \fB\-\-posix\-prectable \fP +Internal option: specify the algorithm used to compute POSIX precedence +table. The \fBcomplex\fP algorithm computes precedence table in one traversal +of tag history tree and has quadratic complexity in the number of TNFA +states; it is the default. The \fBnaive\fP algorithm has worst\-case cubic +complexity in the number of TNFA states, but it is much simpler than +\fBcomplex\fP and may be slightly faster in non\-pathological cases. +.TP +.B \fB\-\-stadfa\fP +Internal option, deprecated. +It used to enable staDFA algorithm, which differs from TDFA in that register +operations are placed in states rather than on transitions. Benchmarks +showed that staDFA algorithm is less efficient than TDFA. +.TP +.B \fB\-\-fixed\-tags \fP +Internal option: +specify whether the fixed\-tag optimization should be applied to all tags +(\fBall\fP), none of them (\fBnone\fP), or only those in toplevel concatenation +(\fBtoplevel\fP). The default is \fBall\fP\&. +\(dqFixed\(dq tags are those that are located within a fixed distance to some +other tag (called \(dqbase\(dq). In such cases only the base tag needs to be +tracked, and the value of the fixed tag can be computed as the value of the +base tag plus a static offset. For tags that are under alternative or +repetition it is also necessary to check if the base tag has a no\-match +value (in that case fixed tag should also be set to no\-match, disregarding +the offset). For tags in top\-level concatenation the check is not needed, +because they always match. +.UNINDENT +.SH WARNINGS +.sp +Warnings can be individually enabled, disabled and turned into an error. +.INDENT 0.0 +.TP +.B \fB\-W\fP +Turn on all warnings. +.TP +.B \fB\-Werror\fP +Turn warnings into errors. Note that this option alone +doesn\(aqt turn on any warnings; it only affects those warnings that have +been turned on so far or will be turned on later. +.TP +.B \fB\-W<warning>\fP +Turn on \fBwarning\fP\&.
+.TP +.B \fB\-Wno\-<warning>\fP +Turn off \fBwarning\fP\&. +.TP +.B \fB\-Werror\-<warning>\fP +Turn on \fBwarning\fP and treat it as an error (this implies \fB\-W<warning>\fP). +.TP +.B \fB\-Wno\-error\-<warning>\fP +Don\(aqt treat this particular \fBwarning\fP as an error. This doesn\(aqt turn off +the warning itself. +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-Wcondition\-order\fP +Warn if the generated program makes implicit assumptions about condition +numbering. One should use either the \fB\-\-header\fP option or the +\fBconditions:re2c\fP directive to generate a mapping of condition names to +numbers and then use the autogenerated condition names. +.TP +.B \fB\-Wempty\-character\-class\fP +Warn if a regular expression contains an empty character class. Trying to +match an empty character class makes no sense: it should always fail. +However, for backwards compatibility reasons re2c permits empty character +classes and treats them as empty strings. Use the \fB\-\-empty\-class\fP option +to change the default behavior. +.TP +.B \fB\-Wmatch\-empty\-string\fP +Warn if a rule is nullable (matches an empty string). +If the lexer runs in a loop and the empty match is unintentional, the lexer +may unexpectedly hang in an infinite loop. +.TP +.B \fB\-Wswapped\-range\fP +Warn if the lower bound of a range is greater than its upper bound. The +default behavior is to silently swap the range bounds. +.TP +.B \fB\-Wundefined\-control\-flow\fP +Warn if some input strings cause undefined control flow in the lexer (the +faulty patterns are reported). This is a dangerous and common mistake. It +can be easily fixed by adding the default rule \fB*\fP which has the lowest +priority, matches any code unit, and always consumes a single code unit. +.TP +.B \fB\-Wunreachable\-rules\fP +Warn about rules that are shadowed by other rules and will never match. +.TP +.B \fB\-Wuseless\-escape\fP +Warn if a symbol is escaped when it shouldn\(aqt be.
+By default, re2c silently ignores such escapes, but this may as well +indicate a typo or an error in the escape sequence. +.TP +.B \fB\-Wnondeterministic\-tags\fP +Warn if a tag has \fBn\fP\-th degree of nondeterminism, where \fBn\fP is greater +than 1. +.TP +.B \fB\-Wsentinel\-in\-midrule\fP +Warn if the sentinel symbol occurs in the middle of a rule \-\-\- this may +cause reads past the end of buffer, crashes or memory corruption in the +generated lexer. This warning is only applicable if the sentinel method of +checking for the end of input is used. +It is set to an error if \fBre2c:sentinel\fP configuration is used. +.UNINDENT +.SH BLOCKS AND DIRECTIVES +.sp +Below is the list of re2c directives (syntactic constructs that mark the +beginning and end of the code that should be processed by re2c). Named blocks +were added in re2c version 2.2. They are exactly the same as unnamed blocks, +except that the name can be used to reference a block in other parts of the +program. More information on each directive can be found in the related +sections. +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A global re2c block with an optional name. The block may contain named +definitions, configurations and rules in any order. Named definitions and +configurations are defined in the global scope, so they are inherited by +subsequent blocks. The code for a global block is generated at the point +where the block is specified. +.TP +.B \fB/*!local:re2c[:] ... */\fP +A local re2c block with an optional name. Unlike global blocks, definitions +and configurations inside of a local block are not added into the global +scope. In all other respects local blocks are the same as global blocks. +.TP +.B \fB/*!rules:re2c[:] ... */\fP +A reusable block with an optional name. 
Rules blocks have the same structure +as local or global blocks, but they do not produce any code and they can be +reused multiple times in other blocks with the help of a \fB!use:<name>;\fP +directive or a \fB/*!use:re2c[:<name>] ... */\fP block. A rules block on its +own does not add any definitions into the global scope. The code for it is +generated at the point of use. Prior to re2c version 2.2 rules blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB/*!use:re2c[:<name>] ... */\fP +A use block that references a previously defined rules block. If the name is +specified, re2c looks for a rules block with this name. Otherwise the most +recent rules block is used (either a named or an unnamed one). A use block +can add definitions, configurations and rules of its own, which are added to +those of the referenced rules block. Prior to re2c version 2.2 use blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB!use:<name>;\fP +An in\-block use directive that merges a previously defined rules block with +the specified name into the current block. Named definitions, configurations +and rules of the referenced block are added to the current ones. Conflicts +between overlapping rules and configurations are resolved in the usual way: +the first rule takes priority, and the latest configuration overrides the +preceding ones. One exception is the special rules \fB*\fP, \fB$\fP and \fB<!>\fP +for which a block\-local definition always takes priority. A use directive +can be placed anywhere inside of a block, and multiple use directives are +allowed. +.TP +.B \fB/*!max:re2c[:<name1>[:<name2>...]] ... */\fP +A directive that generates \fBYYMAXFILL\fP definition. +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXFILL\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXFILL <n>\fP), or a global variable for Go +(\fBvar YYMAXFILL int = <n>\fP).
It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXFILL\fP\&. +.TP +.B \fB/*!maxnmatch:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXNMATCH\fP definition (it requires +\fB\-P \-\-posix\-captures\fP option). +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXNMATCH\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXNMATCH \fP), or a global variable for Go +(\fBvar YYMAXNMATCH int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXNMATCH\fP\&. +.TP +.B \fB/*!stags:re2c[:[:...]] ... */\fP, \fB/*!mtags:re2c[:[:...]] ... */\fP +Directives that specify a template piece of code that is expanded for each +s\-tag/m\-tag variable generated by re2c. +An optional list of block names specifies which blocks should be included +when computing the set of tag variables (if the list is empty, all blocks +are included). +There are two optional configurations: \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{tag}\fP (or +\fB@@\fP for short) is replaced with the name of each tag variable. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different tag variables. +.TP +.B \fB/*!getstate:re2c[:[:...]] ... */\fP +A directive that generates conditional dispatch on the lexer state (it +requires \fB\-\-storable\-state\fP option). +An optional list of block names specifies which blocks should be included in +the state dispatch. The default transition goes to the start label of the +first block on the list. 
If the list is empty, all blocks are included, and +the default transition goes to the first block in the file that has a start +label. +This directive is incompatible with the \fB\-\-loop\-switch\fP option and Rust, +as it requires cross\-block transitions that are unsupported without the +\fBgoto\fP statement. +.TP +.B \fB/*!conditions:re2c[:[:...]] ... */\fP, \fB/*!types:re2c... */\fP +A directive that generates condition enumeration (it requires +\fB\-\-conditions\fP option). +An optional list of block names specifies which blocks should be included +when computing the set of conditions (if the list is empty, all blocks are +included). +By default the generated code is an enumeration \fBYYCONDTYPE\fP\&. It can be +customized with optional configurations \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{cond}\fP (or +\fB@@\fP for short) is replaced with the name of each condition, and +\fB@@{num}\fP is replaced with a numeric index of that condition. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different conditions. +.TP +.B \fB/*!include:re2c */\fP +This directive allows one to include \fB\fP, which must be a double\-quoted +file path. The contents of the file are literally substituted in place of +the directive, in the same way as \fB#include\fP works in C/C++. This +directive can be used together with the \fB\-\-depfile\fP option to generate +build system dependencies on the included files. +.TP +.B \fB!include ;\fP +This directive is the same as \fB/*!include:re2c */\fP, except that it +should be used inside of a re2c block. +.TP +.B \fB/*!header:re2c:on*/\fP +This directive marks the start of header file. Everything after it and up to +the following \fB/*!header:re2c:off*/\fP directive is processed by re2c and +written to the header file specified with \fB\-t \-\-type\-header\fP option. 
+.TP +.B \fB/*!header:re2c:off*/\fP +This directive marks the end of header file started with +\fB/*!header:re2c:on*/\fP\&. +.TP +.B \fB/*!ignore:re2c ... */\fP +A block whose contents are ignored and removed from the output file. +.TP +.B \fB%{ ... %}\fP +A global re2c block in the \fB\-\-flex\-support\fP mode. This is deprecated and +exists for backward compatibility. +.UNINDENT +.SH CONFIGURATIONS +.INDENT 0.0 +.TP +.B \fBre2c:api\fP, \fBre2c:flags:input\fP +Same as the \fB\-\-api\fP option. +.TP +.B \fBre2c:api:sigil\fP +Specify the marker (\(dqsigil\(dq) that is used for argument placeholders in the +API primitives. The default is \fB@@\fP\&. A placeholder starts with sigil +followed by the argument name in curly braces. For example, if sigil is set +to \fB$\fP, then placeholders will have the form \fB${name}\fP\&. Single\-argument +APIs may use shorthand notation without the name in braces. This option can +be overridden by options for individual API primitives, e.g. +\fBre2c:define:YYFILL@len\fP for \fBYYFILL\fP\&. +.TP +.B \fBre2c:api:style\fP +Specify API style. Possible values are \fBfunctions\fP (the default for C) and +\fBfree\-form\fP (the default for Go and Rust). +In \fBfunctions\fP style API primitives are generated with an argument list in +parentheses following the name of the primitive. The arguments are provided +only for autogenerated parameters (such as the number of characters passed +to \fBYYFILL\fP), but not for the general lexer context, so the primitives +behave more like macros in C/C++ or closures in Go and Rust. +In free\-form style API primitives do not have a fixed form: they should be +defined as strings containing free\-form pieces of code with interpolated +variables of the form \fB@@{var}\fP or \fB@@\fP (they correspond to arguments in +function\-like style). +This configuration may be overridden for individual API primitives, see for +example \fBre2c:define:YYFILL:naked\fP configuration for \fBYYFILL\fP\&.
+.TP +.B \fBre2c:bit\-vectors\fP, \fBre2c:flags:bit\-vectors\fP, \fBre2c:flags:b\fP +Same as the \fB\-\-bit\-vectors\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-insensitive\fP, \fBre2c:flags:case\-insensitive\fP +Same as the \fB\-\-case\-insensitive\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:case\-inverted\fP, \fBre2c:flags:case\-inverted\fP +Same as the \fB\-\-case\-inverted\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-ranges\fP, \fBre2c:flags:case\-ranges\fP +Same as the \fB\-\-case\-ranges\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos\fP, \fBre2c:flags:computed\-gotos\fP, \fBre2c:flags:g\fP +Same as the \fB\-\-computed\-gotos\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos:threshold\fP, \fBre2c:cgoto:threshold\fP +If computed \fBgoto\fP is used, this configuration specifies the complexity +threshold that triggers the generation of jump tables instead of nested +\fBif\fP statements and bitmaps. The default value is \fB9\fP\&. +.TP +.B \fBre2c:cond:goto\fP +Specifies a piece of code used for the autogenerated shortcut rules \fB:=>\fP +in conditions. The default is \fBgoto @@;\fP\&. +The \fB@@\fP placeholder is substituted with condition name (see +configurations \fBre2c:api:sigil\fP and \fBre2c:cond:goto@cond\fP). +.TP +.B \fBre2c:cond:goto@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:goto\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:divider\fP +Defines the divider for condition blocks. +The default value is \fB/* *********************************** */\fP\&. +Placeholders are substituted with condition name (see \fBre2c:api:sigil\fP and +\fBre2c:cond:divider@cond\fP).
+.TP +.B \fBre2c:cond:divider@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:divider\fP +definition. The default is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:prefix\fP, \fBre2c:condprefix\fP +Specifies the prefix used for condition labels. +The default is \fByyc_\fP\&. +.TP +.B \fBre2c:cond:enumprefix\fP, \fBre2c:condenumprefix\fP +Specifies the prefix used for condition identifiers. +The default is \fByyc\fP\&. +.TP +.B \fBre2c:debug\-output\fP, \fBre2c:flags:debug\-output\fP, \fBre2c:flags:d\fP +Same as the \fB\-\-debug\-output\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:define:YYBACKUP\fP +Defines generic API primitive \fBYYBACKUP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYBACKUPCTX\fP +Defines generic API primitive \fBYYBACKUPCTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYCONDTYPE\fP +Defines \fBYYCONDTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTYPE\fP +Defines \fBYYCTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTXMARKER\fP +Defines API primitive \fBYYCTXMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCURSOR\fP +Defines API primitive \fBYYCURSOR\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYDEBUG\fP +Defines API primitive \fBYYDEBUG\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL\fP +Defines API primitive \fBYYFILL\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL@len\fP +Specifies the sigil used for argument substitution in \fBYYFILL\fP +definition. Defaults to \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYFILL:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for \fBYYFILL\fP\&. +Zero value corresponds to free\-form API style. 
+.TP +.B \fBre2c:define:YYGETCONDITION\fP +Defines API primitive \fBYYGETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETSTATE\fP +Defines API primitive \fBYYGETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYLESSTHAN\fP +Defines generic API primitive \fBYYLESSTHAN\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYLIMIT\fP +Defines API primitive \fBYYLIMIT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMARKER\fP +Defines API primitive \fBYYMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGN\fP +Defines generic API primitive \fBYYMTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGP\fP +Defines generic API primitive \fBYYMTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYPEEK\fP +Defines generic API primitive \fBYYPEEK\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYRESTORE\fP +Defines generic API primitive \fBYYRESTORE\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORECTX\fP +Defines generic API primitive \fBYYRESTORECTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORETAG\fP +Defines generic API primitive \fBYYRESTORETAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSETCONDITION\fP +Defines API primitive \fBYYSETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETCONDITION@cond\fP +Specifies the sigil used for argument substitution in \fBYYSETCONDITION\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. 
+.TP +.B \fBre2c:define:YYSETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSETSTATE\fP +Defines API primitive \fBYYSETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETSTATE@state\fP +Specifies the sigil used for argument substitution in \fBYYSETSTATE\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYSETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSKIP\fP +Defines generic API primitive \fBYYSKIP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFT\fP +Defines generic API primitive \fBYYSHIFT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFTMTAG\fP +Defines generic API primitive \fBYYSHIFTMTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSHIFTSTAG\fP +Defines generic API primitive \fBYYSHIFTSTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSTAGN\fP +Defines generic API primitive \fBYYSTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSTAGP\fP +Defines generic API primitive \fBYYSTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:empty\-class\fP, \fBre2c:flags:empty\-class\fP +Same as the \fB\-\-empty\-class\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:encoding:ebcdic\fP, \fBre2c:flags:ecb\fP, \fBre2c:flags:e\fP +Same as the \fB\-\-ebcdic\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:ucs2\fP, \fBre2c:flags:wide\-chars\fP, \fBre2c:flags:w\fP +Same as the \fB\-\-ucs2\fP option, but can be configured on per\-block basis. 
+.TP +.B \fBre2c:encoding:utf8\fP, \fBre2c:flags:utf\-8\fP, \fBre2c:flags:8\fP +Same as the \fB\-\-utf8\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf16\fP, \fBre2c:flags:utf\-16\fP, \fBre2c:flags:x\fP +Same as the \fB\-\-utf16\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf32\fP, \fBre2c:flags:unicode\fP, \fBre2c:flags:u\fP +Same as the \fB\-\-utf32\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding\-policy\fP, \fBre2c:flags:encoding\-policy\fP +Same as the \fB\-\-encoding\-policy\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:eof\fP +Specifies the sentinel symbol used with the end\-of\-input rule \fB$\fP\&. The +default value is \fB\-1\fP (\fB$\fP rule is not used). Other possible values +include all valid code units. Only decimal numbers are recognized. +.TP +.B \fBre2c:header\fP, \fBre2c:flags:type\-header\fP, \fBre2c:flags:t\fP +Specifies the name of the generated header file relative to the directory of +the output file. Same as the \fB\-\-header\fP option except that the file path +is relative. +.TP +.B \fBre2c:indent:string\fP +Specifies the string used for indentation. The default is a single tab +character \fB\(dq\et\(dq\fP\&. Indent string should contain whitespace characters only. +To disable indentation entirely, set this configuration to an empty string. +.TP +.B \fBre2c:indent:top\fP +Specifies the minimum amount of indentation to use. The default value is +zero. The value should be a non\-negative integer number. +.TP +.B \fBre2c:invert\-captures\fP +Same as the \fB\-\-invert\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:label:prefix\fP, \fBre2c:labelprefix\fP +Specifies the prefix used for DFA state labels. The default is \fByy\fP\&. +.TP +.B \fBre2c:label:start\fP, \fBre2c:startlabel\fP +Controls the generation of a block start label. 
The default value is zero, +which means that the start label is generated only if it is used. An integer +value greater than zero forces the generation of start label even if it is +unused by the lexer. A string value also forces start label generation and +sets the label name to the specified string. This configuration applies only +to the current block (it is reset to default for the next block). +.TP +.B \fBre2c:label:yyFillLabel\fP +Specifies the prefix of \fBYYFILL\fP labels used with \fBre2c:eof\fP and in +storable state mode. +.TP +.B \fBre2c:label:yyloop\fP +Specifies the name of the label marking the start of the lexer loop with +\fB\-\-loop\-switch\fP option. The default is \fByyloop\fP\&. +.TP +.B \fBre2c:label:yyNext\fP +Specifies the name of the optional label that follows \fBYYGETSTATE\fP switch +in storable state mode (enabled with \fBre2c:state:nextlabel\fP). The default +is \fByyNext\fP\&. +.TP +.B \fBre2c:leftmost\-captures\fP +Same as the \fB\-\-leftmost\-captures\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:lookahead\fP, \fBre2c:flags:lookahead\fP +Deprecated (see the deprecated \fB\-\-no\-lookahead\fP option). +.TP +.B \fBre2c:nested\-ifs\fP, \fBre2c:flags:nested\-ifs\fP, \fBre2c:flags:s\fP +Same as the \fB\-\-nested\-ifs\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:posix\-captures\fP, \fBre2c:flags:posix\-captures\fP, \fBre2c:flags:P\fP +Same as the \fB\-\-posix\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:tags\fP, \fBre2c:flags:tags\fP, \fBre2c:flags:T\fP +Same as the \fB\-\-tags\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:tags:expression\fP +Specifies the expression used for tag variables. +By default re2c generates expressions of the form \fByyt\fP\&. This might +be inconvenient, for example if tag variables are defined as fields in a +struct. 
All occurrences of \fB@@{tag}\fP or \fB@@\fP are replaced with the +actual tag name. For example, \fBre2c:tags:expression = \(dqs.@@\(dq;\fP results +in expressions of the form \fBs.yyt\fP in the generated code. +See also \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:tags:prefix\fP +Specifies the prefix for tag variable names. The default is \fByyt\fP\&. +.TP +.B \fBre2c:sentinel\fP +Specifies the sentinel symbol used for the end\-of\-input checks (when bounds +checks are disabled with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP is not +set). This configuration does not affect code generation: its purpose is to +verify that the sentinel is not allowed in the middle of a rule, and ensure +that the lexer won\(aqt read past the end of buffer. The default value is +\fB\-1\fP (in that case re2c assumes that the sentinel is zero, which is the +most common case). Only decimal numbers are recognized. +.TP +.B \fBre2c:state:abort\fP +If set to a positive integer value, changes the default case in +\fBYYGETSTATE\fP switch: by default it aborts the program, and an explicit +\fB\-1\fP case contains transition to the start of the block. +.TP +.B \fBre2c:state:nextlabel\fP +Controls if the \fBYYGETSTATE\fP switch is followed by an \fByyNext\fP label +(the default value is zero, which corresponds to no label). +Alternatively one can use \fBre2c:label:start\fP to generate a specific start +label, or an explicit \fBgetstate:re2c\fP directive to generate the +\fBYYGETSTATE\fP switch separately from the lexer block. +.TP +.B \fBre2c:unsafe\fP, \fBre2c:flags:unsafe\fP +Same as the \fB\-\-no\-unsafe\fP option, but can be configured on per\-block +basis. +If set to zero, it suppresses the generation of \fBunsafe\fP wrappers around +\fBYYPEEK\fP\&. The default is non\-zero (wrappers are generated). +This configuration is specific to Rust. +.TP +.B \fBre2c:variable:yyaccept\fP +Specifies the name of the \fByyaccept\fP variable (see the API primitives +section).
+.TP +.B \fBre2c:variable:yybm\fP +Specifies the name of the \fByybm\fP variable (used for bitmaps). +.TP +.B \fBre2c:variable:yybm:hex\fP, \fBre2c:yybm:hex\fP +If set to non\-zero, bitmaps for the \fB\-\-bit\-vectors\fP option are generated +in hexadecimal format. The default is zero (bitmaps are in decimal format). +.TP +.B \fBre2c:variable:yych\fP +Specifies the name of the \fByych\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yych:emit\fP, \fBre2c:yych:emit\fP +If set to zero, \fByych\fP definition is not generated. +The default is non\-zero. +.TP +.B \fBre2c:variable:yych:conversion\fP, \fBre2c:yych:conversion\fP +If set to non\-zero, re2c automatically generates a conversion to \fBYYCTYPE\fP +every time \fByych\fP is read. The default is zero (no conversion). +.TP +.B \fBre2c:variable:yyctable\fP +Specifies the name of the \fByyctable\fP variable (the jump table generated +for \fBYYGETCONDITION\fP switch with \fB\-\-computed\-gotos\fP option). +.TP +.B \fBre2c:variable:yytarget\fP +Specifies the name of the \fByytarget\fP variable. +.TP +.B \fBre2c:variable:yystable\fP +Deprecated. +.TP +.B \fBre2c:variable:yystate\fP +Specifies the name of the \fByystate\fP variable (used with the +\fB\-\-loop\-switch\fP option to store the current DFA state). +.TP +.B \fBre2c:yyfill:check\fP +If set to zero, suppresses the generation of pre\-\fBYYFILL\fP check for the +number of input characters (the \fBYYLESSTHAN\fP definition in generic API and +the \fBYYLIMIT\fP\-based comparison in C pointer API). The default is non\-zero +(generate the check). +.TP +.B \fBre2c:yyfill:enable\fP +If set to zero, suppresses the generation of \fBYYFILL\fP (together +with the check). This should be used when the whole input fits into one piece +of memory (there is no need for buffering) and the end\-of\-input checks do not +rely on the \fBYYFILL\fP checks (e.g. if a sentinel character is used).
+Use warnings (\fB\-W\fP option) and \fBre2c:sentinel\fP configuration to verify +that the generated lexer cannot read past the end of input. +The default is non\-zero (\fBYYFILL\fP is enabled). +.TP +.B \fBre2c:yyfill:parameter\fP +If set to zero, suppresses the generation of parameter passed to \fBYYFILL\fP\&. +The parameter is the minimum number of characters that must be supplied. +Defaults to non\-zero (the parameter is generated). +This configuration can be overridden with \fBre2c:define:YYFILL:naked\fP or +\fBre2c:api:style\fP\&. +.UNINDENT +.SH REGULAR EXPRESSIONS +.sp +re2c uses the following syntax for regular expressions: +.INDENT 0.0 +.IP \(bu 2 +\fB\(dqfoo\(dq\fP case\-sensitive string literal +.IP \(bu 2 +\fB\(aqfoo\(aq\fP case\-insensitive string literal +.IP \(bu 2 +\fB[a\-xyz]\fP, \fB[^a\-xyz]\fP character class (possibly negated) +.IP \(bu 2 +\fB\&.\fP any character except newline +.IP \(bu 2 +\fBR \e S\fP difference of character classes \fBR\fP and \fBS\fP +.IP \(bu 2 +\fBR*\fP zero or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR+\fP one or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR?\fP optional \fBR\fP +.IP \(bu 2 +\fBR{n}\fP repetition of \fBR\fP exactly \fBn\fP times +.IP \(bu 2 +\fBR{n,}\fP repetition of \fBR\fP at least \fBn\fP times +.IP \(bu 2 +\fBR{n,m}\fP repetition of \fBR\fP from \fBn\fP to \fBm\fP times +.IP \(bu 2 +\fB(R)\fP just \fBR\fP; parentheses are used to override precedence. +If submatch extraction is enabled, \fB(R)\fP is a capturing or a +non\-capturing group depending on \fB\-\-invert\-captures\fP option. +.IP \(bu 2 +\fB(!R)\fP +If submatch extraction is enabled, \fB(!R)\fP is a non\-capturing or a +capturing group depending on \fB\-\-invert\-captures\fP option. 
+.IP \(bu 2 +\fBR S\fP concatenation: \fBR\fP followed by \fBS\fP +.IP \(bu 2 +\fBR | S\fP alternative: \fBR or S\fP +.IP \(bu 2 +\fBR / S\fP lookahead: \fBR\fP followed by \fBS\fP, but \fBS\fP is not consumed +.IP \(bu 2 +\fBname\fP the regular expression defined as \fBname\fP (or literal string +\fB\(dqname\(dq\fP in Flex compatibility mode) +.IP \(bu 2 +\fB{name}\fP the regular expression defined as \fBname\fP in Flex +compatibility mode +.IP \(bu 2 +\fB@stag\fP an \fIs\-tag\fP: saves the last input position at which \fB@stag\fP +matches in a variable named \fBstag\fP +.IP \(bu 2 +\fB#mtag\fP an \fIm\-tag\fP: saves all input positions at which \fB#mtag\fP matches +in a variable named \fBmtag\fP +.UNINDENT +.sp +Character classes and string literals may contain the following escape +sequences: \fB\ea\fP, \fB\eb\fP, \fB\ef\fP, \fB\en\fP, \fB\er\fP, \fB\et\fP, \fB\ev\fP, \fB\e\e\fP, +octal escapes \fB\eooo\fP and hexadecimal escapes \fB\exhh\fP, \fB\euhhhh\fP and +\fB\eUhhhhhhhh\fP\&. +.SH HANDLING THE END OF INPUT +.sp +One of the main problems for the lexer is to know when to stop. +There are a few terminating conditions: +.INDENT 0.0 +.IP \(bu 2 +the lexer may match some rule (including default rule \fB*\fP) and come to a +final state +.IP \(bu 2 +the lexer may fail to match any rule and come to a default state +.IP \(bu 2 +the lexer may reach the end of input +.UNINDENT +.sp +The first two conditions terminate the lexer in a \(dqnatural\(dq way: it comes to a +state with no outgoing transitions, and the matching automatically stops. The +third condition, end of input, is different: it may happen in any state, and the +lexer should be able to handle it. Checking for the end of input interrupts the +normal lexer workflow and adds conditional branches to the generated program, +therefore it is necessary to minimize the number of such checks. re2c supports a +few different methods for handling the end of input. 
Which one to use depends on +the complexity of regular expressions, the need for buffering, performance +considerations and other factors. Here is a list of methods: +.INDENT 0.0 +.IP \(bu 2 +\fBSentinel.\fP +This method eliminates the need for the end of input checks altogether. It is +simple and efficient, but limited to the case when there is a natural +\(dqsentinel\(dq character that can never occur in valid input. This character may +still occur in invalid input, but it should not be allowed by the regular +expressions, except perhaps as the last character of a rule. The sentinel is +appended at the end of input and serves as a stop signal: when the lexer reads +this character, it is either a syntax error or the end of input. In both +cases the lexer should stop. This method is used if \fBYYFILL\fP is disabled +with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP has the default value +\fB\-1\fP\&. +.nf + +.fi +.sp +.IP \(bu 2 +\fBSentinel with bounds checks.\fP +This method is generic: it allows to handle any input without restrictions on +the regular expressions. The idea is to reduce the number of end of input +checks by performing them only on certain characters. Similar to the +\(dqsentinel\(dq method, one of the characters is chosen as a \(dqsentinel\(dq and +appended at the end of input. However, there is no restriction on where the +sentinel may occur (in fact, any character can be chosen for a sentinel). +When the lexer reads this character, it additionally performs a bounds check. +If the current position is within bounds, the lexer resumes matching and +handles the sentinel as a regular character. Otherwise it invokes \fBYYFILL\fP +(unless it is disabled). If more input is supplied, the lexer will rematch the +last character and continue as if the sentinel wasn\(aqt there. Otherwise it must +be the real end of input, and the lexer stops. 
This method is used when +\fBre2c:eof\fP has non\-negative value (it should be set to the numeric value of +the sentinel). \fBYYFILL\fP is optional. +.nf + +.fi +.sp +.IP \(bu 2 +\fBBounds checks with padding.\fP +This method is generic, and it may be faster than the \(dqsentinel with bounds +checks\(dq method, but it is also more complex. The idea is to partition DFA +states into strongly connected components (SCCs) and generate a single check +per SCC for enough characters to cover the longest non\-looping path in this +SCC. This reduces the number of checks, but there is a problem with short +lexemes at the end of input, as the check requires enough characters to cover +the longest lexeme. This can be fixed by padding the input with a few fake +characters that do not form a valid lexeme suffix (so that the lexer cannot +match them). The length of padding should be \fBYYMAXFILL\fP, generated with +\fB/*!max:re2c*/\fP\&. If there is not enough input, the lexer invokes \fBYYFILL\fP +which should supply at least the required number of characters or not return. +This method is used if \fBYYFILL\fP is enabled and \fBre2c:eof\fP is \fB\-1\fP +(this is the default configuration). +.nf + +.fi +.sp +.IP \(bu 2 +\fBCustom checks.\fP +Generic API allows to override basic operations like reading a character, +which makes it possible to include the end\-of\-input checks as part of them. +This approach is error\-prone and should be used with caution. To use a custom +method, enable generic API with \fB\-\-api custom\fP or \fBre2c:api = custom;\fP and +disable default bounds checks with \fBre2c:yyfill:enable = 0;\fP or +\fBre2c:yyfill:check = 0;\fP\&. +.UNINDENT +.sp +The following subsections contain an example of each method. +.SS Sentinel +.sp +This example uses a sentinel character to handle the end of input. The program +counts space\-separated words in a null\-terminated string. 
The sentinel is null: +it is the last character of each input string, and it is not allowed in the +middle of a lexeme by any of the rules (in particular, it is not included in +character ranges where it is easy to overlook). If a null occurs in the middle +of a string, it is a syntax error and the lexer will match default rule \fB*\fP, +but it won\(aqt read past the end of input or crash (use +\fI\%\-Wsentinel\-in\-midrule\fP +warning and \fBre2c:sentinel\fP configuration to verify this). Configuration +\fBre2c:yyfill:enable = 0;\fP suppresses the generation of bounds checks and +\fBYYFILL\fP invocations. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +// Expect a null\-terminated string. +fn lex(yyinput string) int { + mut yycursor := 0 + mut count := 0 + +loop: /*!re2c + re2c:yyfill:enable = 0; + + * { return \-1 } + [\ex00] { return count } + [a\-z]+ { count += 1; unsafe { goto loop } } + [ ]+ { unsafe { goto loop } } + */ +} + +fn main() { + assert lex(\(dq\e0\(dq) == 0 + assert lex(\(dqone two three\e0\(dq) == 3 + assert lex(\(dqf0ur\e0\(dq) == \-1 +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Sentinel with bounds checks +.sp +This example uses sentinel with bounds checks to handle the end of input (this +method was added in version 1.2). The program counts space\-separated +single\-quoted strings. The sentinel character is null, which is specified with +\fBre2c:eof = 0;\fP configuration. As in the \fI\%sentinel\fP method, null is the last +character of each input string, but it is allowed in the middle of a rule (for +example, \fB\(aqaaa\e0aa\(aq\e0\fP is valid input, but \fB\(aqaaa\e0\fP is a syntax error). +Bounds checks are generated in each state that matches an input character, but +they are scoped to the branch that handles null. Bounds checks are of the form +\fBYYLIMIT <= YYCURSOR\fP or \fBYYLESSTHAN(1)\fP with generic API. 
If the check +condition is true, lexer has reached the end of input and should stop +(\fBYYFILL\fP is disabled with \fBre2c:yyfill:enable = 0;\fP as the input fits into +one buffer, see the \fI\%YYFILL with sentinel\fP section for an example that uses +\fBYYFILL\fP). Reaching the end of input opens three possibilities: if the lexer +is in the initial state it will match the end\-of\-input rule \fB$\fP, otherwise it +may fallback to a previously matched rule (including default rule \fB*\fP) or go +to a default state, causing +\fI\%\-Wundefined\-control\-flow\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +// Expects a null\-terminated string. +fn lex(yyinput string) int { + mut yycursor, mut yymarker := 0, 0 + yylimit := yyinput.len \- 1 // yylimit points at the terminating null + mut count := 0 + +loop: /*!re2c + re2c:eof = 0; + re2c:yyfill:enable = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1 } + $ { return count } + str { count += 1; unsafe { goto loop } } + [ ]+ { unsafe { goto loop } } + + */ +} + +fn main() { + assert lex(\(dq\e0\(dq) == 0 + assert lex(\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \e0\(dq) == 3 + assert lex(\(dq\(aqunterminated\e\e\(aq\e0\(dq) == \-1 +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Bounds checks with padding +.sp +This example uses bounds checks with padding to handle the end of input (this +method is enabled by default). The program counts space\-separated single\-quoted +strings. There is a padding of \fBYYMAXFILL\fP null characters appended at the end +of input, where \fBYYMAXFILL\fP value is autogenerated with \fB/*!max:re2c*/\fP\&. It +is not necessary to use null for padding \-\-\- any characters can be used as long +as they do not form a valid lexeme suffix (in this example padding should not +contain single quotes, as they may be mistaken for a suffix of a single\-quoted +string). 
There is a \(dqstop\(dq rule that matches the first padding character (null) +and terminates the lexer (note that it checks if null is at the beginning of +padding, otherwise it is a syntax error). Bounds checks are generated only in +some states that are determined by the strongly connected components of the +underlying automaton. Checks have the form \fB(YYLIMIT \- YYCURSOR) < n\fP or +\fBYYLESSTHAN(n)\fP with generic API, where \fBn\fP is the minimum number of +characters that are needed for the lexer to proceed (it also means that the next +bounds check will occur in at most \fBn\fP characters). If the check condition is +true, the lexer has reached the end of input and will invoke \fBYYFILL(n)\fP that +should either supply at least \fBn\fP input characters or not return. In this +example \fBYYFILL\fP always fails and terminates the lexer with an error (which is +fine because the input fits into one buffer). See the \fI\%YYFILL with padding\fP +section for an example that refills the input buffer with \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +/*!max:re2c*/ + +// Expects yymaxfill\-padded string. +fn lex(str string) int { + // Pad string with yymaxfill zeroes at the end. + mut yyinput := []u8{len: str.len + yymaxfill} + copy(mut &yyinput, str.bytes()) + + mut yycursor := 0 + yylimit := yyinput.len + mut count := 0 + +loop: /*!re2c + re2c:define:YYFILL = \(dqreturn \-1\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // Check that it is the sentinel, not some unexpected null. 
+ if yycursor \- 1 == str.len { return count } else { return \-1 } + } + str { count += 1; unsafe { goto loop } } + [ ]+ { unsafe { goto loop } } + * { return \-1 } + + */ +} + +fn main() { + assert lex(\(dq\(dq) == 0 + assert lex(\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq) == 3 + assert lex(\(dq\(aqunterminated\e\e\(aq\(dq) == \-1 + assert lex(\(dq\(aqunexpected \e00 null\e\e\(aq\(dq) == \-1 +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Custom checks +.sp +This example uses a custom end\-of\-input handling method based on generic API. +The program counts space\-separated single\-quoted strings. It is the same as the +\fI\%sentinel\fP example, except that the input is not null\-terminated. To cover up +for the absence of a sentinel character at the end of input, \fBYYPEEK\fP is +redefined to perform a bounds check before it reads the next input character. +This is inefficient because checks are done very often. If the check condition +fails, \fBYYPEEK\fP returns the real character, otherwise it returns a fake +sentinel character. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +// Returns \(dqfake\(dq terminating null if cursor has reached limit. +fn peek(str string, cur int) u8 { + return if cur >= str.len { u8(0) } /* fake null */ else { return str[cur] } +} + +// Expects a string without terminating null. 
+fn lex(str string) int { + mut cur := 0 + mut count := 0 + +loop: /*!re2c + re2c:api = generic; + re2c:yyfill:enable = 0; + re2c:define:YYPEEK = \(dqpeek(str, cur)\(dq; + re2c:define:YYSKIP = \(dqcur += 1\(dq; + + * { return \-1 } + [\ex00] { return count } + [a\-z]+ { count += 1; unsafe { goto loop } } + [ ]+ { unsafe { goto loop } } + + */ +} + +fn main() { + assert lex(\(dq\(dq) == 0 + assert lex(\(dqone two three\(dq) == 3 + assert lex(\(dqf0ur\(dq) == \-1 +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH BUFFER REFILLING +.sp +The need for buffering arises when the input cannot be mapped in memory all at +once: either it is too large, or it comes in a streaming fashion (like reading +from a socket). The usual technique in such cases is to allocate a fixed\-sized +memory buffer and process input in chunks that fit into the buffer. When the +current chunk is processed, it is moved out and new data is moved in. In +practice it is somewhat more complex, because lexer state consists not of a +single input position, but a set of interrelated positions: +.INDENT 0.0 +.IP \(bu 2 +cursor: the next input character to be read (\fBYYCURSOR\fP in C pointer API or +\fBYYSKIP\fP/\fBYYPEEK\fP in generic API) +.IP \(bu 2 +limit: the position after the last available input character (\fBYYLIMIT\fP in +C pointer API, implicitly handled by \fBYYLESSTHAN\fP in generic API) +.IP \(bu 2 +marker: the position of the most recent match, if any (\fBYYMARKER\fP in default +API or \fBYYBACKUP\fP/\fBYYRESTORE\fP in generic API) +.IP \(bu 2 +token: the start of the current lexeme (implicit in re2c API, as it is not +needed for the normal lexer operation and can be defined and updated by the +user) +.IP \(bu 2 +context marker: the position of the trailing context (\fBYYCTXMARKER\fP in +C pointer API or \fBYYBACKUPCTX\fP/\fBYYRESTORECTX\fP in generic API) +.IP \(bu 2 +tag variables: submatch positions (defined with \fB/*!stags:re2c*/\fP and +\fB/*!mtags:re2c*/\fP directives and 
+\fBYYSTAGP\fP/\fBYYSTAGN\fP/\fBYYMTAGP\fP/\fBYYMTAGN\fP in generic API) +.UNINDENT +.sp +Not all these are used in every case, but if used, they must be updated by +\fBYYFILL\fP\&. All active positions are contained in the segment between token and +cursor, therefore everything between buffer start and token can be discarded, +the segment from token and up to limit should be moved to the beginning of +buffer, and the free space at the end of buffer should be filled with new data. +In order to avoid frequent \fBYYFILL\fP calls it is best to fill in as many input +characters as possible (even though fewer characters might suffice to resume the +lexer). The details of \fBYYFILL\fP implementation are slightly different +depending on which EOF handling method is used: the case of EOF rule is somewhat +simpler than the case of bounds\-checking with padding. Also note that if +\fB\-f \-\-storable\-state\fP option is used, \fBYYFILL\fP has slightly different +semantics (described in the section about storable state). +.SS YYFILL with sentinel +.sp +If EOF rule is used, \fBYYFILL\fP is a function\-like primitive that accepts +no arguments and returns a value which is checked against zero. \fBYYFILL\fP +invocation is triggered by condition \fBYYLIMIT <= YYCURSOR\fP in C pointer API and +\fBYYLESSTHAN()\fP in generic API. A non\-zero return value means that \fBYYFILL\fP +has failed. A successful \fBYYFILL\fP call must supply at least one character and +adjust input positions accordingly. Limit must always be set to one after the +last input position in buffer, and the character at the limit position must be +the sentinel symbol specified by \fBre2c:eof\fP configuration. The pictures below +show the relative locations of input positions in buffer before and after +\fBYYFILL\fP call (sentinel symbol is marked with \fB#\fP, and the second picture +shows the case when there is not enough input to fill the whole buffer). 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-\-\-\-\-\-\-\-\-\-E\-> + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-\-\-\-\-\-\-\-\-\-E#\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-E (EOF) + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-E#........ + buffer, marker cursor limit + token +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses EOF rule. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +import os +import strings + +const bufsize = 4096 + +struct State { + file os.File +mut: + yyinput []u8 + yycursor int + yymarker int + yylimit int + token int + eof bool +} + +fn fill(mut st &State) int { + if st.eof { return \-1 } // unexpected EOF + + // Error: lexeme too long. In real life can reallocate a larger buffer. + if st.token < 1 { return \-2 } + + // Shift buffer contents (discard everything up to the current token). + copy(mut &st.yyinput, st.yyinput[st.token..st.yylimit]) + st.yycursor \-= st.token + st.yymarker \-= st.token + st.yylimit \-= st.token + st.token = 0 + + // Fill free space at the end of buffer with new data from file. + pos := st.file.tell() or { 0 } + if n := st.file.read_bytes_into(u64(pos), mut st.yyinput[st.yylimit..bufsize]) { + st.yylimit += n + } + st.yyinput[st.yylimit] = 0 // append sentinel symbol + + // If read less than expected, this is the end of input. 
+ st.eof = st.yylimit < bufsize + + return 0 +} + +fn lex(mut yyrecord &State) int { + mut count := 0 +loop: + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:eof = 0; + re2c:define:YYFILL = \(dqfill(mut yyrecord) == 0\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1 } + $ { return count } + str { count += 1; unsafe { goto loop } } + [ ]+ { unsafe { goto loop } } + */ +} + +fn main() { + fname := \(dqinput\(dq + content := \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq; + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + mut fw := os.create(fname)! + fw.write_string(strings.repeat_string(content, bufsize))! + fw.close() + count := 3 * bufsize // number of quoted strings written to file + + // Prepare lexer state: all offsets are at the end of buffer. + mut fr := os.open(fname)! + mut st := &State{ + file: fr, + // Sentinel at \(gayylimit\(ga offset is set to zero, which triggers YYFILL. + yyinput: []u8{len: bufsize + 1}, + yycursor: bufsize, + yymarker: bufsize, + yylimit: bufsize, + token: bufsize, + eof: false, + } + + // Run the lexer. + n := lex(mut st) + if n != count { panic(\(dqexpected $count, got $n\(dq) } + + // Cleanup: remove input file. + fr.close() + os.rm(fname)! +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS YYFILL with padding +.sp +In the default case (when EOF rule is not used) \fBYYFILL\fP is a function\-like +primitive that accepts a single argument and does not return any value. +\fBYYFILL\fP invocation is triggered by condition \fB(YYLIMIT \- YYCURSOR) < n\fP in +C pointer API and \fBYYLESSTHAN(n)\fP in generic API. The argument passed to +\fBYYFILL\fP is the minimal number of characters that must be supplied. If it +fails to do so, \fBYYFILL\fP must not return to the lexer (for that reason it is +best implemented as a macro that returns from the calling function on failure). 
+In case of a successful \fBYYFILL\fP invocation the limit position must be set +either to one after the last input position in buffer, or to the end of +\fBYYMAXFILL\fP padding (in case \fBYYFILL\fP has successfully read at least \fBn\fP +characters, but not enough to fill the entire buffer). The pictures below show +the relative locations of input positions in buffer before and after \fBYYFILL\fP +invocation (\fBYYMAXFILL\fP padding on the second picture is marked with \fB#\fP +symbols). +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F (EOF) + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F############### + buffer, marker cursor limit + token <\- YYMAXFILL \-> +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses bounds\-checking with padding. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +import os +import strings + +/*!max:re2c*/ +const bufsize = 4096 + +struct State { + file os.File +mut: + yyinput []u8 + yycursor int + yylimit int + token int + eof bool +} + +fn fill(mut st &State, need int) int { + if st.eof { return \-1 } // unexpected EOF + + // Error: lexeme too long. In real life can reallocate a larger buffer. + if st.token < need { return \-2 } + + // Shift buffer contents (discard everything up to the current token). 
+ copy(mut &st.yyinput, st.yyinput[st.token..st.yylimit]) + st.yycursor \-= st.token + st.yylimit \-= st.token + st.token = 0 + + // Fill free space at the end of buffer with new data from file. + pos := st.file.tell() or { 0 } + if n := st.file.read_bytes_into(u64(pos), mut st.yyinput[st.yylimit..bufsize]) { + st.yylimit += n + } + + // If read less than expected, this is the end of input. + if st.yylimit < bufsize { + st.eof = true + for i := 0; i < yymaxfill; i += 1 { st.yyinput[st.yylimit + i] = 0 } + st.yylimit += yymaxfill + } + + return 0 +} + +fn lex(mut yyrecord &State) int { + mut count := 0 +loop: + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:define:YYFILL = \(dqr := fill(mut yyrecord, @@); if r != 0 { return r }\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // Check that it is the sentinel, not some unexpected null. + return if yyrecord.token == (yyrecord.yylimit \- yymaxfill) { count } else { \-1 } + } + str { count += 1; unsafe { goto loop } } + [ ]+ { unsafe { goto loop } } + * { return \-1 } + */ +} + +fn main() { + fname := \(dqinput\(dq + content := \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq; + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + mut fw := os.create(fname)! + fw.write_string(strings.repeat_string(content, bufsize))! + fw.close() + count := 3 * bufsize // number of quoted strings written to file + + // Prepare lexer state: all offsets are at the end of buffer. + // This immediately triggers YYFILL, as the YYLESSTHAN condition is true. + mut fr := os.open(fname)! + mut st := &State{ + file: fr, + yyinput: []u8{len: bufsize + yymaxfill}, + yycursor: bufsize, + yylimit: bufsize, + token: bufsize, + eof: false, + } + + // Run the lexer. + n := lex(mut st) + if n != count { panic(\(dqexpected $count, got $n\(dq) } + + // Cleanup: remove input file. + fr.close() + os.rm(fname)! 
+} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH MULTIPLE BLOCKS +.sp +Sometimes it is necessary to have multiple interrelated lexers (for example, if +there is a high\-level state machine that transitions between lexer modes). This +can be implemented using multiple connected re2c blocks. Another option is to +use \fI\%start conditions\fP\&. +.sp +The implementation of connections between blocks depends on the target language. +In languages that have \fBgoto\fP statement (such as C/C++ and Go) one can have +all blocks in one function, each of them prefixed with a label. Transition from +one block to another is a simple \fBgoto\fP\&. +In languages that do not have \fBgoto\fP (such as Rust) it is necessary to use a +loop with a switch on a state variable, similar to the \fByystate\fP loop/switch +generated by re2c, or else wrap each block in a function and use function calls. +.sp +The example below uses multiple blocks to parse binary, octal, decimal and +hexadecimal numbers. Each base has its own block. The initial block determines +base and dispatches to other blocks. Common configurations are defined in a +separate block at the beginning of the program; they are inherited by the other +blocks. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT \-i + +const u32_lim = u64(1) << 32 + +fn parse_u32(yyinput string) ?u32 { + mut yycursor, mut yymarker := 0, 0 + mut n := u64(0) + mut yych := 0 + + adddgt := fn (num u64, base u64, digit u8) u64 { + n := num * base + u64(digit) + return if n >= u32_lim { u32_lim } else { n } + } + /*!re2c + re2c:yyfill:enable = 0; + re2c:variable:yych:emit = 0; + + end = \(dq\ex00\(dq; + + \(aq0b\(aq / [01] { unsafe{ goto bin } } + \(dq0\(dq { unsafe{ goto oct } } + \(dq\(dq / [1\-9] { unsafe{ goto dec } } + \(aq0x\(aq / [0\-9a\-fA\-F] { unsafe{ goto hex } } + * { return none } + */ +bin: + /*!re2c + end { unsafe{ goto end } } + [01] { n = adddgt(n, 2, yyinput[yycursor\-1] \- 48); unsafe{ goto bin } } + * { return none } + */ +oct: + /*!re2c + end { unsafe{ goto end } } + [0\-7] { n = adddgt(n, 8, yyinput[yycursor\-1] \- 48); unsafe{ goto oct } } + * { return none } + */ +dec: + /*!re2c + end { unsafe{ goto end } } + [0\-9] { n = adddgt(n, 10, yyinput[yycursor\-1] \- 48); unsafe{ goto dec } } + * { return none } + */ +hex: + /*!re2c + end { unsafe{ goto end } } + [0\-9] { n = adddgt(n, 16, yyinput[yycursor\-1] \- 48); unsafe{ goto hex } } + [a\-f] { n = adddgt(n, 16, yyinput[yycursor\-1] \- 87); unsafe{ goto hex } } + [A\-F] { n = adddgt(n, 16, yyinput[yycursor\-1] \- 55); unsafe{ goto hex } } + * { return none } + */ +end: + if n < u32_lim { + return u32(n) + } + return none +} + +fn main() { + test := fn (num ?u32, str string) { + if n := parse_u32(str) { + if m := num { if n != m { panic(\(dqwrong number\(dq) } } + } else { + if _ := num { panic(\(dqexpected none\(dq) } + } + } + test(1234567890, \(dq1234567890\e0\(dq) + test(13, \(dq0b1101\e0\(dq) + test(0x7fe, \(dq0x007Fe\e0\(dq) + test(0o644, \(dq0644\e0\(dq) + test(none, \(dq9999999999\e0\(dq) + test(none, \(dq123??\e0\(dq) +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH START CONDITIONS +.sp +Start conditions are enabled with \fB\-\-start\-conditions\fP 
option. They provide a +way to encode multiple interrelated automata within the same re2c block. +.sp +Each condition corresponds to a single automaton and has a unique name specified +by the user and a unique internal number defined by re2c. The numbers are used +to switch between conditions: the generated code uses \fBYYGETCONDITION\fP and +\fBYYSETCONDITION\fP primitives to get the current condition or set it to the +given number. Use \fB/*!conditions:re2c*/\fP directive or the \fB\-\-header\fP option +to generate numeric condition identifiers. Configuration +\fBre2c:cond:enumprefix\fP specifies the generated identifier prefix. +.sp +In condition mode every rule must be prefixed with a list of comma\-separated +condition names in angle brackets, or a wildcard \fB<*>\fP to denote all +conditions. The rule syntax is extended as follows: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB< cond\-list > regexp action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp => cond action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP, sets the current condition to \fBcond\fP and +executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp :=> cond\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and immediately transitions to \fBcond\fP (there is +no semantic action). +.TP +.B \fB<! cond\-list > action\fP +The \fBaction\fP is prepended to semantic actions of all rules for every +condition on the \fBcond\-list\fP\&. This may be used to deduplicate common +code. +.TP +.B \fB< > action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and executes the \fBaction\fP\&. 
+.TP +.B \fB< > => cond action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string, sets the current condition to +\fBcond\fP and executes the \fBaction\fP\&. +.TP +.B \fB< > :=> cond\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and immediately transitions to +\fBcond\fP\&. +.UNINDENT +.UNINDENT +.UNINDENT +.sp +The code re2c generates for conditions depends on whether re2c uses goto/label +approach or loop/switch approach to encode the automata. +.sp +In languages that have \fBgoto\fP statement (such as C/C++ and Go) conditions are +naturally implemented as blocks of code prefixed with labels of the form +\fByyc_<cond>\fP, where \fBcond\fP is a condition name (label prefix can be changed +with \fBre2c:cond:prefix\fP). Transitions between conditions are implemented using +\fBgoto\fP and condition labels. Before all conditions re2c generates an initial +switch on \fBYYGETCONDITION\fP that jumps to the start state of the current condition. +The shortcut rules \fB:=>\fP bypass the initial switch and jump directly to the +specified condition (\fBre2c:cond:goto\fP can be used to change the default +behavior). The rules with semantic actions do not automatically jump to the next +condition; this should be done by the user\-defined action code. +.sp +In languages that do not have \fBgoto\fP (such as Rust) re2c reuses the +\fByystate\fP variable to store condition numbers. Each condition gets a numeric +identifier equal to the number of its start state, and a switch between +conditions is no different than a switch between DFA states of a single +condition. There is no need for a separate initial condition switch. +(Since the same approach is used to implement storable states, +\fBYYGETCONDITION\fP/\fBYYSETCONDITION\fP are redundant if both storable states and +conditions are used). 
+.sp +The program below uses start conditions to parse binary, octal, decimal and +hexadecimal numbers. There is a single block where each base has its own +condition, and the initial condition is connected to all of them. User\-defined +variable \fBcond\fP stores the current condition number; it is initialized to the +number of the initial condition generated with \fB/*!conditions:re2c*/\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT \-ci + +/*!conditions:re2c*/ + +const u32_lim = u64(1) << 32 + +fn parse_u32(yyinput string) ?u32 { + mut yycursor, mut yymarker := 0, 0 + mut n := u64(0) + mut yycond := YYCONDTYPE.yycinit + + adddgt := fn (num u64, base u64, digit u8) u64 { + n := num * base + u64(digit) + return if n >= u32_lim { u32_lim } else { n } + } + + /*!re2c + re2c:yyfill:enable = 0; + + <*> * { return none } + + <init> \(aq0b\(aq / [01] :=> bin + <init> \(dq0\(dq :=> oct + <init> \(dq\(dq / [1\-9] :=> dec + <init> \(aq0x\(aq / [0\-9a\-fA\-F] :=> hex + + <bin, oct, dec, hex> \(dq\ex00\(dq { return if n < u32_lim { u32(n) } else { none } } + + <bin> [01] { n = adddgt(n, 2, yyinput[yycursor\-1] \- 48); unsafe{ goto yyc_bin } } + <oct> [0\-7] { n = adddgt(n, 8, yyinput[yycursor\-1] \- 48); unsafe{ goto yyc_oct } } + <dec> [0\-9] { n = adddgt(n, 10, yyinput[yycursor\-1] \- 48); unsafe{ goto yyc_dec } } + <hex> [0\-9] { n = adddgt(n, 16, yyinput[yycursor\-1] \- 48); unsafe{ goto yyc_hex } } + <hex> [a\-f] { n = adddgt(n, 16, yyinput[yycursor\-1] \- 87); unsafe{ goto yyc_hex } } + <hex> [A\-F] { n = adddgt(n, 16, yyinput[yycursor\-1] \- 55); unsafe{ goto yyc_hex } } + */ +} + +fn main() { + test := fn (num ?u32, str string) { + if n := parse_u32(str) { + if m := num { if n != m { panic(\(dqwrong number\(dq) } } + } else { + if _ := num { panic(\(dqexpected none\(dq) } + } + } + test(1234567890, \(dq1234567890\e0\(dq) + test(13, \(dq0b1101\e0\(dq) + test(0x7fe, \(dq0x007Fe\e0\(dq) + test(0o644, \(dq0644\e0\(dq) + test(none, \(dq9999999999\e0\(dq) + test(none, \(dq123??\e0\(dq) +} + +.ft P +.fi +.UNINDENT +.UNINDENT 
+.SH STORABLE STATE +.sp +With \fB\-\-storable\-state\fP option re2c generates a lexer that can store +its current state, return to the caller, and later resume operations exactly +where it left off. The default mode of operation in re2c is a \(dqpull\(dq model, +in which the lexer \(dqpulls\(dq more input whenever it needs it. This may be +unacceptable in cases when the input becomes available piece by piece (for +example, if the lexer is invoked by the parser, or if the lexer program +communicates via a socket protocol with some other program that must wait for a +reply from the lexer before it transmits the next message). Storable state +feature is intended exactly for such cases: it allows one to generate lexers that +work in a \(dqpush\(dq model. When the lexer needs more input, it stores its state and +returns to the caller. Later, when more input becomes available, the caller +resumes the lexer exactly where it stopped. There are a few changes necessary +compared to the \(dqpull\(dq model: +.INDENT 0.0 +.IP \(bu 2 +Define \fBYYGETSTATE()\fP and \fBYYSETSTATE(state)\fP primitives. +.IP \(bu 2 +Define \fByych\fP, \fByyaccept\fP (if used) and \fBstate\fP variables as a part of +persistent lexer state. The \fBstate\fP variable should be initialized to \fB\-1\fP\&. +.IP \(bu 2 +\fBYYFILL\fP should return to the outer program instead of trying to supply more +input. Return code should indicate that lexer needs more input. +.IP \(bu 2 +The outer program should recognize situations when lexer needs more input and +respond appropriately. +.IP \(bu 2 +Optionally use \fBgetstate:re2c\fP to generate \fBYYGETSTATE\fP switch detached +from the main lexer. This only works for languages that have \fBgoto\fP (not in +\fB\-\-loop\-switch\fP mode). +.IP \(bu 2 +Use \fBre2c:eof\fP and the \fI\%sentinel with bounds checks\fP method to handle the +end of input. 
Padding\-based method may not work because it is unclear when to +append padding: the current end of input may not be the ultimate end of input, +and appending padding too early may cut off a partially read greedy lexeme. +Furthermore, due to high\-level program logic getting more input may depend on +processing the lexeme at the end of buffer (which already is blocked due to +the end\-of\-input condition). +.UNINDENT +.sp +Here is an example of a \(dqpush\(dq model lexer that simulates reading packets from a +socket. The lexer loops until it encounters the end of input and returns to the +calling function. The calling function provides more input by \(dqsending\(dq the next +packet and resumes lexing. This process stops when all the packets have been +sent, or when there is an error. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v \-f $INPUT \-o $OUTPUT + +import log +import os + +// Use a small buffer to cover the case when a lexeme doesn\(aqt fit. +// In real world use a larger buffer. +const bufsize = 10 + +struct State { +mut: + file os.File + yyinput []u8 + yycursor int + yymarker int + yylimit int + token int + yystate int +} + +enum Status { + lex_end + lex_ready + lex_waiting + lex_bad_packet + lex_big_packet +} + +fn fill(mut st &State) Status { + shift := st.token + used := st.yylimit \- st.token + free := bufsize \- used + + // Error: no space. In real life can reallocate a larger buffer. + if free < 1 { return .lex_big_packet } + + // Shift buffer contents (discard already processed data). + copy(mut &st.yyinput, st.yyinput[shift..shift+used]) + st.yycursor \-= shift + st.yymarker \-= shift + st.yylimit \-= shift + st.token \-= shift + + // Fill free space at the end of buffer with new data. 
+ pos := st.file.tell() or { 0 } + if n := st.file.read_bytes_into(u64(pos), mut st.yyinput[st.yylimit..bufsize]) { + st.yylimit += n + } + st.yyinput[st.yylimit] = 0 // append sentinel symbol + + return .lex_ready +} + +fn lex(mut yyrecord &State, mut recv &int) Status { + mut yych := u8(0) + /*!getstate:re2c*/ +loop: + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:eof = 0; + re2c:define:YYFILL = \(dqreturn .lex_waiting\(dq; + + packet = [a\-z]+[;]; + + * { return .lex_bad_packet } + $ { return .lex_end } + packet { recv += 1; unsafe{ goto loop } } + */ +} + +fn test(expect Status, packets []string) { + // Create a pipe (open the same file for reading and writing). + fname := \(dqpipe\(dq + mut fw := os.create(fname) or { panic(\(dqcannot create file\(dq) } + mut fr := os.open(fname) or { panic(\(dqcannot open file\(dq) } + + // Initialize lexer state: \(gastate\(ga value is \-1, all offsets are at the end + // of buffer. + mut st := &State{ + file: fr, + // Sentinel at \(gayylimit\(ga offset is set to zero, which triggers YYFILL. + yyinput: []u8{len: bufsize + 1}, + yycursor: bufsize, + yymarker: bufsize, + yylimit: bufsize, + token: bufsize, + yystate: \-1, + } + + // Main loop. The buffer contains incomplete data which appears packet by + // packet. When the lexer needs more input it saves its internal state and + // returns to the caller which should provide more input and resume lexing. + mut status := Status.lex_ready + mut send := 0 + mut recv := 0 + for { + status = lex(mut st, mut &recv) + if status == .lex_end { + break + } else if status == .lex_waiting { + if send < packets.len { + log.debug(\(dqsending packet $send\(dq) + fw.write_string(packets[send]) or { panic(\(dqcannot write to file\(dq) } + fw.flush() + send += 1 + } + status = fill(mut st) + log.debug(\(dqfilled buffer $st.yyinput, status $status\(dq) + if status != .lex_ready { + break + } + } else if status == .lex_bad_packet { + break + } + } + + // Check results. 
+ if status != expect || (status == .lex_end && recv != send) { + panic(\(dqexpected $expect with $send packet(s), got $status with $recv packet(s)\(dq) + } + + // Cleanup: remove input file. + fr.close() + fw.close() + os.rm(fname) or { panic(\(dqcannot remove file\(dq) } +} + +fn main() { + //log.set_level(.debug) + + test(.lex_end, []) + test(.lex_end, [\(dqzero;\(dq, \(dqone;\(dq, \(dqtwo;\(dq, \(dqthree;\(dq, \(dqfour;\(dq]) + test(.lex_bad_packet, [\(dq??;\(dq]) + test(.lex_big_packet, [\(dqlooooooooooooong;\(dq]) +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH REUSABLE BLOCKS +.sp +Reusable blocks are re2c blocks that can be reused any number of times and +combined with other re2c blocks. They are defined with +\fB/*!rules:re2c[:<name>] ... */\fP (the \fB<name>\fP is optional). A rules block +can be used in two contexts: either in a use block, or in a use directive inside +of another block. The code for a rules block is generated at every point of use. +.sp +Use blocks are defined with \fB/*!use:re2c[:<name>] ... */\fP\&. The \fB<name>\fP +is optional; if not specified, the associated rules block is the most recent one +(whether named or unnamed). A use block can add named definitions, +configurations and rules of its own. +An important use case for use blocks is a lexer that supports multiple input +encodings: the same rules block is reused multiple times with encoding\-specific +configurations (see the example below). +.sp +In\-block use directive \fB!use:<name>;\fP can be used from inside of a re2c +block. It merges the referenced block \fB<name>\fP into the current one. If some +of the merged rules and configurations overlap with the previously defined ones, +conflicts are resolved in the usual way: the earliest rule takes priority, and +the latest configuration overrides preceding ones. The exception is the special +rules \fB*\fP, \fB$\fP and (in condition mode) \fB<!>\fP, for which a block\-local +definition overrides any inherited ones.
Use directive allows one to combine +different re2c blocks together in one block (see the example below). +.sp +Named blocks and in\-block use directive were added in re2c version 2.2. +Since that version reusable blocks are allowed by default (no special option +is needed). Before version 2.2 reuse mode was enabled with \fB\-r \-\-reusable\fP +option. Before version 1.2 reusable blocks could not be mixed with normal +blocks. +.SS Example of a \fB!use\fP directive +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +// This example shows how to combine reusable re2c blocks: two blocks +// (\(aqcolors\(aq and \(aqfish\(aq) are merged into one. The \(aqsalmon\(aq rule occurs +// in both blocks; the \(aqfish\(aq block takes priority because it is used +// earlier. Default rule * occurs in all three blocks; the local (not +// inherited) definition takes priority. + +enum What { + color + fish + dunno +} + +/*!rules:re2c:colors + * { panic(\(dqeh!\(dq) } + \(dqred\(dq | \(dqsalmon\(dq | \(dqmagenta\(dq { return .color } +*/ + +/*!rules:re2c:fish + * { panic(\(dqoh!\(dq) } + \(dqhaddock\(dq | \(dqsalmon\(dq | \(dqeel\(dq { return .fish } +*/ + +fn lex(yyinput string) What { + mut yycursor, mut yymarker := 0, 0 + /*!re2c + re2c:yyfill:enable = 0; + + !use:fish; + !use:colors; + * { return .dunno } // overrides inherited \(aq*\(aq rules + */ +} + +fn main() { + assert lex(\(dqsalmon\(dq) == .fish + assert lex(\(dqwhat?\(dq) == .dunno +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Example of a \fB/*!use:re2c ... */\fP block +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT \-\-input\-encoding utf8 + +// This example supports multiple input encodings: UTF\-8 and UTF\-32. +// Both lexers are generated from the same rules block, and the use +// blocks add only encoding\-specific configurations. 
+/*!rules:re2c + re2c:yyfill:enable = 0; + + \(dq∀x ∃y\(dq { return 0 } + * { return 1 } +*/ + +fn lex_utf8(yyinput []u8) int { + mut yycursor, mut yymarker := 0, 0 + /*!use:re2c + re2c:encoding:utf8 = 1; + re2c:define:YYCTYPE = u8; // the default + */ +} + +fn lex_utf32(yyinput []u32) int { + mut yycursor, mut yymarker := 0, 0 + /*!use:re2c + re2c:encoding:utf32 = 1; + re2c:define:YYCTYPE = u32; + */ +} + +fn main() { + s8 := [u8(0xe2), u8(0x88), u8(0x80), u8(0x78), u8(0x20), u8(0xe2), u8(0x88), u8(0x83), u8(0x79)] + s32 := [u32(0x2200), u32(0x78), u32(0x20), u32(0x2203), u32(0x79)] + assert lex_utf8(s8) == 0 + assert lex_utf32(s32) == 0 +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SUBMATCH EXTRACTION +.sp +re2c has two options for submatch extraction. +.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration. Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string. +\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars:re2c\fP and \fBmvars:re2c\fP directives. +If there is more than one way tags can be matched against the input, +ambiguity is resolved using leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration.
There are two +flavours for different disambiguation policies, \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy. In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted. +The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&. re2c provides a directive \fBmaxnmatch:re2c\fP +that defines \fBYYMAXNMATCH\fP, a constant that is equal to the maximum value of +\fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration. The only difference with \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2c generates variables \fByytl<k>\fP and +\fByytr<k>\fP for \fIk\fP\-th capturing group (the user should declare these with +\fBsvars:re2c\fP directive). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars\fP for POSIX longest\-match policy. +.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there\(aqre no tags or captures in a regular expression, TDFA is +just an ordinary DFA.
If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags:re2c\fP or \fBmtags:re2c\fP directives. +If the lexer state is stored, tag variables should be part of it. They also +need to be updated by \fBYYFILL\fP\&. +.sp +S\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +save input position to an s\-tag: \fBt = YYCURSOR\fP with C pointer API or a +user\-defined operation \fBYYSTAGP(t)\fP with generic API +.IP \(bu 2 +save default value to an s\-tag: \fBt = NULL\fP with C pointer API or a +user\-defined operation \fBYYSTAGN(t)\fP with generic API +.IP \(bu 2 +copy one s\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +M\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +append input position to an m\-tag: a user\-defined operation \fBYYMTAGP(t)\fP +with both default and generic API +.IP \(bu 2 +append default value to an m\-tag: a user\-defined operation \fBYYMTAGN(t)\fP +with both default and generic API +.IP \(bu 2 +copy one m\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +S\-tags can be implemented as scalar values (pointers or offsets). M\-tags need a +more complex representation, as they need to store a sequence of tag values. The +most naive and inefficient representation of an m\-tag is a list (array, vector) +of tag values; a more efficient representation is to store all m\-tags in a +prefix\-tree represented as array of nodes \fB(v, p)\fP, where \fBv\fP is tag value +and \fBp\fP is a pointer to parent node. +.sp +Here is a simple example of using s\-tags to parse semantic versions consisting +of three numeric components: major, minor, patch (the latter is optional). 
+See below for a more complex example that uses \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +struct SemVer { + major int + minor int + patch int +} + +fn s2n(s string) int { // convert pre\-parsed string to number + mut n := 0 + for c in s { n = n * 10 + int(c \- 48) } + return n +} + +fn parse(yyinput string) ?SemVer { + mut yycursor, mut yymarker := 0, 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(aqmut @@ := 0\en\(aq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(aqmut @@ := \-1\en\(aq; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0\-9]+; + + @t1 num @t2 \(dq.\(dq @t3 num @t4 (\(dq.\(dq @t5 num)? [\ex00] { + return SemVer{ + major: s2n(yyinput[t1..t2]), + minor: s2n(yyinput[t3..t4]), + patch: if t5 == \-1 { 0 } else { s2n(yyinput[t5..yycursor \- 1]) } + } + } + * { return none } + */ +} + +fn main() { + test := fn (result ?SemVer, expect ?SemVer) { + if r := result { + if e := expect { if r != e { panic(\(dqexpected $e, got $r\(dq) } } + } else { + if _ := result { panic(\(dqexpected none\(dq) } + } + } + test(parse(\(dq23.34\e0\(dq), SemVer{23, 34, 0}) + test(parse(\(dq1.2.9999\e0\(dq), SemVer{1, 2, 9999}) + test(parse(\(dq1.a\e0\(dq), none) +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is a more complex example of using s\-tags with \fBYYFILL\fP to parse a file +with newline\-separated semantic versions. Tag variables are part of the lexer +state, and they are adjusted in \fBYYFILL\fP like other input positions. +Note that it is necessary for s\-tags because their values are invalidated after +shifting buffer contents. It may not be necessary in a custom implementation +where tag variables store offsets relative to the start of the input string +rather than the buffer, which may be the case with m\-tags. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +import arrays +import os +import strings + +const bufsize = 4096 +const tag_none = \-1 + +struct State { + file os.File +mut: + yyinput []u8 + yycursor int + yymarker int + yylimit int + token int + // Intermediate tag variables must be part of the lexer state passed to YYFILL. + // They don\(aqt correspond to tags and should be autogenerated by re2c. + /*!stags:re2c format = \(dq\et@@ int\en\(dq; */ + eof bool +} + +struct SemVer { + major int + minor int + patch int +} + +fn s2n(s []u8) int { // convert pre\-parsed string to number + mut n := 0 + for c in s { n = n * 10 + int(c \- 48) } + return n +} + +fn fill(mut st &State) int { + if st.eof { return \-1 } // unexpected EOF + + // Error: lexeme too long. In real life can reallocate a larger buffer. + if st.token < 1 { return \-2 } + + // Shift buffer contents (discard everything up to the current token). + copy(mut &st.yyinput, st.yyinput[st.token..st.yylimit]) + st.yycursor \-= st.token + st.yymarker \-= st.token + st.yylimit \-= st.token + // Tag variables need to be shifted like other input positions. The check + // for \-1 is only needed if some tags are nested inside of alternative or + // repetition, so that they can have \-1 value. + /*!stags:re2c format = \(dq\etif st.@@ != \-1 { st.@@ \-= st.token }\en\(dq; */ + st.token = 0 + + // Fill free space at the end of buffer with new data from file. + pos := st.file.tell() or { 0 } + if n := st.file.read_bytes_into(u64(pos), mut st.yyinput[st.yylimit..bufsize]) { + st.yylimit += n + } + st.yyinput[st.yylimit] = 0 // append sentinel symbol + + // If read less than expected, this is the end of input. + st.eof = st.yylimit < bufsize + + return 0 +} + +fn parse(mut st &State) ?[]SemVer { + // Final tag variables available in semantic action. 
+ /*!svars:re2c format = \(dqmut @@ := tag_none\en\(dq; */ + + mut vers := []SemVer{} +loop: + st.token = st.yycursor + /*!re2c + re2c:api = record; + re2c:variable:yyrecord = st; + re2c:define:YYFILL = \(dqfill(mut st) == 0\(dq; + re2c:tags = 1; + re2c:eof = 0; + + num = [0\-9]+; + + num @t1 \(dq.\(dq @t2 num @t3 (\(dq.\(dq @t4 num)? [\en] { + ver := SemVer { + major: s2n(st.yyinput[st.token..t1]), + minor: s2n(st.yyinput[t2..t3]), + patch: if t4 == \-1 { 0 } else { s2n(st.yyinput[t4..st.yycursor \- 1]) } + } + vers = arrays.concat(vers, ver) + unsafe { goto loop } + } + $ { return vers } + * { return none } + */ +} + +fn main() { + fname := \(dqinput\(dq + content := \(dq1.22.333\en\(dq; + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + mut fw := os.create(fname)! + fw.write_string(strings.repeat_string(content, bufsize))! + fw.close() + + // Prepare lexer state: all offsets are at the end of buffer. + mut fr := os.open(fname)! + mut st := &State{ + file: fr, + // Sentinel at \(gayylimit\(ga offset is set to zero, which triggers YYFILL. + yyinput: []u8{len: bufsize + 1}, + yycursor: bufsize, + yymarker: bufsize, + yylimit: bufsize, + token: bufsize, + eof: false, + } + + // Run the lexer. + expect := []SemVer{len: bufsize, init: SemVer{1, 22, 333}} + result := parse(mut st) or { panic(\(dqparse failed\(dq) } + if result != expect { panic(\(dqerror\(dq) } + + // Cleanup: remove input file. + fr.close() + os.rm(fname)! +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using capturing groups to parse semantic versions. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +struct SemVer { + major int + minor int + patch int +} + +fn s2n(s string) int { // convert pre\-parsed string to number + mut n := 0 + for c in s { n = n * 10 + int(c \- 48) } + return n +} + +fn parse(yyinput string) ?SemVer { + mut yycursor, mut yymarker := 0, 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(aqmut @@ := 0\en\(aq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(aqmut @@ := 0\en\(aq; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:captvars = 1; + + num = [0\-9]+; + + (num) \(dq.\(dq (num) (\(dq.\(dq num)? [\ex00] { + _ := yytl0; _ := yytr0 // some variables are unused + return SemVer { + major: s2n(yyinput[yytl1..yytr1]), + minor: s2n(yyinput[yytl2..yytr2]), + patch: if yytl3 == \-1 {0} else {s2n(yyinput[yytl3 + 1..yytr3])} + } + } + * { return none } + */ +} + +fn main() { + test := fn (result ?SemVer, expect ?SemVer) { + if r := result { + if e := expect { if r != e { panic(\(dqexpected $e, got $r\(dq) } } + } else { + if _ := result { panic(\(dqexpected none\(dq) } + } + } + test(parse(\(dq23.34\e0\(dq), SemVer{23, 34, 0}) + test(parse(\(dq1.2.9999\e0\(dq), SemVer{1, 2, 9999}) + test(parse(\(dq1.a\e0\(dq), none) +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using m\-tags to parse a version with a variable number of +components. Tag variables are stored in a trie. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT + +import arrays + +const mtag_root = \-1 +const tag_none = \-1 + +// An m\-tag tree is a way to store histories with an O(1) copy operation. +// Histories naturally form a tree, as they have common start and fork at some +// point. The tree is stored as an array of pairs (tag value, link to parent). +// An m\-tag is represented with a single link in the tree (array index). 
+struct MtagElem { + elem int + pred int +} +type MtagTrie = []MtagElem + +// Append a single value to an m\-tag history. +fn add_mtag(mut trie &MtagTrie, mtag int, value int) int { + trie = arrays.concat(trie, MtagElem{value, mtag}) + return trie.len \- 1 +} + +// Recursively unwind tag histories and collect version components. +fn unwind(trie MtagTrie, x int, y int, str string) []int { + // Reached the root of the m\-tag tree, stop recursion. + if x == mtag_root && y == mtag_root { + return [] + } + + // Unwind history further. + mut result := unwind(trie, trie[x].pred, trie[y].pred, str) + + // Get tag values. Tag histories must have equal length. + if x == mtag_root || y == mtag_root { + panic(\(dqtag histories have different length\(dq) + } + ex := trie[x].elem + ey := trie[y].elem + + if ex != tag_none && ey != tag_none { + // Both tags are valid string indices, extract component. + result = arrays.concat(result, s2n(str[ex..ey])) + } else if !(ex == tag_none && ey == tag_none) { + panic(\(dqboth tags should be tag_none\(dq) + } + return result +} + +fn s2n(s string) int { // convert pre\-parsed string to number + mut n := 0 + for c in s { n = n * 10 + int(c \- 48) } + return n +} + +fn parse(yyinput string) ?[]int { + mut yycursor, mut yymarker := 0, 0 + mut trie := []MtagElem{} + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(aqmut @@ := tag_none\en\(aq; */ + /*!mvars:re2c format = \(dqmut @@ := mtag_root\en\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). 
+ /*!stags:re2c format = \(aqmut @@ := tag_none\en\(aq; */ + /*!mtags:re2c format = \(dqmut @@ := mtag_root\en\(dq; */ + + /*!re2c + re2c:tags = 1; + re2c:yyfill:enable = 0; + re2c:define:YYMTAGP = \(dq@@ = add_mtag(mut &trie, @@, yycursor)\(dq; + re2c:define:YYMTAGN = \(dq@@ = add_mtag(mut &trie, @@, tag_none)\(dq; + + num = [0\-9]+; + + @t1 num @t2 (\(dq.\(dq #t3 num #t4)* [\ex00] { + mut ver := []int{} + ver = arrays.concat(ver, s2n(yyinput[t1..t2])) + ver = arrays.append(ver, unwind(trie, t3, t4, yyinput)) + return ver + } + * { return none } + */ +} + +fn main() { + test := fn (result ?[]int, expect ?[]int) { + if r := result { + if e := expect { if r != e { panic(\(dqexpected $e, got $r\(dq) } } + } else { + if _ := result { panic(\(dqexpected none\(dq) } + } + } + test(parse(\(dq1\e0\(dq), [1]) + test(parse(\(dq1.2.3.4.5.6.7\e0\(dq), [1, 2, 3, 4, 5, 6, 7]) + test(parse(\(dq1.\e0\(dq), none) +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH ENCODING SUPPORT +.sp +It is necessary to understand the difference between \fBcode points\fP and +\fBcode units\fP\&. A code point is a numeric identifier of a symbol. A code unit is +the smallest unit of storage in the encoded text. A single code point may be +represented with one or more code units. In a fixed\-length encoding all code +points are represented with the same number of code units. In a variable\-length +encoding code points may be represented with a different number of code units. +Note that the \(dqany\(dq rule \fB[^]\fP matches any code point, but not necessarily +any code unit (the only way to match any code unit regardless of the encoding +is the default rule \fB*\fP). +The generated lexer works with a stream of code units: \fByych\fP stores a code +unit, and \fBYYCTYPE\fP is the code unit type. Regular expressions, on the other +hand, are specified in terms of code points. When re2c compiles regular +expressions to automata it translates code points to code units. 
This is +generally not a simple mapping: in variable\-length encodings a single code point +range may get translated to a complex code unit graph. +The following encodings are supported: +.INDENT 0.0 +.IP \(bu 2 +\fBASCII\fP (enabled by default). It is a fixed\-length encoding with code space +\fB[0\-255]\fP and 1\-byte code points and code units. +.IP \(bu 2 +\fBEBCDIC\fP (enabled with \fB\-\-ebcdic\fP or \fBre2c:encoding:ebcdic\fP). It is a +fixed\-length encoding with code space \fB[0\-255]\fP and 1\-byte code points and +code units. +.IP \(bu 2 +\fBUCS2\fP (enabled with \fB\-\-ucs2\fP or \fBre2c:encoding:ucs2\fP). It is a +fixed\-length encoding with code space \fB[0\-0xFFFF]\fP and 2\-byte code points +and code units. +.IP \(bu 2 +\fBUTF8\fP (enabled with \fB\-\-utf8\fP or \fBre2c:encoding:utf8\fP). It is a +variable\-length Unicode encoding. Code unit size is 1 byte. Code points are +represented with 1 \-\- 4 code units. +.IP \(bu 2 +\fBUTF16\fP (enabled with \fB\-\-utf16\fP or \fBre2c:encoding:utf16\fP). It is a +variable\-length Unicode encoding. Code unit size is 2 bytes. Code points are +represented with 1 \-\- 2 code units. +.IP \(bu 2 +\fBUTF32\fP (enabled with \fB\-\-utf32\fP or \fBre2c:encoding:utf32\fP). It is a +fixed\-length Unicode encoding with code space \fB[0\-0x10FFFF]\fP and 4\-byte code +points and code units. +.UNINDENT +.sp +Include file \fBinclude/unicode_categories.re\fP provides re2c definitions for the +standard Unicode categories. +.sp +Option \fB\-\-input\-encoding\fP specifies source file encoding, which can be used to +enable Unicode literals in regular expressions. For example +\fB\-\-input\-encoding utf8\fP tells re2c that the source file is in UTF8 (it differs +from \fB\-\-utf8\fP which sets input text encoding). Option \fB\-\-encoding\-policy\fP +specifies the way re2c handles Unicode surrogates (code points in range +\fB[0xD800\-0xDFFF]\fP). +.sp +Below is an example of a lexer for UTF8 encoded Unicode identifiers. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT \-\-utf8 \-si + +/*!include:re2c \(dqunicode_categories.re\(dq */ + +fn lex(yyinput string) int { + mut yycursor, mut yymarker := 0, 0 + /*!re2c + re2c:yyfill:enable = 0; + + // Simplified \(dqUnicode Identifier and Pattern Syntax\(dq + // (see https://unicode.org/reports/tr31) + id_start = L | Nl | [$_]; + id_continue = id_start | Mn | Mc | Nd | Pc | [\eu200D\eu05F3]; + identifier = id_start id_continue*; + + identifier { return 0 } + * { return 1 } + */ +} + +fn main() { + if lex(\(dq_Ыдентификатор\e0\(dq) != 0 { + panic(\(dqerror\(dq) + } +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH INCLUDE FILES +.sp +re2c allows one to include other files using directive \fB/*!include:re2c FILE */\fP +or \fB!include FILE ;\fP, where \fBFILE\fP is a path to the file to be included. +The first form should be used outside of re2c blocks, and the second form allows +one to include a file in the middle of a re2c block. re2c looks for included +files in the directory of the including file and in include locations, which +can be specified with \fB\-I\fP option. +Include directives in re2c work in the same way as C/C++ \fB#include\fP: the contents +of \fBFILE\fP are copy\-pasted verbatim in place of the directive. Include files +may have further includes of their own. Use \fB\-\-depfile\fP option to track build +dependencies of the output file on include files. +re2c provides some predefined include files that can be found in the +\fBinclude/\fP subdirectory of the project. These files contain definitions that +can be useful to other projects (such as Unicode categories) and form something +like a standard library for re2c. +Below is an example of using include directive. 
+.SS Include file 1 (definitions.v) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +enum Result { + ok + fail +} + +/*!re2c + number = [1\-9][0\-9]*; +*/ + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Include file 2 (extra_rules.re.inc) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// floating\-point numbers +frac = [0\-9]* \(dq.\(dq [0\-9]+ | [0\-9]+ \(dq.\(dq; +exp = \(aqe\(aq [+\-]? [0\-9]+; +float = frac exp? | [0\-9]+ exp; + +float { return .ok } + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT \-i + +/*!include:re2c \(dqdefinitions.v\(dq */ + +fn lex(yyinput string) Result { + mut yycursor, mut yymarker := 0, 0 + /*!re2c + re2c:yyfill:enable = 0; + + * { return .fail } + number { return .ok } + !include \(dqextra_rules.re.inc\(dq; + */ +} + +fn main() { + assert lex(\(dq123\e0\(dq) == .ok + assert lex(\(dq123.4567\e0\(dq) == .ok +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH HEADER FILES +.sp +re2c allows one to generate a header file from the input \fB\&.re\fP file using option +\fB\-t\fP, \fB\-\-type\-header\fP or configuration \fBre2c:flags:type\-header\fP and +directives \fB/*!header:re2c:on*/\fP and \fB/*!header:re2c:off*/\fP\&. The first directive +marks the beginning of the header file, and the second directive marks the end of +it. Everything between these directives is processed by re2c, and the generated +code is written to the file specified by the \fB\-t \-\-type\-header\fP option (or +\fBstdout\fP if this option was not used). Autogenerated header file may be needed +in cases when re2c is used to generate definitions of constants, variables and +structs that must be visible from other translation units. +.sp +Here is an example of generating a header file that contains definition of the +lexer state with tag variables (the number of variables depends on the regular +grammar and is unknown to the programmer).
+.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2v $INPUT \-o $OUTPUT \-i \-\-header lexer/state.v +module main + +import lexer // the package is generated by re2c + +/*!header:re2c:on*/ +module lexer + +pub struct State { +pub mut: + yyinput string + yycursor int + /*!stags:re2c format=\(dq@@ int\en\(dq; */ +} +/*!header:re2c:off*/ + +fn lex(mut yyrecord &lexer.State) int { + mut t := 0 + /*!re2c + re2c:header = \(dqlexer/state.v\(dq; + re2c:api = record; + re2c:yyfill:enable = 0; + re2c:tags = 1; + + [a]* @t [b]* { return t } + */ +} + +fn main() { + mut st := &lexer.State{yyinput:\(dqab\e0\(dq,} + if lex(mut st) != 1 { + panic(\(dqerror\(dq) + } +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Header file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// Code generated by re2c, DO NOT EDIT. + +module lexer + +pub struct State { +pub mut: + yyinput string + yycursor int + +yyt1 int +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SKELETON PROGRAMS +.sp +With the \fB\-S, \-\-skeleton\fP option, re2c ignores all non\-re2c code and generates +a self\-contained C program that can be further compiled and executed. The +program consists of lexer code and input data. For each constructed DFA (block +or condition) re2c generates a standalone lexer and two files: an \fB\&.input\fP +file with strings derived from the DFA and a \fB\&.keys\fP file with expected match +results. The program runs each lexer on the corresponding \fB\&.input\fP file and +compares results with the expectations. +Skeleton programs are very useful for a number of reasons: +.INDENT 0.0 +.IP \(bu 2 +They can check correctness of various re2c optimizations (the data is +generated early in the process, before any DFA transformations have taken +place). +.IP \(bu 2 +Generating a set of input data with good coverage may be useful for both +testing and benchmarking. 
+.IP \(bu 2 +Generating self\-contained executable programs allows one to get minimized test +cases (the original code may be large or have a lot of dependencies). +.UNINDENT +.sp +The difficulty with generating input data is that for all but the most trivial +cases the number of possible input strings is too large (even if the string +length is limited). re2c solves this difficulty by generating sufficiently +many strings to cover almost all DFA transitions. It uses the following +algorithm. First, it constructs a skeleton of the DFA. For encodings with 1\-byte +code unit size (such as ASCII, UTF\-8 and EBCDIC) skeleton is just an exact copy +of the original DFA. For encodings with multibyte code units skeleton is a copy +of DFA with certain transitions omitted: namely, re2c takes at most 256 code +units for each disjoint continuous range that corresponds to a DFA transition. +The chosen values are evenly distributed and include range bounds. Instead of +trying to cover all possible paths in the skeleton (which is infeasible) re2c +generates sufficiently many paths to cover all skeleton transitions, and thus +trigger the corresponding conditional jumps in the lexer. +The algorithm implementation is limited by ~1Gb of transitions and consumes +constant amount of memory (re2c writes data to file as soon as it is generated). +.SH VISUALIZATION AND DEBUG +.sp +With the \fB\-D, \-\-emit\-dot\fP option, re2c does not generate code. Instead, +it dumps the generated DFA in DOT format. +One can convert this dump to an image of the DFA using Graphviz or another library. +Note that this option shows the final DFA after it has gone through a number of +optimizations and transformations. Earlier stages can be dumped with various debug +options, such as \fB\-\-dump\-nfa\fP, \fB\-\-dump\-dfa\-raw\fP etc. (see the full list of options). +.SH SEE ALSO +.sp +You can find more information about re2c at the official website: \fI\%http://re2c.org\fP\&. 
+Similar programs are flex(1), lex(1), quex(\fI\%http://quex.sourceforge.net\fP). +.SH AUTHORS +.sp +re2c was originally written by Peter Bumbulis (\fI\%peter@csg.uwaterloo.ca\fP) in 1993. +Marcus Boerger and Dan Nuffer spent several years to turn the original idea into +a production ready code generator. Since then it has been maintained and +developed by multiple volunteers, most notably, +Brian Young (\fI\%bayoung@acm.org\fP), +\fI\%Marcus Boerger\fP, +Dan Nuffer (\fI\%nuffer@users.sourceforge.net\fP), +\fI\%Ulya Trofimovich\fP (\fI\%skvadrik@gmail.com\fP), +\fI\%Serghei Iakovlev\fP, +\fI\%Sergei Trofimovich\fP, +\fI\%Petr Skocik\fP, +\fI\%ligfx\fP +and \fI\%raekye\fP\&. +.\" Generated by docutils manpage writer. +. diff --git a/bootstrap/doc/re2zig.1 b/bootstrap/doc/re2zig.1 new file mode 100644 index 000000000..a725e39cc --- /dev/null +++ b/bootstrap/doc/re2zig.1 @@ -0,0 +1,3530 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "RE2C" 1 "" "" +.SH NAME +re2c \- generate fast lexical analyzers for C/C++, Go and Rust +.SH SYNOPSIS +.sp +Note: This manual is for Zig, but it refers to re2c as the general program. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +re2c [ OPTIONS ] [ WARNINGS ] INPUT +re2go [ OPTIONS ] [ WARNINGS ] INPUT +re2rust [ OPTIONS ] [ WARNINGS ] INPUT +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Input can be either a file or \fB\-\fP for stdin. +.SH INTRODUCTION +.sp +re2c works as a preprocessor. It reads the input file (which is usually a +program in the target language, but can be anything) and looks for blocks of +code enclosed in special\-form comments. The text outside of these blocks is +copied verbatim into the output file. The contents of the blocks are processed +by re2c. It translates them to code in the target language and outputs the +generated code in place of the block. +.sp +Here is an example of a small program that checks if a given string contains a +decimal number: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +fn lex(yyinput: [:0]const u8) bool { + var yycursor: u32 = 0; + %{ + re2c:yyfill:enable = 0; + + number = [1\-9][0\-9]*; + + number { return true; } + * { return false; } + %} +} + +test { + try std.testing.expect(lex(\(dq1234\(dq)); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +In the output everything between \fB/*!re2c\fP and \fB*/\fP has been replaced with +the generated code: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// Generated by re2zig +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +fn lex(yyinput: [:0]const u8) bool { + var yycursor: u32 = 0; + + var yych: u8 = 0; + var yystate: u32 = 0; + yyl: while (true) { + switch (yystate) { + 0 => { + yych = yyinput[yycursor]; + yycursor += 1; + switch (yych) { + 0x31...0x39 => { + yystate = 2; + continue :yyl; + }, + else => { + yystate = 1; + continue :yyl; + }, + } + }, + 1 => { return false; }, + 2 => { + yych = yyinput[yycursor]; + switch (yych) { + 0x30...0x39 => { + yycursor += 1; + yystate = 2; + continue :yyl; + }, + else => { + yystate = 3; + continue :yyl; + }, + } + }, + 3 => { return 
true; }, + else => { @panic(\(dqinternal lexer error\(dq); }, + } + } + +} + +test { + try std.testing.expect(lex(\(dq1234\(dq)); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SYNTAX +.sp +A re2c program consists of a sequence of \fIblocks\fP intermixed with code in the +target language. There are three main kinds of blocks: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A \fIglobal block\fP contains definitions, configurations, directives and rules. +re2c compiles regular expressions associated with each rule into a +deterministic finite automaton, encodes it in the form of conditional jumps +in the target language and replaces the block with the generated code. Names +and configurations defined in a global block are added to the global scope +and become visible to subsequent blocks. At the start of the program the +global scope is initialized with command\-line \fI\%options\fP\&. +The \fB:\fP part is optional: if specified, the name can be used to +refer to the block in another part of the program. +.TP +.B \fB/*!local:re2c[:] ... */\fP +A \fIlocal block\fP is like a global block, but the names and configurations in +it have local scope (they do not affect other blocks). +.TP +.B \fB/*!rules:re2c[:] ... */\fP +A \fIrules block\fP is like a local block, but it does not generate any code and +is meant to be reused in other blocks. This is a way of sharing code +(more details in the \fI\%reusable blocks\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.sp +There are also many auxiliary blocks; see section \fI\%blocks and directives\fP for a +full list of them. A block may contain the following kinds of statements: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB = ;\fP +A \fIdefinition\fP binds a name to a regular expression. Names may contain +alphanumeric characters and underscore. The \fI\%regular expressions\fP section +gives an overview of re2c syntax for regular expressions. 
Once defined, the +name can be used in other regular expressions and in rules. Recursion in +named definitions is not allowed, and each name should be defined before it +is used. A block inherits named definitions from the global scope. +Redefining a name that exists in the current scope is an error. +.TP +.B \fB = ;\fP +A \fIconfiguration\fP allows one to change re2c behavior and customize the +generated code. For a full list of configurations supported by re2c see the +\fI\%configurations\fP section. Depending on a particular configuration, the +value can be a keyword, a nonnegative integer number or a one\-line string +which should be enclosed in double or single quotes unless it consists of +alphanumeric characters. A block inherits configurations from the global +scope and may redefine them or add new ones. Configurations defined inside +of a block affect the whole block, even if they appear at the end of it. +.TP +.B \fB { }\fP +A \fIrule\fP binds a regular expression to a semantic action (a block of code in +the target language). If the regular expression matches, the associated +semantic action is executed. If multiple rules match, the longest match +takes precedence. If multiple rules match the same string, the earliest one +takes precedence. There are two special rules: the default rule \fB*\fP and +the end of input rule \fB$\fP\&. The default rule should always be defined, it +has the lowest priority regardless of its place in the block, and it matches +any code unit (not necessarily a valid character, see the +\fI\%encoding support\fP section). The end of input rule should be defined if the +corresponding method for \fI\%handling the end of input\fP is used. If +\fI\%start conditions\fP are used, rules have more complex syntax. +.TP +.B \fB!;\fP +A \fIdirective\fP is one of the special predefined statements. Each directive +has a unique purpose. 
For example, the \fB!use\fP directive merges a rules +block into the current one (see the \fI\%reusable blocks\fP section), and the +\fB!include\fP directive allows one to include an outer file (see the +\fI\%include files\fP section). +.UNINDENT +.UNINDENT +.UNINDENT +.SH PROGRAM INTERFACE (API) +.sp +The generated code interfaces with the outer program with the help of +\fIprimitives\fP, collectively referred to as the \fIAPI\fP\&. +Which primitives should be defined for a particular program depends on multiple +factors, including the complexity of regular expressions, input representation, +buffering and the use of various features. All the necessary primitives should +be defined by the user in the form of macros, functions, variables or any other +suitable form that makes the generated code syntactically and semantically +correct. re2c does not (and cannot) check the definitions, so if anything is +missing or defined incorrectly, the generated program may have compile\-time or +run\-time errors. +This manual provides examples of API definitions in the most common cases. +.sp +re2zig has three API flavors that define the core set of primitives used by a +program: +.INDENT 0.0 +.TP +.B \fBSimple API\fP +This is the default API for the Zig backend. It consists of the following +primitives: \fBYYINPUT\fP (which should be defined as a sequence of code +units, e.g. a string) and \fBYYCURSOR\fP, \fBYYMARKER\fP, \fBYYCTXMARKER\fP, +\fBYYLIMIT\fP (which should be defined as indices in \fBYYINPUT\fP). +.nf + +.fi +.sp +.TP +.B \fBRecord API\fP +Record API is useful in cases when lexer state must be stored in a struct. +It is enabled with \fB\-\-api record\fP option or \fBre2c:api = record\fP +configuration. 
This API consists of a variable \fByyrecord\fP (the +name can be overridden with \fBre2c:variable:yyrecord\fP) that should be +defined as a struct with fields \fByyinput\fP, \fByycursor\fP, \fByymarker\fP, +\fByyctxmarker\fP, \fByylimit\fP (only the fields used by the generated code +need to be defined, and their names can be configured). +.nf + +.fi +.sp +.TP +.B \fBGeneric API\fP +This is the most flexible API. It is enabled with \fB\-\-api generic\fP option +or \fBre2c:api = generic\fP configuration. +It contains primitives for generic operations: +\fBYYPEEK\fP, +\fBYYSKIP\fP, +\fBYYBACKUP\fP, +\fBYYBACKUPCTX\fP, +\fBYYSTAGP\fP, +\fBYYSTAGN\fP, +\fBYYMTAGP\fP, +\fBYYMTAGN\fP, +\fBYYRESTORE\fP, +\fBYYRESTORECTX\fP, +\fBYYRESTORETAG\fP, +\fBYYSHIFT\fP, +\fBYYSHIFTSTAG\fP, +\fBYYSHIFTMTAG\fP, +\fBYYLESSTHAN\fP\&. +.UNINDENT +.sp +Here is a full list of API primitives that may be used by the generated code in +order to interface with the outer program. +.INDENT 0.0 +.TP +.B \fBYYCTYPE\fP +The type of the input characters (code units). +For ASCII, EBCDIC and UTF\-8 encodings it should be 1\-byte unsigned integer. +For UTF\-16 or UCS\-2 it should be 2\-byte unsigned integer. For UTF\-32 it +should be 4\-byte unsigned integer. +.TP +.B \fBYYCURSOR\fP +A pointer\-like l\-value that stores the current input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYCURSOR\fP should point to the +first input character. It is advanced by the generated code. +When a rule matches, \fBYYCURSOR\fP points to the position after the +last matched character. It is used only in C pointer API. +.TP +.B \fBYYLIMIT\fP +A pointer\-like r\-value that stores the end of input position (usually a +pointer of type \fBYYCTYPE*\fP). Initially \fBYYLIMIT\fP should point to the +position after the last available input character. It is not changed by the +generated code. The lexer compares \fBYYCURSOR\fP to \fBYYLIMIT\fP +in order to determine if there are enough input characters left. 
+\fBYYLIMIT\fP is used only in C pointer API. +.TP +.B \fBYYMARKER\fP +A pointer\-like l\-value (usually a pointer of type \fBYYCTYPE*\fP) +that stores the position of the latest matched rule. It is used to +restore the \fBYYCURSOR\fP position if the longer match fails and +the lexer needs to rollback. Initialization is not +needed. \fBYYMARKER\fP is used only in C pointer API. +.TP +.B \fBYYCTXMARKER\fP +A pointer\-like l\-value that stores the position of the trailing context +(usually a pointer of type \fBYYCTYPE*\fP). No initialization is needed. +It is used only in C pointer API, and only with the lookahead operator +\fB/\fP\&. +.TP +.B \fBYYFILL\fP +A generic API primitive with one argument \fBlen\fP\&. +\fBYYFILL\fP should provide at least \fBlen\fP more input characters or fail. +If \fBre2c:eof\fP is used, then \fBlen\fP is always \fB1\fP and \fBYYFILL\fP should +always return to the calling function; zero return value indicates success. +If \fBre2c:eof\fP is not used, then \fBYYFILL\fP return value is ignored and it +should not return on failure. The maximum value of \fBlen\fP is \fBYYMAXFILL\fP\&. +The definition of \fBYYFILL\fP can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYFILL:naked\fP). +.TP +.B \fBYYMAXFILL\fP +An integral constant equal to the maximum value of the argument to +\fBYYFILL\fP\&. It can be generated with \fB/*!max:re2c*/\fP directive. +.TP +.B \fBYYLESSTHAN\fP +A generic API primitive with one argument \fBlen\fP\&. +It should be defined as an r\-value of boolean type that equals \fBtrue\fP if +and only if there are less than \fBlen\fP input characters left. +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYPEEK\fP +A generic API primitive with no arguments. +It should be defined as an r\-value of type \fBYYCTYPE\fP that is equal to the +character at the current input position. 
The definition can be either +function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP). +.TP +.B \fBYYSKIP\fP +A generic API primitive with no arguments. +\fBYYSKIP\fP should advance the current input position by one +character. The definition can be either function\-like or free\-form +depending on the API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUP\fP +A generic API primitive with no arguments. +\fBYYBACKUP\fP should save the current input position, which is +later restored with \fBYYRESTORE\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORE\fP +A generic API primitive with no arguments. +\fBYYRESTORE\fP should restore the current input position to the +value saved by \fBYYBACKUP\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYBACKUPCTX\fP +A generic API primitive with zero arguments. +\fBYYBACKUPCTX\fP should save the current input position as the +position of the trailing context, which is later restored by +\fBYYRESTORECTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORECTX\fP +A generic API primitive with no arguments. +\fBYYRESTORECTX\fP should restore the trailing context position +saved with \fBYYBACKUPCTX\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYRESTORETAG\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYRESTORETAG\fP should restore the trailing context position +to the value of \fBtag\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). 
+.TP +.B \fBYYSTAGP\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGP\fP should set \fBtag\fP to the current input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSTAGN\fP +A generic API primitive with one argument \fBtag\fP, where \fBtag\fP can be a +pointer or an offset (see submatch extraction section for details). +\fBYYSTAGN\fP should set \fBtag\fP to a value that represents non\-existent +input position. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGP\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGP\fP should append the current position to the submatch history of +\fBtag\fP (see the submatch extraction section for details.) +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMTAGN\fP +A generic API primitive with one argument \fBtag\fP\&. +\fBYYMTAGN\fP should append a value that represents non\-existent input +position to the submatch history of \fBtag\fP (see the submatch +extraction section for details.) +The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFT\fP +A generic API primitive with one argument \fBshift\fP\&. +\fBYYSHIFT\fP should shift the current input position by +\fBshift\fP characters (the shift value may be negative). The definition +can be either function\-like or free\-form depending on the API style +(see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTSTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTSTAG\fP should shift \fBtag\fP by \fBshift\fP characters +(the shift value may be negative). 
+The definition can be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYSHIFTMTAG\fP +A generic API primitive with two arguments, \fBtag\fP and \fBshift\fP\&. +\fBYYSHIFTMTAG\fP should shift the latest value in the history +of \fBtag\fP by \fBshift\fP characters (the shift value may be negative). +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP). +.TP +.B \fBYYMAXNMATCH\fP +An integral constant equal to the maximal number of POSIX capturing groups +in a rule. It is generated with \fB/*!maxnmatch:re2c*/\fP directive. +.TP +.B \fBYYCONDTYPE\fP +The type of the condition enum. +It should be generated either with the \fB/*!types:re2c*/\fP +directive or the \fB\-t\fP \fB\-\-type\-header\fP option. +.TP +.B \fBYYGETCONDITION\fP +An API primitive with zero arguments. +It should be defined as an r\-value of type \fBYYCONDTYPE\fP that is equal to +the current condition identifier. The definition can be either function\-like +or free\-form depending on the API style (see \fBre2c:api:style\fP and +\fBre2c:define:YYGETCONDITION:naked\fP). +.TP +.B \fBYYSETCONDITION\fP +An API primitive with one argument \fBcond\fP\&. +The meaning of \fBYYSETCONDITION\fP is to set the current condition +identifier to \fBcond\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETCONDITION@cond\fP). +.TP +.B \fBYYGETSTATE\fP +An API primitive with zero arguments. +It should be defined as an r\-value of integer type that is equal to the +current lexer state. Should be initialized to \fB\-1\fP\&. The definition can be +either function\-like or free\-form depending on the API style (see +\fBre2c:api:style\fP and \fBre2c:define:YYGETSTATE:naked\fP). +.TP +.B \fBYYSETSTATE\fP +An API primitive with one argument \fBstate\fP\&. 
+The meaning of \fBYYSETSTATE\fP is to set the current lexer state to +\fBstate\fP\&. +The definition should be either function\-like or free\-form depending on the +API style (see \fBre2c:api:style\fP and \fBre2c:define:YYSETSTATE@state\fP). +.TP +.B \fBYYDEBUG\fP +A debug API primitive with two arguments. It can be used to debug the +generated code (with \fB\-d\fP \fB\-\-debug\-output\fP option). \fBYYDEBUG\fP should +return no value and accept two arguments: \fBstate\fP (either a DFA state +index or \fB\-1\fP) and \fBsymbol\fP (the current input symbol). +.TP +.B \fByych\fP +An l\-value of type \fBYYCTYPE\fP that stores the current input character. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByyaccept\fP +An l\-value of unsigned integral type that stores the number of the latest +matched rule. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByynmatch\fP +An l\-value of unsigned integral type that stores the number of POSIX +capturing groups in the matched rule. +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.TP +.B \fByypmatch\fP +An array of l\-values that are used to hold the tag values corresponding +to the capturing parentheses in the matching rule. Array length must be +at least \fByynmatch * 2\fP (usually \fBYYMAXNMATCH * 2\fP is a good choice). +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.UNINDENT +.SH OPTIONS +.sp +Some of the options have corresponding \fI\%configurations\fP, +others are global and cannot be changed after re2c starts reading the input file. +Debug options generally require building re2c in debug configuration. +Internal options are useful for experimenting with the algorithms used in re2c. +.INDENT 0.0 +.TP +.B \fB\-? \-\-help \-h\fP +Show help message. 
+.TP +.B \fB\-\-api \-\-input \fP +Specify the API used by the generated code to interface with user\-defined +code: \fBdefault\fP is the API based on pointer arithmetic (the default for +C), and \fBcustom\fP is the generic API (the default for Go and Rust). +.TP +.B \fB\-\-bit\-vectors \-b\fP +Optimize conditional jumps using bit masks. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-case\-insensitive\fP +Treat single\-quoted and double\-quoted strings as case\-insensitive. +.TP +.B \fB\-\-case\-inverted\fP +Invert the meaning of single\-quoted and double\-quoted strings: +treat single\-quoted strings as case\-sensitive and double\-quoted strings +as case\-insensitive. +.TP +.B \fB\-\-case\-ranges\fP +Collapse consecutive cases in a switch statement into a range of the form +\fBlow ... high\fP\&. This syntax is a C/C++ language extension that is +supported by compilers like GCC, Clang and Tcc. The main advantage over +using single cases is smaller generated code and faster generation time, +although for some compilers like Tcc it also results in smaller binary size. +This option is supported only for C. +.TP +.B \fB\-\-computed\-gotos \-g\fP +Optimize conditional jumps using non\-standard \(dqcomputed goto\(dq extension +(which must be supported by the compiler). re2c generates jump tables +only in complex cases with a lot of conditional branches. Complexity +threshold can be configured with \fBcgoto:threshold\fP configuration. This +option implies \fB\-\-bit\-vectors\fP\&. It is supported only for C. +.TP +.B \fB\-\-conditions \-\-start\-conditions \-c\fP +Enable support of Flex\-like \(dqconditions\(dq: multiple interrelated lexers +within one block. This is an alternative to manually specifying different +re2c blocks connected with \fBgoto\fP or function calls. +.TP +.B \fB\-\-depfile FILE\fP +Write dependency information to \fBFILE\fP in the form of a Makefile rule +\fB : [include\-file ...]\fP\&. 
This allows one to +track build dependencies in the presence of \fBinclude:re2c\fP directives, +so that updating include files triggers regeneration of the output file. +This option depends on the \fB\-\-output\fP option. +.TP +.B \fB\-\-ebcdic \-\-ecb \-e\fP +Generate a lexer that reads input in EBCDIC encoding. re2c assumes that the +character range is 0 \-\- 0xFF and character size is 1 byte. +.TP +.B \fB\-\-empty\-class \fP +Define the way re2c treats empty character classes. With \fBmatch\-empty\fP +(the default) empty class matches empty input (which is illogical, but +backwards\-compatible). With \fBmatch\-none\fP empty class always fails to match. +With \fBerror\fP empty class raises a compilation error. +.TP +.B \fB\-\-encoding\-policy \fP +Define the way re2c treats Unicode surrogates. +With \fBfail\fP re2c aborts with an error when a surrogate is encountered. +With \fBsubstitute\fP re2c silently replaces surrogates with the error code +point 0xFFFD. With \fBignore\fP (the default) re2c treats surrogates as +normal code points. The Unicode standard says that standalone surrogates +are invalid, but real\-world libraries and programs behave in different ways. +.TP +.B \fB\-\-flex\-syntax \-F\fP +Partial support for Flex syntax: in this mode named definitions don\(aqt need +the equal sign and the terminating semicolon, and when used they must be +surrounded with curly braces. Names without curly braces are treated as +double\-quoted strings. +.TP +.B \fB\-\-header \-\-type\-header \-t HEADER\fP +Generate a \fBHEADER\fP file. The contents of the file can be specified with +directives \fBheader:re2c:on\fP and \fBheader:re2c:off\fP\&. +If conditions are used the header will have a condition enum automatically +appended to it (unless there is an explicit \fBconditions:re2c\fP directive). +.TP +.B \fB\-I PATH\fP +Add \fBPATH\fP to the list of locations which are used when searching for +include files. 
This option is useful in combination with \fBinclude:re2c\fP +directive. re2c looks for \fBFILE\fP in the directory of the parent file and +in the include locations specified with \fB\-I\fP option. +.TP +.B \fB\-\-input\-encoding \fP +Specify the way re2c parses regular expressions. +With \fBascii\fP (the default) re2c handles input as ASCII\-encoded: any +sequence of code units is a sequence of standalone 1\-byte characters. +With \fButf8\fP re2c handles input as UTF8\-encoded and recognizes multibyte +characters. +.TP +.B \fB\-\-invert\-captures\fP +Invert the meaning of capturing and non\-capturing groups. By default +\fB(...)\fP is capturing and \fB(! ...)\fP is non\-capturing. With this option +\fB(! ...)\fP is capturing and \fB(...)\fP is non\-capturing. +.TP +.B \fB\-\-lang \fP +Specify the output language. Supported languages are C, Go and Rust. +The default is C for re2c, Go for re2go and Rust for re2rust. +.TP +.B \fB\-\-leftmost\-captures\fP +Enable submatch extraction with leftmost greedy capturing groups. +.TP +.B \fB\-\-location\-format \fP +Specify location format in messages. +With \fBgnu\fP locations are printed as \(aqfilename:line:column: ...\(aq. +With \fBmsvc\fP locations are printed as \(aqfilename(line,column) ...\(aq. +The default is \fBgnu\fP\&. +.TP +.B \fB\-\-loop\-switch\fP +Encode DFA in a form of a loop over a switch statement. Individual states +are switch cases. The current state is stored in a variable \fByystate\fP\&. +Transitions between states update \fByystate\fP to the case label of the +destination state and \fBcontinue\fP to the head of the loop. This option is +always enabled for Rust, as it has no \fBgoto\fP statement and cannot use the +goto/label approach which is the default for C and Go backends. +.TP +.B \fB\-\-nested\-ifs \-s\fP +Use nested \fBif\fP statements instead of \fBswitch\fP statements in conditional +jumps. This usually results in more efficient code with non\-optimizing +compilers. 
+.TP +.B \fB\-\-no\-debug\-info \-i\fP +Do not output line directives. This may be useful when the generated code is +stored in a version control system (to avoid huge autogenerated diffs on +small changes). This option is on by default for Rust, as it does not have +line directives. +.TP +.B \fB\-\-no\-generation\-date\fP +Suppress date output in the generated file. +.TP +.B \fB\-\-no\-version\fP +Suppress version output in the generated file. +.TP +.B \fB\-\-no\-unsafe\fP +Do not generate \fBunsafe\fP wrapper over \fBYYPEEK\fP (this option is specific +to Rust). For performance reasons \fBYYPEEK\fP should avoid bounds\-checking, +as the lexer already performs end\-of\-input checks in a more efficient way. +The user may choose to provide a safe \fBYYPEEK\fP definition, or a definition +that is unsafe only in release builds, in which case the \fB\-\-no\-unsafe\fP +option helps to avoid warnings about redundant \fBunsafe\fP blocks. +.TP +.B \fB\-\-output \-o OUTPUT\fP +Specify the \fBOUTPUT\fP file. +.TP +.B \fB\-\-posix\-captures \-P\fP +Enable submatch extraction with POSIX\-style capturing groups. +.TP +.B \fB\-\-reusable \-r\fP +Deprecated since version 2.2 (reusable blocks are allowed by default now). +.TP +.B \fB\-\-skeleton \-S\fP +Ignore user\-defined interface code and generate a self\-contained \(dqskeleton\(dq +program. Additionally, generate input files with strings derived from the +regular grammar and compressed match results that are used to verify +\(dqskeleton\(dq behavior on all inputs. This option is useful for finding bugs +in optimizations and code generation. This option is supported only for C. +.TP +.B \fB\-\-storable\-state \-f\fP +Generate a lexer which can store its inner state. +This is useful in push\-model lexers which are stopped by an outer program +when there is not enough input, and then resumed when more input becomes +available. 
In this mode users should additionally define \fBYYGETSTATE\fP +and \fBYYSETSTATE\fP primitives, and variables \fByych\fP, \fByyaccept\fP and +\fBstate\fP should be part of the stored lexer state. +.TP +.B \fB\-\-tags \-T\fP +Enable submatch extraction with tags. +.TP +.B \fB\-\-ucs2 \-\-wide\-chars \-w\fP +Generate a lexer that reads UCS2\-encoded input. re2c assumes that the +character range is 0 \-\- 0xFFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf8 \-\-utf\-8 \-8\fP +Generate a lexer that reads input in UTF\-8 encoding. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 1 byte. +.TP +.B \fB\-\-utf16 \-\-utf\-16 \-x\fP +Generate a lexer that reads UTF16\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf32 \-\-unicode \-u\fP +Generate a lexer that reads UTF32\-encoded input. re2c assumes that the +character range is 0 \-\- 0x10FFFF and character size is 4 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-verbose\fP +Output a short message in case of success. +.TP +.B \fB\-\-vernum \-V\fP +Show version information in \fBMMmmpp\fP format (major, minor, patch). +.TP +.B \fB\-\-version \-v\fP +Show version information. +.TP +.B \fB\-\-single\-pass \-1\fP +Deprecated. Does nothing (single pass is the default now). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-debug\-output \-d\fP +Emit \fBYYDEBUG\fP invocations in the generated code. This is useful to trace +lexer execution. +.TP +.B \fB\-\-dump\-adfa\fP +Debug option: output DFA after tunneling (in .dot format). +.TP +.B \fB\-\-dump\-cfg\fP +Debug option: output control flow graph of tag variables (in .dot format). +.TP +.B \fB\-\-dump\-closure\-stats\fP +Debug option: output statistics on the number of states in closure. 
+.TP +.B \fB\-\-dump\-dfa\-det\fP +Debug option: output DFA immediately after determinization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-min\fP +Debug option: output DFA after minimization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tagopt\fP +Debug option: output DFA after tag optimizations (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tree\fP +Debug option: output DFA under construction with states represented as tag +history trees (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-raw\fP +Debug option: output DFA under construction with expanded state\-sets +(in .dot format). +.TP +.B \fB\-\-dump\-interf\fP +Debug option: output interference table produced by liveness analysis of tag +variables. +.TP +.B \fB\-\-dump\-nfa\fP +Debug option: output NFA (in .dot format). +.TP +.B \fB\-\-emit\-dot \-D\fP +Instead of normal output generate lexer graph in .dot format. +The output can be converted to an image with the help of Graphviz +(e.g. something like \fBdot \-Tpng \-odfa.png dfa.dot\fP). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-dfa\-minimization \fP +Internal option: DFA minimization algorithm used by re2c. The \fBmoore\fP +option is the Moore algorithm (it is the default). The \fBtable\fP option is +the \(dqtable filling\(dq algorithm. Both algorithms should produce the same DFA +up to states relabeling; table filling is simpler and much slower and serves +as a reference implementation. +.TP +.B \fB\-\-eager\-skip\fP +Internal option: make the generated lexer advance the input position +eagerly \-\- immediately after reading the input symbol. This changes the +default behavior when the input position is advanced lazily \-\- after +transition to the next state. +.TP +.B \fB\-\-no\-lookahead\fP +Internal option, deprecated. +It used to enable TDFA(0) algorithm. Unlike TDFA(1), TDFA(0) algorithm does +not use one\-symbol lookahead. It applies register operations to the incoming +transitions rather than the outgoing ones. 
Benchmarks showed that TDFA(0) +algorithm is less efficient than TDFA(1). +.TP +.B \fB\-\-no\-optimize\-tags\fP +Internal option: suppress optimization of tag variables (useful for +debugging). +.TP +.B \fB\-\-posix\-closure \fP +Internal option: specify shortest\-path algorithm used for the construction of +epsilon\-closure with POSIX disambiguation semantics: \fBgor1\fP (the default) +stands for Goldberg\-Radzik algorithm, and \fBgtop\fP stands for \(dqglobal +topological order\(dq algorithm. +.TP +.B \fB\-\-posix\-prectable \fP +Internal option: specify the algorithm used to compute POSIX precedence +table. The \fBcomplex\fP algorithm computes precedence table in one traversal +of tag history tree and has quadratic complexity in the number of TNFA +states; it is the default. The \fBnaive\fP algorithm has worst\-case cubic +complexity in the number of TNFA states, but it is much simpler than +\fBcomplex\fP and may be slightly faster in non\-pathological cases. +.TP +.B \fB\-\-stadfa\fP +Internal option, deprecated. +It used to enable staDFA algorithm, which differs from TDFA in that register +operations are placed in states rather than on transitions. Benchmarks +showed that staDFA algorithm is less efficient than TDFA. +.TP +.B \fB\-\-fixed\-tags \fP +Internal option: +specify whether the fixed\-tag optimization should be applied to all tags +(\fBall\fP), none of them (\fBnone\fP), or only those in toplevel concatenation +(\fBtoplevel\fP). The default is \fBall\fP\&. +\(dqFixed\(dq tags are those that are located within a fixed distance to some +other tag (called \(dqbase\(dq). In such cases only the base tag needs to be +tracked, and the value of the fixed tag can be computed as the value of the +base tag plus a static offset. For tags that are under alternative or +repetition it is also necessary to check if the base tag has a no\-match +value (in that case fixed tag should also be set to no\-match, disregarding +the offset). 
For tags in top\-level concatenation the check is not needed, +because they always match. +.UNINDENT +.SH WARNINGS +.sp +Warnings can be individually enabled, disabled and turned into an error. +.INDENT 0.0 +.TP +.B \fB\-W\fP +Turn on all warnings. +.TP +.B \fB\-Werror\fP +Turn warnings into errors. Note that this option alone +doesn\(aqt turn on any warnings; it only affects those warnings that have +been turned on so far or will be turned on later. +.TP +.B \fB\-W\fP +Turn on \fBwarning\fP\&. +.TP +.B \fB\-Wno\-\fP +Turn off \fBwarning\fP\&. +.TP +.B \fB\-Werror\-\fP +Turn on \fBwarning\fP and treat it as an error (this implies \fB\-W\fP). +.TP +.B \fB\-Wno\-error\-\fP +Don\(aqt treat this particular \fBwarning\fP as an error. This doesn\(aqt turn off +the warning itself. +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-Wcondition\-order\fP +Warn if the generated program makes implicit assumptions about condition +numbering. One should use either the \fB\-\-header\fP option or the +\fBconditions:re2c\fP directive to generate a mapping of condition names to +numbers and then use the autogenerated condition names. +.TP +.B \fB\-Wempty\-character\-class\fP +Warn if a regular expression contains an empty character class. Trying to +match an empty character class makes no sense: it should always fail. +However, for backwards compatibility reasons re2c permits empty character +classes and treats them as empty strings. Use the \fB\-\-empty\-class\fP option +to change the default behavior. +.TP +.B \fB\-Wmatch\-empty\-string\fP +Warn if a rule is nullable (matches an empty string). +If the lexer runs in a loop and the empty match is unintentional, the lexer +may unexpectedly hang in an infinite loop. +.TP +.B \fB\-Wswapped\-range\fP +Warn if the lower bound of a range is greater than its upper bound. The +default behavior is to silently swap the range bounds. 
+.TP +.B \fB\-Wundefined\-control\-flow\fP +Warn if some input strings cause undefined control flow in the lexer (the +faulty patterns are reported). This is a dangerous and common mistake. It +can be easily fixed by adding the default rule \fB*\fP which has the lowest +priority, matches any code unit, and always consumes a single code unit. +.TP +.B \fB\-Wunreachable\-rules\fP +Warn about rules that are shadowed by other rules and will never match. +.TP +.B \fB\-Wuseless\-escape\fP +Warn if a symbol is escaped when it shouldn\(aqt be. +By default, re2c silently ignores such escapes, but this may as well +indicate a typo or an error in the escape sequence. +.TP +.B \fB\-Wnondeterministic\-tags\fP +Warn if a tag has \fBn\fP\-th degree of nondeterminism, where \fBn\fP is greater +than 1. +.TP +.B \fB\-Wsentinel\-in\-midrule\fP +Warn if the sentinel symbol occurs in the middle of a rule \-\-\- this may +cause reads past the end of buffer, crashes or memory corruption in the +generated lexer. This warning is only applicable if the sentinel method of +checking for the end of input is used. +It is set to an error if \fBre2c:sentinel\fP configuration is used. +.UNINDENT +.SH BLOCKS AND DIRECTIVES +.sp +Below is the list of re2c directives (syntactic constructs that mark the +beginning and end of the code that should be processed by re2c). Named blocks +were added in re2c version 2.2. They are exactly the same as unnamed blocks, +except that the name can be used to reference a block in other parts of the +program. More information on each directive can be found in the related +sections. +.INDENT 0.0 +.TP +.B \fB/*!re2c[:] ... */\fP +A global re2c block with an optional name. The block may contain named +definitions, configurations and rules in any order. Named definitions and +configurations are defined in the global scope, so they are inherited by +subsequent blocks. The code for a global block is generated at the point +where the block is specified. 
+.TP +.B \fB/*!local:re2c[:<name>] ... */\fP +A local re2c block with an optional name. Unlike global blocks, definitions +and configurations inside of a local block are not added into the global +scope. In all other respects local blocks are the same as global blocks. +.TP +.B \fB/*!rules:re2c[:<name>] ... */\fP +A reusable block with an optional name. Rules blocks have the same structure +as local or global blocks, but they do not produce any code and they can be +reused multiple times in other blocks with the help of a \fB!use:<name>;\fP +directive or a \fB/*!use:re2c[:<name>] ... */\fP block. A rules block on its +own does not add any definitions into the global scope. The code for it is +generated at the point of use. Prior to re2c version 2.2 rules blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB/*!use:re2c[:<name>] ... */\fP +A use block that references a previously defined rules block. If the name is +specified, re2c looks for a rules block with this name. Otherwise the most +recent rules block is used (either a named or an unnamed one). A use block +can add definitions, configurations and rules of its own, which are added to +those of the referenced rules block. Prior to re2c version 2.2 use blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB!use:<name>;\fP +An in\-block use directive that merges a previously defined rules block with +the specified name into the current block. Named definitions, configurations +and rules of the referenced block are added to the current ones. Conflicts +between overlapping rules and configurations are resolved in the usual way: +the first rule takes priority, and the latest configuration overrides the +preceding ones. One exception is the special rules \fB*\fP, \fB$\fP and \fB<!>\fP +for which a block\-local definition always takes priority. A use directive +can be placed anywhere inside of a block, and multiple use directives are +allowed. +.TP +.B \fB/*!max:re2c[:<name1>[:<name2>...]] ... */\fP +A directive that generates \fBYYMAXFILL\fP definition.
+An optional list of block names specifies which blocks should be included +when computing \fBYYMAXFILL\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXFILL \fP), or a global variable for Go +(\fBvar YYMAXFILL int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXFILL\fP\&. +.TP +.B \fB/*!maxnmatch:re2c[:[:...]] ... */\fP +A directive that generates \fBYYMAXNMATCH\fP definition (it requires +\fB\-P \-\-posix\-captures\fP option). +An optional list of block names specifies which blocks should be included +when computing \fBYYMAXNMATCH\fP value (if the list is empty, all blocks are +included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXNMATCH \fP), or a global variable for Go +(\fBvar YYMAXNMATCH int = \fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXNMATCH\fP\&. +.TP +.B \fB/*!stags:re2c[:[:...]] ... */\fP, \fB/*!mtags:re2c[:[:...]] ... */\fP +Directives that specify a template piece of code that is expanded for each +s\-tag/m\-tag variable generated by re2c. +An optional list of block names specifies which blocks should be included +when computing the set of tag variables (if the list is empty, all blocks +are included). +There are two optional configurations: \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{tag}\fP (or +\fB@@\fP for short) is replaced with the name of each tag variable. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different tag variables. +.TP +.B \fB/*!getstate:re2c[:[:...]] ... 
*/\fP +A directive that generates conditional dispatch on the lexer state (it +requires \fB\-\-storable\-state\fP option). +An optional list of block names specifies which blocks should be included in +the state dispatch. The default transition goes to the start label of the +first block on the list. If the list is empty, all blocks are included, and +the default transition goes to the first block in the file that has a start +label. +This directive is incompatible with the \fB\-\-loop\-switch\fP option and Rust, +as it requires cross\-block transitions that are unsupported without the +\fBgoto\fP statement. +.TP +.B \fB/*!conditions:re2c[:[:...]] ... */\fP, \fB/*!types:re2c... */\fP +A directive that generates condition enumeration (it requires +\fB\-\-conditions\fP option). +An optional list of block names specifies which blocks should be included +when computing the set of conditions (if the list is empty, all blocks are +included). +By default the generated code is an enumeration \fBYYCONDTYPE\fP\&. It can be +customized with optional configurations \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{cond}\fP (or +\fB@@\fP for short) is replaced with the name of each condition, and +\fB@@{num}\fP is replaced with a numeric index of that condition. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different conditions. +.TP +.B \fB/*!include:re2c */\fP +This directive allows one to include \fB\fP, which must be a double\-quoted +file path. The contents of the file are literally substituted in place of +the directive, in the same way as \fB#include\fP works in C/C++. This +directive can be used together with the \fB\-\-depfile\fP option to generate +build system dependencies on the included files. +.TP +.B \fB!include ;\fP +This directive is the same as \fB/*!include:re2c */\fP, except that it +should be used inside of a re2c block. 
+.TP +.B \fB/*!header:re2c:on*/\fP +This directive marks the start of header file. Everything after it and up to +the following \fB/*!header:re2c:off*/\fP directive is processed by re2c and +written to the header file specified with \fB\-t \-\-type\-header\fP option. +.TP +.B \fB/*!header:re2c:off*/\fP +This directive marks the end of header file started with +\fB/*!header:re2c:on*/\fP\&. +.TP +.B \fB/*!ignore:re2c ... */\fP +A block whose contents are ignored and removed from the output file. +.TP +.B \fB%{ ... %}\fP +A global re2c block in the \fB\-\-flex\-support\fP mode. This is deprecated and +exists for backward compatibility. +.UNINDENT +.SH CONFIGURATIONS +.INDENT 0.0 +.TP +.B \fBre2c:api\fP, \fBre2c:flags:input\fP +Same as the \fB\-\-api\fP option. +.TP +.B \fBre2c:api:sigil\fP +Specify the marker (\(dqsigil\(dq) that is used for argument placeholders in the +API primitives. The default is \fB@@\fP\&. A placeholder starts with sigil +followed by the argument name in curly braces. For example, if sigil is set +to \fB$\fP, then placeholders will have the form \fB${name}\fP\&. Single\-argument +APIs may use shorthand notation without the name in braces. This option can +be overridden by options for individual API primitives, e.g. +\fBre2c:define:YYFILL@len\fP for \fBYYFILL\fP\&. +.TP +.B \fBre2c:api:style\fP +Specify API style. Possible values are \fBfunctions\fP (the default for C) and +\fBfree\-form\fP (the default for Go and Rust). +In \fBfunctions\fP style API primitives are generated with an argument list in +parentheses following the name of the primitive. The arguments are provided +only for autogenerated parameters (such as the number of characters passed +to \fBYYFILL\fP), but not for the general lexer context, so the primitives +behave more like macros in C/C++ or closures in Go and Rust.
+In free\-form style API primitives do not have a fixed form: they should be +defined as strings containing free\-form pieces of code with interpolated +variables of the form \fB@@{var}\fP or \fB@@\fP (they correspond to arguments in +function\-like style). +This configuration may be overridden for individual API primitives, see for +example \fBre2c:define:YYFILL:naked\fP configuration for \fBYYFILL\fP\&. +.TP +.B \fBre2c:bit\-vectors\fP, \fBre2c:flags:bit\-vectors\fP, \fBre2c:flags:b\fP +Same as the \fB\-\-bit\-vectors\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-insensitive\fP, \fBre2c:flags:case\-insensitive\fP +Same as the \fB\-\-case\-insensitive\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:case\-inverted\fP, \fBre2c:flags:case\-inverted\fP +Same as the \fB\-\-case\-inverted\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-ranges\fP, \fBre2c:flags:case\-ranges\fP +Same as the \fB\-\-case\-ranges\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos\fP, \fBre2c:flags:computed\-gotos\fP, \fBre2c:flags:g\fP +Same as the \fB\-\-computed\-gotos\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos:threshold\fP, \fBre2c:cgoto:threshold\fP +If computed \fBgoto\fP is used, this configuration specifies the complexity +threshold that triggers the generation of jump tables instead of nested +\fBif\fP statements and bitmaps. The default value is \fB9\fP\&. +.TP +.B \fBre2c:cond:goto\fP +Specifies a piece of code used for the autogenerated shortcut rules \fB:=>\fP +in conditions. The default is \fBgoto @@;\fP\&. +The \fB@@\fP placeholder is substituted with condition name (see +configurations \fBre2c:api:sigil\fP and \fBre2c:cond:goto@cond\fP). +.TP +.B \fBre2c:cond:goto@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:goto\fP +definition. The default value is \fB@@\fP\&. 
+Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:divider\fP +Defines the divider for condition blocks. +The default value is \fB/* *********************************** */\fP\&. +Placeholders are substituted with condition name (see \fBre2c:api:sigil\fP and +\fBre2c:cond:divider@cond\fP). +.TP +.B \fBre2c:cond:divider@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:divider\fP +definition. The default is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:prefix\fP, \fBre2c:condprefix\fP +Specifies the prefix used for condition labels. +The default is \fByyc_\fP\&. +.TP +.B \fBre2c:cond:enumprefix\fP, \fBre2c:condenumprefix\fP +Specifies the prefix used for condition identifiers. +The default is \fByyc\fP\&. +.TP +.B \fBre2c:debug\-output\fP, \fBre2c:flags:debug\-output\fP, \fBre2c:flags:d\fP +Same as the \fB\-\-debug\-output\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:define:YYBACKUP\fP +Defines generic API primitive \fBYYBACKUP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYBACKUPCTX\fP +Defines generic API primitive \fBYYBACKUPCTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYCONDTYPE\fP +Defines \fBYYCONDTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTYPE\fP +Defines \fBYYCTYPE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCTXMARKER\fP +Defines API primitive \fBYYCTXMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYCURSOR\fP +Defines API primitive \fBYYCURSOR\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYDEBUG\fP +Defines API primitive \fBYYDEBUG\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL\fP +Defines API primitive \fBYYFILL\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYFILL@len\fP +Specifies the sigil used for argument substitution in \fBYYFILL\fP +definition.
Defaults to \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYFILL:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for \fBYYFILL\fP\&. +Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETCONDITION\fP +Defines API primitive \fBYYGETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYGETSTATE\fP +Defines API primitive \fBYYGETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYGETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYLESSTHAN\fP +Defines generic API primitive \fBYYLESSTHAN\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYLIMIT\fP +Defines API primitive \fBYYLIMIT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMARKER\fP +Defines API primitive \fBYYMARKER\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGN\fP +Defines generic API primitive \fBYYMTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYMTAGP\fP +Defines generic API primitive \fBYYMTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYPEEK\fP +Defines generic API primitive \fBYYPEEK\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYRESTORE\fP +Defines generic API primitive \fBYYRESTORE\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORECTX\fP +Defines generic API primitive \fBYYRESTORECTX\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYRESTORETAG\fP +Defines generic API primitive \fBYYRESTORETAG\fP (see the API primitives +section). 
+.TP +.B \fBre2c:define:YYSETCONDITION\fP +Defines API primitive \fBYYSETCONDITION\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETCONDITION@cond\fP +Specifies the sigil used for argument substitution in \fBYYSETCONDITION\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYSETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETCONDITION\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSETSTATE\fP +Defines API primitive \fBYYSETSTATE\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSETSTATE@state\fP +Specifies the sigil used for argument substitution in \fBYYSETSTATE\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:define:YYSETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:define:YYSKIP\fP +Defines generic API primitive \fBYYSKIP\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFT\fP +Defines generic API primitive \fBYYSHIFT\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSHIFTMTAG\fP +Defines generic API primitive \fBYYSHIFTMTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSHIFTSTAG\fP +Defines generic API primitive \fBYYSHIFTSTAG\fP (see the API primitives +section). +.TP +.B \fBre2c:define:YYSTAGN\fP +Defines generic API primitive \fBYYSTAGN\fP (see the API primitives section). +.TP +.B \fBre2c:define:YYSTAGP\fP +Defines generic API primitive \fBYYSTAGP\fP (see the API primitives section). +.TP +.B \fBre2c:empty\-class\fP, \fBre2c:flags:empty\-class\fP +Same as the \fB\-\-empty\-class\fP option, but can be configured on per\-block +basis. 
+.TP +.B \fBre2c:encoding:ebcdic\fP, \fBre2c:flags:ecb\fP, \fBre2c:flags:e\fP +Same as the \fB\-\-ebcdic\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:ucs2\fP, \fBre2c:flags:wide\-chars\fP, \fBre2c:flags:w\fP +Same as the \fB\-\-ucs2\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf8\fP, \fBre2c:flags:utf\-8\fP, \fBre2c:flags:8\fP +Same as the \fB\-\-utf8\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf16\fP, \fBre2c:flags:utf\-16\fP, \fBre2c:flags:x\fP +Same as the \fB\-\-utf16\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf32\fP, \fBre2c:flags:unicode\fP, \fBre2c:flags:u\fP +Same as the \fB\-\-utf32\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding\-policy\fP, \fBre2c:flags:encoding\-policy\fP +Same as the \fB\-\-encoding\-policy\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:eof\fP +Specifies the sentinel symbol used with the end\-of\-input rule \fB$\fP\&. The +default value is \fB\-1\fP (\fB$\fP rule is not used). Other possible values +include all valid code units. Only decimal numbers are recognized. +.TP +.B \fBre2c:header\fP, \fBre2c:flags:type\-header\fP, \fBre2c:flags:t\fP +Specifies the name of the generated header file relative to the directory of +the output file. Same as the \fB\-\-header\fP option except that the file path +is relative. +.TP +.B \fBre2c:indent:string\fP +Specifies the string used for indentation. The default is a single tab +character \fB\(dq\et\(dq\fP\&. Indent string should contain whitespace characters only. +To disable indentation entirely, set this configuration to an empty string. +.TP +.B \fBre2c:indent:top\fP +Specifies the minimum amount of indentation to use. The default value is +zero. The value should be a non\-negative integer number. 
+.TP +.B \fBre2c:invert\-captures\fP +Same as the \fB\-\-invert\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:label:prefix\fP, \fBre2c:labelprefix\fP +Specifies the prefix used for DFA state labels. The default is \fByy\fP\&. +.TP +.B \fBre2c:label:start\fP, \fBre2c:startlabel\fP +Controls the generation of a block start label. The default value is zero, +which means that the start label is generated only if it is used. An integer +value greater than zero forces the generation of start label even if it is +unused by the lexer. A string value also forces start label generation and +sets the label name to the specified string. This configuration applies only +to the current block (it is reset to default for the next block). +.TP +.B \fBre2c:label:yyFillLabel\fP +Specifies the prefix of \fBYYFILL\fP labels used with \fBre2c:eof\fP and in +storable state mode. +.TP +.B \fBre2c:label:yyloop\fP +Specifies the name of the label marking the start of the lexer loop with +\fB\-\-loop\-switch\fP option. The default is \fByyloop\fP\&. +.TP +.B \fBre2c:label:yyNext\fP +Specifies the name of the optional label that follows \fBYYGETSTATE\fP switch +in storable state mode (enabled with \fBre2c:state:nextlabel\fP). The default +is \fByyNext\fP\&. +.TP +.B \fBre2c:leftmost\-captures\fP +Same as the \fB\-\-leftmost\-captures\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:lookahead\fP, \fBre2c:flags:lookahead\fP +Deprecated (see the deprecated \fB\-\-no\-lookahead\fP option). +.TP +.B \fBre2c:nested\-ifs\fP, \fBre2c:flags:nested\-ifs\fP, \fBre2c:flags:s\fP +Same as the \fB\-\-nested\-ifs\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:posix\-captures\fP, \fBre2c:flags:posix\-captures\fP, \fBre2c:flags:P\fP +Same as the \fB\-\-posix\-captures\fP option, but can be configured on per\-block +basis. 
+.TP +.B \fBre2c:tags\fP, \fBre2c:flags:tags\fP, \fBre2c:flags:T\fP +Same as the \fB\-\-tags\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:tags:expression\fP +Specifies the expression used for tag variables. +By default re2c generates expressions of the form \fByyt<N>\fP\&. This might +be inconvenient, for example if tag variables are defined as fields in a +struct. All occurrences of \fB@@{tag}\fP or \fB@@\fP are replaced with the +actual tag name. For example, \fBre2c:tags:expression = \(dqs.@@\(dq;\fP results +in expressions of the form \fBs.yyt<N>\fP in the generated code. +See also \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:tags:prefix\fP +Specifies the prefix for tag variable names. The default is \fByyt\fP\&. +.TP +.B \fBre2c:sentinel\fP +Specifies the sentinel symbol used for the end\-of\-input checks (when bounds +checks are disabled with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP is not +set). This configuration does not affect code generation: its purpose is to +verify that the sentinel is not allowed in the middle of a rule, and ensure +that the lexer won\(aqt read past the end of buffer. The default value is +\fB\-1\fP (in that case re2c assumes that the sentinel is zero, which is the +most common case). Only decimal numbers are recognized. +.TP +.B \fBre2c:state:abort\fP +If set to a positive integer value, changes the default case in +\fBYYGETSTATE\fP switch: by default it aborts the program, and an explicit +\fB\-1\fP case contains transition to the start of the block. +.TP +.B \fBre2c:state:nextlabel\fP +Controls if the \fBYYGETSTATE\fP switch is followed by an \fByyNext\fP label +(the default value is zero, which corresponds to no label). +Alternatively one can use \fBre2c:label:start\fP to generate a specific start +label, or an explicit \fBgetstate:re2c\fP directive to generate the +\fBYYGETSTATE\fP switch separately from the lexer block.
+.TP +.B \fBre2c:unsafe\fP, \fBre2c:flags:unsafe\fP +Same as the \fB\-\-no\-unsafe\fP option, but can be configured on per\-block +basis. +If set to zero, it suppresses the generation of \fBunsafe\fP wrappers around +\fBYYPEEK\fP\&. The default is non\-zero (wrappers are generated). +This configuration is specific to Rust. +.TP +.B \fBre2c:variable:yyaccept\fP +Specifies the name of the \fByyaccept\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yybm\fP +Specifies the name of the \fByybm\fP variable (used for bitmaps). +.TP +.B \fBre2c:variable:yybm:hex\fP, \fBre2c:yybm:hex\fP +If set to non\-zero, bitmaps for the \fB\-\-bit\-vectors\fP option are generated +in hexadecimal format. The default is zero (bitmaps are in decimal format). +.TP +.B \fBre2c:variable:yych\fP +Specifies the name of the \fByych\fP variable (see the API primitives +section). +.TP +.B \fBre2c:variable:yych:emit\fP, \fBre2c:yych:emit\fP +If set to zero, \fByych\fP definition is not generated. +The default is non\-zero. +.TP +.B \fBre2c:variable:yych:conversion\fP, \fBre2c:yych:conversion\fP +If set to non\-zero, re2c automatically generates a conversion to \fBYYCTYPE\fP +every time \fByych\fP is read. The default is zero (no conversion). +.TP +.B \fBre2c:variable:yyctable\fP +Specifies the name of the \fByyctable\fP variable (the jump table generated +for \fBYYGETCONDITION\fP switch with \fB\-\-computed\-gotos\fP option). +.TP +.B \fBre2c:variable:yytarget\fP +Specifies the name of the \fByytarget\fP variable. +.TP +.B \fBre2c:variable:yystable\fP +Deprecated. +.TP +.B \fBre2c:variable:yystate\fP +Specifies the name of the \fByystate\fP variable (used with the +\fB\-\-loop\-switch\fP option to store the current DFA state). +.TP +.B \fBre2c:yyfill:check\fP +If set to zero, suppresses the generation of pre\-\fBYYFILL\fP check for the +number of input characters (the \fBYYLESSTHAN\fP definition in generic API and +the \fBYYLIMIT\fP\-based comparison in C pointer API).
The default is non\-zero +(generate the check). +.TP +.B \fBre2c:yyfill:enable\fP +If set to zero, suppresses the generation of \fBYYFILL\fP (together +with the check). This should be used when the whole input fits into one piece +of memory (there is no need for buffering) and the end\-of\-input checks do not +rely on the \fBYYFILL\fP checks (e.g. if a sentinel character is used). +Use warnings (\fB\-W\fP option) and \fBre2c:sentinel\fP configuration to verify +that the generated lexer cannot read past the end of input. +The default is non\-zero (\fBYYFILL\fP is enabled). +.TP +.B \fBre2c:yyfill:parameter\fP +If set to zero, suppresses the generation of parameter passed to \fBYYFILL\fP\&. +The parameter is the minimum number of characters that must be supplied. +Defaults to non\-zero (the parameter is generated). +This configuration can be overridden with \fBre2c:define:YYFILL:naked\fP or +\fBre2c:api:style\fP\&. +.UNINDENT +.SH REGULAR EXPRESSIONS +.sp +re2c uses the following syntax for regular expressions: +.INDENT 0.0 +.IP \(bu 2 +\fB\(dqfoo\(dq\fP case\-sensitive string literal +.IP \(bu 2 +\fB\(aqfoo\(aq\fP case\-insensitive string literal +.IP \(bu 2 +\fB[a\-xyz]\fP, \fB[^a\-xyz]\fP character class (possibly negated) +.IP \(bu 2 +\fB\&.\fP any character except newline +.IP \(bu 2 +\fBR \e S\fP difference of character classes \fBR\fP and \fBS\fP +.IP \(bu 2 +\fBR*\fP zero or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR+\fP one or more occurrences of \fBR\fP +.IP \(bu 2 +\fBR?\fP optional \fBR\fP +.IP \(bu 2 +\fBR{n}\fP repetition of \fBR\fP exactly \fBn\fP times +.IP \(bu 2 +\fBR{n,}\fP repetition of \fBR\fP at least \fBn\fP times +.IP \(bu 2 +\fBR{n,m}\fP repetition of \fBR\fP from \fBn\fP to \fBm\fP times +.IP \(bu 2 +\fB(R)\fP just \fBR\fP; parentheses are used to override precedence. +If submatch extraction is enabled, \fB(R)\fP is a capturing or a +non\-capturing group depending on \fB\-\-invert\-captures\fP option. 
+.IP \(bu 2 +\fB(!R)\fP +If submatch extraction is enabled, \fB(!R)\fP is a non\-capturing or a +capturing group depending on \fB\-\-invert\-captures\fP option. +.IP \(bu 2 +\fBR S\fP concatenation: \fBR\fP followed by \fBS\fP +.IP \(bu 2 +\fBR | S\fP alternative: \fBR or S\fP +.IP \(bu 2 +\fBR / S\fP lookahead: \fBR\fP followed by \fBS\fP, but \fBS\fP is not consumed +.IP \(bu 2 +\fBname\fP the regular expression defined as \fBname\fP (or literal string +\fB\(dqname\(dq\fP in Flex compatibility mode) +.IP \(bu 2 +\fB{name}\fP the regular expression defined as \fBname\fP in Flex +compatibility mode +.IP \(bu 2 +\fB@stag\fP an \fIs\-tag\fP: saves the last input position at which \fB@stag\fP +matches in a variable named \fBstag\fP +.IP \(bu 2 +\fB#mtag\fP an \fIm\-tag\fP: saves all input positions at which \fB#mtag\fP matches +in a variable named \fBmtag\fP +.UNINDENT +.sp +Character classes and string literals may contain the following escape +sequences: \fB\ea\fP, \fB\eb\fP, \fB\ef\fP, \fB\en\fP, \fB\er\fP, \fB\et\fP, \fB\ev\fP, \fB\e\e\fP, +octal escapes \fB\eooo\fP and hexadecimal escapes \fB\exhh\fP, \fB\euhhhh\fP and +\fB\eUhhhhhhhh\fP\&. +.SH HANDLING THE END OF INPUT +.sp +One of the main problems for the lexer is to know when to stop. +There are a few terminating conditions: +.INDENT 0.0 +.IP \(bu 2 +the lexer may match some rule (including default rule \fB*\fP) and come to a +final state +.IP \(bu 2 +the lexer may fail to match any rule and come to a default state +.IP \(bu 2 +the lexer may reach the end of input +.UNINDENT +.sp +The first two conditions terminate the lexer in a \(dqnatural\(dq way: it comes to a +state with no outgoing transitions, and the matching automatically stops. The +third condition, end of input, is different: it may happen in any state, and the +lexer should be able to handle it. 
Checking for the end of input interrupts the +normal lexer workflow and adds conditional branches to the generated program, +therefore it is necessary to minimize the number of such checks. re2c supports a +few different methods for handling the end of input. Which one to use depends on +the complexity of regular expressions, the need for buffering, performance +considerations and other factors. Here is a list of methods: +.INDENT 0.0 +.IP \(bu 2 +\fBSentinel.\fP +This method eliminates the need for the end of input checks altogether. It is +simple and efficient, but limited to the case when there is a natural +\(dqsentinel\(dq character that can never occur in valid input. This character may +still occur in invalid input, but it should not be allowed by the regular +expressions, except perhaps as the last character of a rule. The sentinel is +appended at the end of input and serves as a stop signal: when the lexer reads +this character, it is either a syntax error or the end of input. In both +cases the lexer should stop. This method is used if \fBYYFILL\fP is disabled +with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP has the default value +\fB\-1\fP\&. +.nf + +.fi +.sp +.IP \(bu 2 +\fBSentinel with bounds checks.\fP +This method is generic: it allows to handle any input without restrictions on +the regular expressions. The idea is to reduce the number of end of input +checks by performing them only on certain characters. Similar to the +\(dqsentinel\(dq method, one of the characters is chosen as a \(dqsentinel\(dq and +appended at the end of input. However, there is no restriction on where the +sentinel may occur (in fact, any character can be chosen for a sentinel). +When the lexer reads this character, it additionally performs a bounds check. +If the current position is within bounds, the lexer resumes matching and +handles the sentinel as a regular character. Otherwise it invokes \fBYYFILL\fP +(unless it is disabled). 
If more input is supplied, the lexer will rematch the +last character and continue as if the sentinel wasn\(aqt there. Otherwise it must +be the real end of input, and the lexer stops. This method is used when +\fBre2c:eof\fP has non\-negative value (it should be set to the numeric value of +the sentinel). \fBYYFILL\fP is optional. +.nf + +.fi +.sp +.IP \(bu 2 +\fBBounds checks with padding.\fP +This method is generic, and it may be faster than the \(dqsentinel with bounds +checks\(dq method, but it is also more complex. The idea is to partition DFA +states into strongly connected components (SCCs) and generate a single check +per SCC for enough characters to cover the longest non\-looping path in this +SCC. This reduces the number of checks, but there is a problem with short +lexemes at the end of input, as the check requires enough characters to cover +the longest lexeme. This can be fixed by padding the input with a few fake +characters that do not form a valid lexeme suffix (so that the lexer cannot +match them). The length of padding should be \fBYYMAXFILL\fP, generated with +\fB/*!max:re2c*/\fP\&. If there is not enough input, the lexer invokes \fBYYFILL\fP +which should supply at least the required number of characters or not return. +This method is used if \fBYYFILL\fP is enabled and \fBre2c:eof\fP is \fB\-1\fP +(this is the default configuration). +.nf + +.fi +.sp +.IP \(bu 2 +\fBCustom checks.\fP +Generic API allows to override basic operations like reading a character, +which makes it possible to include the end\-of\-input checks as part of them. +This approach is error\-prone and should be used with caution. To use a custom +method, enable generic API with \fB\-\-api custom\fP or \fBre2c:api = custom;\fP and +disable default bounds checks with \fBre2c:yyfill:enable = 0;\fP or +\fBre2c:yyfill:check = 0;\fP\&. +.UNINDENT +.sp +The following subsections contain an example of each method. 
+.SS Sentinel +.sp +This example uses a sentinel character to handle the end of input. The program +counts space\-separated words in a null\-terminated string. The sentinel is null: +it is the last character of each input string, and it is not allowed in the +middle of a lexeme by any of the rules (in particular, it is not included in +character ranges where it is easy to overlook). If a null occurs in the middle +of a string, it is a syntax error and the lexer will match default rule \fB*\fP, +but it won\(aqt read past the end of input or crash (use +\fI\%\-Wsentinel\-in\-midrule\fP +warning and \fBre2c:sentinel\fP configuration to verify this). Configuration +\fBre2c:yyfill:enable = 0;\fP suppresses the generation of bounds checks and +\fBYYFILL\fP invocations. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +// Expects a null\-terminated string. +fn lex(yyinput: [:0]const u8) i32 { + var yycursor: u32 = 0; + var count: i32 = 0; + + loop: while (true) { + %{ + re2c:yyfill:enable = 0; + + * { return \-1; } + [\ex00] { return count; } + [a\-z]+ { count += 1; continue :loop; } + [ ]+ { continue :loop; } + %} + } +} + +test { + try std.testing.expectEqual(lex(\(dq\(dq), 0); + try std.testing.expectEqual(lex(\(dqone two three\(dq), 3); + try std.testing.expectEqual(lex(\(dqf0ur\(dq), \-1); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Sentinel with bounds checks +.sp +This example uses sentinel with bounds checks to handle the end of input (this +method was added in version 1.2). The program counts space\-separated +single\-quoted strings. The sentinel character is null, which is specified with +\fBre2c:eof = 0;\fP configuration. As in the \fI\%sentinel\fP method, null is the last +character of each input string, but it is allowed in the middle of a rule (for +example, \fB\(aqaaa\e0aa\(aq\e0\fP is valid input, but \fB\(aqaaa\e0\fP is a syntax error). 
+Bounds checks are generated in each state that matches an input character, but +they are scoped to the branch that handles null. Bounds checks are of the form +\fBYYLIMIT <= YYCURSOR\fP or \fBYYLESSTHAN(1)\fP with generic API. If the check +condition is true, the lexer has reached the end of input and should stop +(\fBYYFILL\fP is disabled with \fBre2c:yyfill:enable = 0;\fP as the input fits into +one buffer, see the \fI\%YYFILL with sentinel\fP section for an example that uses +\fBYYFILL\fP). Reaching the end of input opens three possibilities: if the lexer +is in the initial state it will match the end\-of\-input rule \fB$\fP, otherwise it +may fall back to a previously matched rule (including default rule \fB*\fP) or go +to a default state, causing +\fI\%\-Wundefined\-control\-flow\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +// Expects a null\-terminated string. +fn lex(yyinput: [:0]const u8) i32 { + var yycursor: usize = 0; + var yymarker: usize = 0; + const yylimit: usize = yyinput.len; // points at the terminating null + var count: i32 = 0; + + loop: while (true) { + %{ + re2c:yyfill:enable = 0; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1; } + $ { return count; } + str { count += 1; continue :loop; } + [ ]+ { continue :loop; } + %} + } +} + +test { + try std.testing.expectEqual(lex(\(dq\(dq), 0); + try std.testing.expectEqual(lex(\(dq\(aqqu\ex00tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq), 3); + try std.testing.expectEqual(lex(\(dq\(aqunterminated\e\e\(aq\(dq), \-1); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Bounds checks with padding +.sp +This example uses bounds checks with padding to handle the end of input (this +method is enabled by default). The program counts space\-separated single\-quoted +strings.
There is a padding of \fBYYMAXFILL\fP null characters appended at the end +of input, where \fBYYMAXFILL\fP value is autogenerated with \fB/*!max:re2c*/\fP\&. It +is not necessary to use null for padding \-\-\- any characters can be used as long +as they do not form a valid lexeme suffix (in this example padding should not +contain single quotes, as they may be mistaken for a suffix of a single\-quoted +string). There is a \(dqstop\(dq rule that matches the first padding character (null) +and terminates the lexer (note that it checks if null is at the beginning of +padding, otherwise it is a syntax error). Bounds checks are generated only in +some states that are determined by the strongly connected components of the +underlying automaton. Checks have the form \fB(YYLIMIT \- YYCURSOR) < n\fP or +\fBYYLESSTHAN(n)\fP with generic API, where \fBn\fP is the minimum number of +characters that are needed for the lexer to proceed (it also means that the next +bounds check will occur in at most \fBn\fP characters). If the check condition is +true, the lexer has reached the end of input and will invoke \fBYYFILL(n)\fP that +should either supply at least \fBn\fP input characters or not return. In this +example \fBYYFILL\fP always fails and terminates the lexer with an error (which is +fine because the input fits into one buffer). See the \fI\%YYFILL with padding\fP +section for an example that refills the input buffer with \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +%{max %} + +fn lex(str: []const u8) !i32 { + // Create a copy of the input string padded with yymaxfill zeroes at the end. 
+ var yyinput = try std.testing.allocator.alloc(u8, str.len + yymaxfill); + defer std.testing.allocator.free(yyinput); + std.mem.copy(u8, yyinput[0..], str); + std.mem.copy(u8, yyinput[str.len..], &[_]u8{0} ** yymaxfill); // zero padding + + var yycursor: usize = 0; + var yylimit: usize = yyinput.len; + var count: i32 = 0; + + loop: while (true) { + %{ + re2c:define:YYFILL = \(dqreturn \-1;\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // Check that it is the sentinel, not some unexpected null. + return if (yycursor \- 1 == str.len) count else \-1; + } + str { count += 1; continue :loop; } + [ ]+ { continue :loop; } + * { return \-1; } + %} + } +} + +test { + try std.testing.expectEqual(lex(\(dq\(dq), 0); + try std.testing.expectEqual(lex(\(dq\(aqqu\ex00tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq), 3); + try std.testing.expectEqual(lex(\(dq\(aqunterminated\e\e\(aq\(dq), \-1); + try std.testing.expectEqual(lex(\(dq\(aqunexpected \ex00 null\e\e\(aq\(dq), \-1); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Custom checks +.sp +This example uses a custom end\-of\-input handling method based on generic API. +The program counts space\-separated single\-quoted strings. It is the same as the +\fI\%sentinel\fP example, except that the input is not null\-terminated. To cover up +for the absence of a sentinel character at the end of input, \fBYYPEEK\fP is +redefined to perform a bounds check before it reads the next input character. +This is inefficient because checks are done very often. If the check condition +fails, \fBYYPEEK\fP returns the real character, otherwise it returns a fake +sentinel character. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +// Expects a string without terminating null. 
+fn lex(str: []const u8) i32 { + var cur: usize = 0; + var count: i32 = 0; + + loop: while (true) { + %{ + re2c:api = generic; + re2c:yyfill:enable = 0; + // YYPEEK returns \(dqfake\(dq terminating null if cursor has reached limit. + re2c:define:YYPEEK = \(dqif (cur >= str.len) 0 else str[cur]\(dq; + re2c:define:YYSKIP = \(dqcur += 1;\(dq; + + * { return \-1; } + [\ex00] { return count; } + [a\-z]+ { count += 1; continue :loop; } + [ ]+ { continue :loop; } + %} + } +} + +test { + try std.testing.expectEqual(lex(\(dq\(dq), 0); + try std.testing.expectEqual(lex(\(dqone two three\(dq), 3); + try std.testing.expectEqual(lex(\(dqf0ur\(dq), \-1); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH BUFFER REFILLING +.sp +The need for buffering arises when the input cannot be mapped in memory all at +once: either it is too large, or it comes in a streaming fashion (like reading +from a socket). The usual technique in such cases is to allocate a fixed\-sized +memory buffer and process input in chunks that fit into the buffer. When the +current chunk is processed, it is moved out and new data is moved in. 
In +practice it is somewhat more complex, because lexer state consists not of a +single input position, but a set of interrelated positions: +.INDENT 0.0 +.IP \(bu 2 +cursor: the next input character to be read (\fBYYCURSOR\fP in C pointer API or +\fBYYSKIP\fP/\fBYYPEEK\fP in generic API) +.IP \(bu 2 +limit: the position after the last available input character (\fBYYLIMIT\fP in +C pointer API, implicitly handled by \fBYYLESSTHAN\fP in generic API) +.IP \(bu 2 +marker: the position of the most recent match, if any (\fBYYMARKER\fP in default +API or \fBYYBACKUP\fP/\fBYYRESTORE\fP in generic API) +.IP \(bu 2 +token: the start of the current lexeme (implicit in re2c API, as it is not +needed for the normal lexer operation and can be defined and updated by the +user) +.IP \(bu 2 +context marker: the position of the trailing context (\fBYYCTXMARKER\fP in +C pointer API or \fBYYBACKUPCTX\fP/\fBYYRESTORECTX\fP in generic API) +.IP \(bu 2 +tag variables: submatch positions (defined with \fB/*!stags:re2c*/\fP and +\fB/*!mtags:re2c*/\fP directives and +\fBYYSTAGP\fP/\fBYYSTAGN\fP/\fBYYMTAGP\fP/\fBYYMTAGN\fP in generic API) +.UNINDENT +.sp +Not all these are used in every case, but if used, they must be updated by +\fBYYFILL\fP\&. All active positions are contained in the segment between token and +cursor, therefore everything between buffer start and token can be discarded, +the segment from token and up to limit should be moved to the beginning of +buffer, and the free space at the end of buffer should be filled with new data. +In order to avoid frequent \fBYYFILL\fP calls it is best to fill in as many input +characters as possible (even though fewer characters might suffice to resume the +lexer). The details of \fBYYFILL\fP implementation are slightly different +depending on which EOF handling method is used: the case of EOF rule is somewhat +simpler than the case of bounds\-checking with padding. 
Also note that if +\fB\-f \-\-storable\-state\fP option is used, \fBYYFILL\fP has slightly different +semantics (described in the section about storable state). +.SS YYFILL with sentinel +.sp +If EOF rule is used, \fBYYFILL\fP is a function\-like primitive that accepts +no arguments and returns a value which is checked against zero. \fBYYFILL\fP +invocation is triggered by condition \fBYYLIMIT <= YYCURSOR\fP in C pointer API and +\fBYYLESSTHAN()\fP in generic API. A non\-zero return value means that \fBYYFILL\fP +has failed. A successful \fBYYFILL\fP call must supply at least one character and +adjust input positions accordingly. Limit must always be set to one after the +last input position in buffer, and the character at the limit position must be +the sentinel symbol specified by \fBre2c:eof\fP configuration. The pictures below +show the relative locations of input positions in buffer before and after +\fBYYFILL\fP call (sentinel symbol is marked with \fB#\fP, and the second picture +shows the case when there is not enough input to fill the whole buffer). +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-\-\-\-\-\-\-\-\-\-E\-> + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-\-\-\-\-\-\-\-\-\-E#\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-E (EOF) + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-E#........ + buffer, marker cursor limit + token +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses EOF rule. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +const bufsize = 4095; + +const State = struct { + yyinput: [bufsize + 1]u8, + yycursor: usize, + yymarker: usize, + yylimit: usize, + token: usize, + eof: bool +}; + +fn fill(st: *State, file: anytype) i32 { + if (st.eof) { return \-1; } // unexpected EOF + + // Error: lexeme too long. In real life can reallocate a larger buffer. + if (st.token < 1) { return \-2; } + + // Shift buffer contents (discard everything up to the current token). + std.mem.copyBackwards( + u8, st.yyinput[0..st.yylimit \- st.token], st.yyinput[st.token..st.yylimit]); + st.yycursor \-= st.token; + st.yymarker = @subWithOverflow(st.yymarker, st.token)[0]; + st.yylimit \-= st.token; + st.token = 0; + + // Fill free space at the end of buffer with new data from file. + st.yylimit += file.read(st.yyinput[st.yylimit..bufsize]) catch 0; + st.yyinput[st.yylimit] = 0; // append sentinel symbol + + // If read less than expected, this is the end of input. + st.eof = st.yylimit < bufsize; + + return 0; +} + +fn lex(yyrecord: *State, file: anytype) i32 { + var count: i32 = 0; + loop: while (true) { + yyrecord.token = yyrecord.yycursor; + %{ + re2c:api = record; + re2c:eof = 0; + re2c:define:YYFILL = \(dqfill(yyrecord, file) == 0\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1; } + $ { return count; } + str { count += 1; continue :loop; } + [ ]+ { continue :loop; } + %} + } +} + +test { + const fname = \(dqinput\(dq; + const content = \(dq\(aqqu\ex00tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq ** bufsize; + const count = 3 * bufsize; // number of quoted strings written to file + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + var fw = try std.fs.cwd().createFile(fname, .{}); + try fw.writeAll(content); + fw.close(); + + // Prepare lexer state: all offsets are at the end of buffer. 
+ var fr = try std.fs.cwd().openFile(fname, .{ .mode = .read_only}); + // Normally file would be part of the state struct, but BufferedReader type is unclear. + var br = std.io.bufferedReader(fr.reader()); + var st = State{ + .yyinput = undefined, + .yycursor = bufsize, + .yymarker = bufsize, + .yylimit = bufsize, + .token = bufsize, + .eof = false, + }; + // Sentinel at \(gayylimit\(ga offset is set to zero, which triggers YYFILL. + st.yyinput[st.yylimit] = 0; + + // Run the lexer. + try std.testing.expectEqual(lex(&st, &br), count); + + // Cleanup: remove input file. + fr.close(); + try std.fs.cwd().deleteFile(fname); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS YYFILL with padding +.sp +In the default case (when EOF rule is not used) \fBYYFILL\fP is a function\-like +primitive that accepts a single argument and does not return any value. +\fBYYFILL\fP invocation is triggered by condition \fB(YYLIMIT \- YYCURSOR) < n\fP in +C pointer API and \fBYYLESSTHAN(n)\fP in generic API. The argument passed to +\fBYYFILL\fP is the minimal number of characters that must be supplied. If it +fails to do so, \fBYYFILL\fP must not return to the lexer (for that reason it is +best implemented as a macro that returns from the calling function on failure). +In case of a successful \fBYYFILL\fP invocation the limit position must be set +either to one after the last input position in buffer, or to the end of +\fBYYMAXFILL\fP padding (in case \fBYYFILL\fP has successfully read at least \fBn\fP +characters, but not enough to fill the entire buffer). The pictures below show +the relative locations of input positions in buffer before and after \fBYYFILL\fP +invocation (\fBYYMAXFILL\fP padding on the second picture is marked with \fB#\fP +symbols). 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F (EOF) + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F############### + buffer, marker cursor limit + token <\- YYMAXFILL \-> +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses bounds\-checking with padding. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +%{max %} +const bufsize = 4096; + +const State = struct { + yyinput: [bufsize + yymaxfill]u8, + yycursor: usize, + yymarker: usize, + yylimit: usize, + token: usize, + eof: bool +}; + +fn fill(st: *State, need: usize, file: anytype) i32 { + if (st.eof) { return \-1; } // unexpected EOF + + // Error: lexeme too long. In real life can reallocate a larger buffer. + if (st.token < need) { return \-2; } + + // Shift buffer contents (discard everything up to the current token). + std.mem.copyBackwards( + u8, st.yyinput[0..st.yylimit \- st.token], st.yyinput[st.token..st.yylimit]); + st.yycursor \-= st.token; + st.yymarker = @subWithOverflow(st.yymarker, st.token)[0]; + st.yylimit \-= st.token; + st.token = 0; + + // Fill free space at the end of buffer with new data from file. + st.yylimit += file.read(st.yyinput[st.yylimit..bufsize]) catch 0; + + // If read less than expected, this is the end of input. 
+ if (st.yylimit < bufsize) { + st.eof = true; + @memset(st.yyinput[st.yylimit..st.yylimit + yymaxfill], 0); + st.yylimit += yymaxfill; + } + + return 0; +} + +fn lex(yyrecord: *State, file: anytype) i32 { + var count: i32 = 0; + loop: while (true) { + yyrecord.token = yyrecord.yycursor; + %{ + re2c:api = record; + re2c:define:YYFILL = \(dq{ if (fill(yyrecord, @@, file) != 0) return \-2; }\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // Check that it is the sentinel, not some unexpected null. + return if (yyrecord.token == yyrecord.yylimit \- yymaxfill) count else \-1; + } + str { count += 1; continue :loop; } + [ ]+ { continue :loop; } + * { return \-1; } + %} + } +} + +test { + const fname = \(dqinput\(dq; + const content = \(dq\(aqqu\ex00tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq ** bufsize; + const count = 3 * bufsize; // number of quoted strings written to file + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + var fw = try std.fs.cwd().createFile(fname, .{}); + try fw.writeAll(content); + fw.close(); + + // Prepare lexer state: all offsets are at the end of buffer. + // This immediately triggers YYFILL, as the YYLESSTHAN condition is true. + var fr = try std.fs.cwd().openFile(fname, .{ .mode = .read_only}); + // Normally file would be part of the state struct, but BufferedReader type is unclear. + var br = std.io.bufferedReader(fr.reader()); + var st = State{ + .yyinput = undefined, + .yycursor = bufsize, + .yymarker = bufsize, + .yylimit = bufsize, + .token = bufsize, + .eof = false, + }; + @memset(st.yyinput[st.yylimit..st.yylimit + yymaxfill], 0); // zero\-padding at the end + + // Run the lexer. + try std.testing.expectEqual(lex(&st, &br), count); + + // Cleanup: remove input file. 
+ fr.close(); + try std.fs.cwd().deleteFile(fname); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH MULTIPLE BLOCKS +.sp +Sometimes it is necessary to have multiple interrelated lexers (for example, if +there is a high\-level state machine that transitions between lexer modes). This +can be implemented using multiple connected re2c blocks. Another option is to +use \fI\%start conditions\fP\&. +.sp +The implementation of connections between blocks depends on the target language. +In languages that have \fBgoto\fP statement (such as C/C++ and Go) one can have +all blocks in one function, each of them prefixed with a label. Transition from +one block to another is a simple \fBgoto\fP\&. +In languages that do not have \fBgoto\fP (such as Rust) it is necessary to use a +loop with a switch on a state variable, similar to the \fByystate\fP loop/switch +generated by re2c, or else wrap each block in a function and use function calls. +.sp +The example below uses multiple blocks to parse binary, octal, decimal and +hexadecimal numbers. Each base has its own block. The initial block determines +base and dispatches to other blocks. Common configurations are defined in a +separate block at the beginning of the program; they are inherited by the other +blocks. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +// Store u32 number in u64 during parsing to simplify overflow handling. +const State = struct { + yyinput: [:0]const u8, + yycursor: usize, + yymarker: usize, + num: u64, +}; + +// Common re2c definitions shared between all functions. +%{ + re2c:api = record; + re2c:variable:yyrecord = st; + re2c:yyfill:enable = 0; +%} + +const ERROR: u64 = @as(u64, std.math.maxInt(u32)) + 1; // overflow + +// Add digit with the given base, checking for overflow.
+fn add(st: *State, offs: u8, base: u64) void { + const digit = st.yyinput[st.yycursor \- 1] \- offs; + st.num = @min(st.num * base + digit, ERROR); +} + +// Convert u64 to optional u32 (null meaning overflow or parse error). +fn maybeU32(num: u64) ?u32 { + return if (num < ERROR) @intCast(num) else null; +} + +fn parse_u32(s: [:0]const u8) ?u32 { + var st = State {.yyinput = s, .yycursor = 0, .yymarker = 0, .num = 0}; + %{ + \(aq0b\(aq / [01] { return parse_bin(&st); } + \(dq0\(dq { return parse_oct(&st); } + \(dq\(dq / [1\-9] { return parse_dec(&st); } + \(aq0x\(aq / [0\-9a\-fA\-F] { return parse_hex(&st); } + * { return null; } + %} +} + +fn parse_bin(st: *State) ?u32 { + bin: while (true) {%{ + [01] { add(st, 48, 2); continue :bin; } + * { return maybeU32(st.num); } + %}} +} + +fn parse_oct(st: *State) ?u32 { + oct: while (true) {%{ + [0\-7] { add(st, 48, 8); continue :oct; } + * { return maybeU32(st.num); } + %}} +} + +fn parse_dec(st: *State) ?u32 { + dec: while (true) {%{ + [0\-9] { add(st, 48, 10); continue :dec; } + * { return maybeU32(st.num); } + %}} +} + +fn parse_hex(st: *State) ?u32 { + hex: while (true) {%{ + [0\-9] { add(st, 48, 16); continue :hex; } + [a\-f] { add(st, 87, 16); continue :hex; } + [A\-F] { add(st, 55, 16); continue :hex; } + * { return maybeU32(st.num); } + %}} +} + +test { + try std.testing.expectEqual(parse_u32(\(dq\(dq), null); + try std.testing.expectEqual(parse_u32(\(dq1234567890\(dq), 1234567890); + try std.testing.expectEqual(parse_u32(\(dq0b1101\(dq), 13); + try std.testing.expectEqual(parse_u32(\(dq0x7Fe\(dq), 2046); + try std.testing.expectEqual(parse_u32(\(dq0644\(dq), 420); + try std.testing.expectEqual(parse_u32(\(dq9999999999\(dq), null); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH START CONDITIONS +.sp +Start conditions are enabled with \fB\-\-start\-conditions\fP option. They provide a +way to encode multiple interrelated automata within the same re2c block. 
+.sp +Each condition corresponds to a single automaton and has a unique name specified +by the user and a unique internal number defined by re2c. The numbers are used +to switch between conditions: the generated code uses \fBYYGETCONDITION\fP and +\fBYYSETCONDITION\fP primitives to get the current condition or set it to the +given number. Use \fB/*!conditions:re2c*/\fP directive or the \fB\-\-header\fP option +to generate numeric condition identifiers. Configuration +\fBre2c:cond:enumprefix\fP specifies the generated identifier prefix. +.sp +In condition mode every rule must be prefixed with a list of comma\-separated +condition names in angle brackets, or a wildcard \fB<*>\fP to denote all +conditions. The rule syntax is extended as follows: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB< cond\-list > regexp action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp => cond action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP, sets the current condition to \fBcond\fP and +executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp :=> cond\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and immediately transitions to \fBcond\fP (there is +no semantic action). +.TP +.B \fB<! cond\-list > action\fP +The \fBaction\fP is prepended to semantic actions of all rules for every +condition on the \fBcond\-list\fP\&. This may be used to deduplicate common +code. +.TP +.B \fB< > action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and executes the \fBaction\fP\&. +.TP +.B \fB< > => cond action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&.
It matches empty string, sets the current condition to +\fBcond\fP and executes the \fBaction\fP\&. +.TP +.B \fB< > :=> cond\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and immediately transitions to +\fBcond\fP\&. +.UNINDENT +.UNINDENT +.UNINDENT +.sp +The code re2c generates for conditions depends on whether re2c uses goto/label +approach or loop/switch approach to encode the automata. +.sp +In languages that have \fBgoto\fP statement (such as C/C++ and Go) conditions are +naturally implemented as blocks of code prefixed with labels of the form +\fByyc_<cond>\fP, where \fBcond\fP is a condition name (label prefix can be changed +with \fBre2c:cond:prefix\fP). Transitions between conditions are implemented using +\fBgoto\fP and condition labels. Before all conditions re2c generates an initial +switch on \fBYYGETCONDITION\fP that jumps to the start state of the current condition. +The shortcut rules \fB:=>\fP bypass the initial switch and jump directly to the +specified condition (\fBre2c:cond:goto\fP can be used to change the default +behavior). The rules with semantic actions do not automatically jump to the next +condition; this should be done by the user\-defined action code. +.sp +In languages that do not have \fBgoto\fP (such as Rust) re2c reuses the +\fByystate\fP variable to store condition numbers. Each condition gets a numeric +identifier equal to the number of its start state, and a switch between +conditions is no different than a switch between DFA states of a single +condition. There is no need for a separate initial condition switch. +(Since the same approach is used to implement storable states, +\fBYYGETCONDITION\fP/\fBYYSETCONDITION\fP are redundant if both storable states and +conditions are used). +.sp +The program below uses start conditions to parse binary, octal, decimal and +hexadecimal numbers.
There is a single block where each base has its own +condition, and the initial condition is connected to all of them. User\-defined +variable \fBcond\fP stores the current condition number; it is initialized to the +number of the initial condition generated with \fB/*!conditions:re2c*/\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT \-c + +const std = @import(\(dqstd\(dq); + +%{conditions %} + +const ERROR: u64 = @as(u64, std.math.maxInt(u32)) + 1; // overflow + +// Add digit with the given base, checking for overflow. +fn add(num: *u64, str: [:0]const u8, cur: usize, offs: u8, base: u64) void { + num.* = @min(num.* * base + (str[cur \- 1] \- offs), ERROR); +} + +fn parse_u32(yyinput: [:0]const u8) ?u32 { + var yycursor: usize = 0; + var yymarker: usize = 0; + var yycond = yycinit; + var num: u64 = 0; // Store number in u64 to simplify overflow checks. + + loop: while(true) { + %{ + re2c:yyfill:enable = 0; + + \(aq0b\(aq / [01] :=> bin + \(dq0\(dq :=> oct + \(dq\(dq / [1\-9] :=> dec + \(aq0x\(aq / [0\-9a\-fA\-F] :=> hex + * { return null; } + + [01] { add(&num, yyinput, yycursor, 48, 2); continue :loop; } + [0\-7] { add(&num, yyinput, yycursor, 48, 8); continue :loop; } + [0\-9] { add(&num, yyinput, yycursor, 48, 10); continue :loop; } + [0\-9] { add(&num, yyinput, yycursor, 48, 16); continue :loop; } + [a\-f] { add(&num, yyinput, yycursor, 87, 16); continue :loop; } + [A\-F] { add(&num, yyinput, yycursor, 55, 16); continue :loop; } + + * { + return if (num < ERROR) @intCast(num) else null; + } + %}} +} + +test { + try std.testing.expectEqual(parse_u32(\(dq\(dq), null); + try std.testing.expectEqual(parse_u32(\(dq1234567890\(dq), 1234567890); + try std.testing.expectEqual(parse_u32(\(dq0b1101\(dq), 13); + try std.testing.expectEqual(parse_u32(\(dq0x7Fe\(dq), 2046); + try std.testing.expectEqual(parse_u32(\(dq0644\(dq), 420); + try std.testing.expectEqual(parse_u32(\(dq9999999999\(dq), null); +} + +.ft P +.fi +.UNINDENT +.UNINDENT 
+.SH STORABLE STATE +.sp +With \fB\-\-storable\-state\fP option re2c generates a lexer that can store +its current state, return to the caller, and later resume operations exactly +where it left off. The default mode of operation in re2c is a \(dqpull\(dq model, +in which the lexer \(dqpulls\(dq more input whenever it needs it. This may be +unacceptable in cases when the input becomes available piece by piece (for +example, if the lexer is invoked by the parser, or if the lexer program +communicates via a socket protocol with some other program that must wait for a +reply from the lexer before it transmits the next message). Storable state +feature is intended exactly for such cases: it allows one to generate lexers that +work in a \(dqpush\(dq model. When the lexer needs more input, it stores its state and +returns to the caller. Later, when more input becomes available, the caller +resumes the lexer exactly where it stopped. There are a few changes necessary +compared to the \(dqpull\(dq model: +.INDENT 0.0 +.IP \(bu 2 +Define \fBYYGETSTATE()\fP and \fBYYSETSTATE(state)\fP primitives. +.IP \(bu 2 +Define \fByych\fP, \fByyaccept\fP (if used) and \fBstate\fP variables as a part of +persistent lexer state. The \fBstate\fP variable should be initialized to \fB\-1\fP\&. +.IP \(bu 2 +\fBYYFILL\fP should return to the outer program instead of trying to supply more +input. Return code should indicate that lexer needs more input. +.IP \(bu 2 +The outer program should recognize situations when lexer needs more input and +respond appropriately. +.IP \(bu 2 +Optionally use \fBgetstate:re2c\fP to generate \fBYYGETSTATE\fP switch detached +from the main lexer. This only works for languages that have \fBgoto\fP (not in +\fB\-\-loop\-switch\fP mode). +.IP \(bu 2 +Use \fBre2c:eof\fP and the \fI\%sentinel with bounds checks\fP method to handle the +end of input.
Padding\-based method may not work because it is unclear when to +append padding: the current end of input may not be the ultimate end of input, +and appending padding too early may cut off a partially read greedy lexeme. +Furthermore, due to high\-level program logic getting more input may depend on +processing the lexeme at the end of buffer (which already is blocked due to +the end\-of\-input condition). +.UNINDENT +.sp +Here is an example of a \(dqpush\(dq model lexer that simulates reading packets from a +socket. The lexer loops until it encounters the end of input and returns to the +calling function. The calling function provides more input by \(dqsending\(dq the next +packet and resumes lexing. This process stops when all the packets have been +sent, or when there is an error. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT \-f + +const std = @import(\(dqstd\(dq); + +const Status = enum { + end, + ready, + waiting, + bad_packet, + big_packet +}; + +// Use a small buffer to cover the case when a lexeme doesn\(aqt fit. +// In real world use a larger buffer. +const bufsize = 10; + +const State = struct { + yyinput: [bufsize + 1]u8, + yycursor: usize, + yymarker: usize, + yylimit: usize, + token: usize, + yystate: i32, + received: usize, +}; + +fn fill(st: *State, file: anytype) Status { + // Error: lexeme too long. In real life can reallocate a larger buffer. + if (st.token < 1) { return Status.big_packet; } + + // Shift buffer contents (discard everything up to the current token). + std.mem.copyBackwards( + u8, st.yyinput[0..st.yylimit \- st.token], st.yyinput[st.token..st.yylimit]); + st.yycursor \-= st.token; + st.yymarker = @subWithOverflow(st.yymarker, st.token)[0]; + st.yylimit \-= st.token; + st.token = 0; + + // Fill free space at the end of buffer with new data from file. 
+ st.yylimit += file.read(st.yyinput[st.yylimit..bufsize]) catch 0; + st.yyinput[st.yylimit] = 0; // append sentinel symbol + + return Status.ready; +} + +fn lex(yyrecord: *State) Status { + var yych: u8 = 0; + loop: while (true) { + yyrecord.token = yyrecord.yycursor; + %{ + re2c:api = record; + re2c:eof = 0; + re2c:define:YYFILL = \(dqreturn Status.waiting;\(dq; + + packet = [a\-z]+[;]; + + * { return Status.bad_packet; } + $ { return Status.end; } + packet { yyrecord.received += 1; continue :loop; } + %} + } +} + +fn run(expect: Status, packets: []const []const u8) !void { + // Create a \(dqpipe\(dq (open the same file for reading and writing). + const fname = \(dqinput\(dq; + var fw = try std.fs.cwd().createFile(fname, .{}); + var fr = try std.fs.cwd().openFile(fname, .{ .mode = .read_only}); + + // Initialize lexer state: \(gastate\(ga value is \-1, all offsets are at the end + // of buffer. Normally file would be part of the state, but BufferedReader + // type is unclear. + var br = std.io.bufferedReader(fr.reader()); + var st = State{ + .yyinput = undefined, + .yycursor = bufsize, + .yymarker = bufsize, + .yylimit = bufsize, + .token = bufsize, + .yystate = \-1, + .received = 0, + }; + // Sentinel at \(gayylimit\(ga offset is set to zero, which triggers YYFILL. + st.yyinput[st.yylimit] = 0; + + // Main loop. The buffer contains incomplete data which appears packet by + // packet. When the lexer needs more input it saves its internal state and + // returns to the caller which should provide more input and resume lexing. 
+ var status = Status.ready; + var send: usize = 0; + while (true) { + status = lex(&st); + if (status == Status.end) { + break; + } else if (status == Status.waiting) { + if (send < packets.len) { + std.log.debug(\(dqsending packet {}\(dq, .{send}); + try fw.writeAll(packets[send]); + send += 1; + } + status = fill(&st, &br); + std.log.debug(\(dqfilled buffer [{s}], status {}\(dq, .{st.yyinput, status}); + if (status != Status.ready) { + break; + } + } else if (status == Status.bad_packet) { + break; + } + } + + // Check results. + try std.testing.expectEqual(status, expect); + if (status == Status.end) { try std.testing.expectEqual(st.received, send); } + + // Cleanup: remove input file. + fw.close(); + fr.close(); + try std.fs.cwd().deleteFile(fname); +} + +test { + try run(Status.end, &[_][]const u8{}); + try run(Status.end, &[_][]const u8{\(dqzero;\(dq, \(dqone;\(dq, \(dqtwo;\(dq, \(dqthree;\(dq, \(dqfour;\(dq}); + try run(Status.bad_packet, &[_][]const u8{\(dq??;\(dq}); + try run(Status.big_packet, &[_][]const u8{\(dqlooooooooooooong;\(dq}); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH REUSABLE BLOCKS +.sp +Reusable blocks are re2c blocks that can be reused any number of times and +combined with other re2c blocks. They are defined with +\fB/*!rules:re2c[:] ... */\fP (the \fB\fP is optional). A rules block +can be used in two contexts: either in a use block, or in a use directive inside +of another block. The code for a rules block is generated at every point of use. +.sp +Use blocks are defined with \fB/*!use:re2c[:] ... */\fP\&. The \fB\fP +is optional; if not specified, the associated rules block is the most recent one +(whether named or unnamed). A use block can add named definitions, +configurations and rules of its own. +An important use case for use blocks is a lexer that supports multiple input +encodings: the same rules block is reused multiple times with encoding\-specific +configurations (see the example below). 
+.sp +In\-block use directive \fB!use:<name>;\fP can be used from inside of a re2c +block. It merges the referenced block \fB<name>\fP into the current one. If some +of the merged rules and configurations overlap with the previously defined ones, +conflicts are resolved in the usual way: the earliest rule takes priority, and +the latest configuration overrides preceding ones. The exceptions are the special +rules \fB*\fP, \fB$\fP and (in condition mode) \fB<!>\fP, for which a block\-local +definition overrides any inherited ones. Use directive allows one to combine +different re2c blocks together in one block (see the example below). +.sp +Named blocks and in\-block use directive were added in re2c version 2.2. +Since that version reusable blocks are allowed by default (no special option +is needed). Before version 2.2 reuse mode was enabled with \fB\-r \-\-reusable\fP +option. Before version 1.2 reusable blocks could not be mixed with normal +blocks. +.SS Example of a \fB!use\fP directive +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +// This example shows how to combine reusable re2c blocks: two blocks +// (\(aqcolors\(aq and \(aqfish\(aq) are merged into one. The \(aqsalmon\(aq rule occurs +// in both blocks; the \(aqfish\(aq block takes priority because it is used +// earlier. Default rule * occurs in all three blocks; the local (not +// inherited) definition takes priority.
+ +const std = @import(\(dqstd\(dq); + +const Ans = enum {color, fish, dunno}; + +%{rules:colors + * { @panic(\(dqah\(dq); } + \(dqred\(dq | \(dqsalmon\(dq | \(dqmagenta\(dq { return Ans.color; } +%} + +%{rules:fish + * { @panic(\(dqoh\(dq); } + \(dqhaddock\(dq | \(dqsalmon\(dq | \(dqeel\(dq { return Ans.fish; } +%} + +fn lex(yyinput: [:0]const u8) Ans { + var yycursor: usize = 0; + var yymarker: usize = 0; + %{ + re2c:yyfill:enable = 0; + + !use:fish; + !use:colors; + * { return Ans.dunno; } // overrides inherited \(aq*\(aq rules + %} +} + +test { + try std.testing.expectEqual(lex(\(dqsalmon\(dq), Ans.fish); + try std.testing.expectEqual(lex(\(dqwhat?\(dq), Ans.dunno); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Example of a \fB/*!use:re2c ... */\fP block +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT \-\-input\-encoding utf8 + +// This example supports multiple input encodings: UTF\-8 and UTF\-32. +// Both lexers are generated from the same rules block, and the use +// blocks add only encoding\-specific configurations. + +const std = @import(\(dqstd\(dq); + +%{rules + re2c:yyfill:enable = 0; + + \(dq∀x ∃y\(dq { return yycursor; } + * { return null; } +%} + +fn lex_utf8(yyinput: []const u8) ?usize { + var yycursor: usize = 0; + var yymarker: usize = 0; + %{use + re2c:encoding:utf8 = 1; + re2c:define:YYCTYPE = u8; // the default + %} +} + +fn lex_utf32(yyinput: []const u32) ?usize { + var yycursor: usize = 0; + var yymarker: usize = 0; + %{use + re2c:encoding:utf32 = 1; + re2c:define:YYCTYPE = u32; + %} +} + +test { + const s8 = [_]u8{0xe2, 0x88, 0x80, 0x78, 0x20, 0xe2, 0x88, 0x83, 0x79}; + try std.testing.expectEqual(lex_utf8(&s8), s8.len); + + const s32 = [_]u32{0x2200, 0x78, 0x20, 0x2203, 0x79}; + try std.testing.expectEqual(lex_utf32(&s32), s32.len); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SUBMATCH EXTRACTION +.sp +re2c has two options for submatch extraction. 
+.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration. Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string. +\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars:re2c\fP and \fBmvars:re2c\fP directives. +If there is more than one way tags can be matched against the input, +ambiguity is resolved using leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration. There are two +flavours for different disambiguation policies: \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy. In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted. +The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&.
re2c provides a directive \fBmaxnmatch:re2c\fP +that defines \fBYYMAXNMATCH\fP, a constant that is equal to the maximum value of +\fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration. The only difference with \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2c generates variables \fByytl<k>\fP and +\fByytr<k>\fP for \fIk\fP\-th capturing group (the user should declare these with +\fBsvars:re2c\fP directive). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars\fP for POSIX longest\-match policy. +.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there\(aqre no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags:re2c\fP or \fBmtags:re2c\fP directives. +If the lexer state is stored, tag variables should be part of it. They also +need to be updated by \fBYYFILL\fP\&.
+.sp +S\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +save input position to an s\-tag: \fBt = YYCURSOR\fP with C pointer API or a +user\-defined operation \fBYYSTAGP(t)\fP with generic API +.IP \(bu 2 +save default value to an s\-tag: \fBt = NULL\fP with C pointer API or a +user\-defined operation \fBYYSTAGN(t)\fP with generic API +.IP \(bu 2 +copy one s\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +M\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +append input position to an m\-tag: a user\-defined operation \fBYYMTAGP(t)\fP +with both default and generic API +.IP \(bu 2 +append default value to an m\-tag: a user\-defined operation \fBYYMTAGN(t)\fP +with both default and generic API +.IP \(bu 2 +copy one m\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +S\-tags can be implemented as scalar values (pointers or offsets). M\-tags need a +more complex representation, as they need to store a sequence of tag values. The +most naive and inefficient representation of an m\-tag is a list (array, vector) +of tag values; a more efficient representation is to store all m\-tags in a +prefix\-tree represented as array of nodes \fB(v, p)\fP, where \fBv\fP is tag value +and \fBp\fP is a pointer to parent node. +.sp +Here is a simple example of using s\-tags to parse semantic versions consisting +of three numeric components: major, minor, patch (the latter is optional). +See below for a more complex example that uses \fBYYFILL\fP\&. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +const SemVer = struct { + major: u32, + minor: u32, + patch: u32, +}; + +const none = std.math.maxInt(usize); + +fn s2n(str: []const u8) u32 { // convert a pre\-parsed string to a number + var n: u32 = 0; + for (str) |c| { n = n * 10 + (c \- 48); } + return n; +} + +fn parse(yyinput: [:0]const u8) ?SemVer { + var yycursor: usize = 0; + var yymarker: usize = 0; + + // Final tag variables available in semantic action. + %{svars format = \(dqvar @@: usize = none;\(dq; %} + + // Intermediate tag variables used by the lexer (must be autogenerated). + %{stags format = \(dqvar @@: usize = none;\(dq; %} + + %{ + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0\-9]+; + + @t1 num @t2 \(dq.\(dq @t3 num @t4 (\(dq.\(dq @t5 num)? [\ex00] { + return SemVer { + .major = s2n(yyinput[t1..t2]), + .minor = s2n(yyinput[t3..t4]), + .patch = if (t5 == none) 0 else s2n(yyinput[t5..yycursor \- 1]), + }; + } + * { return null; } + %} +} + +test { + try std.testing.expectEqual(parse(\(dq23.34\(dq), SemVer{.major = 23, .minor = 34, .patch = 0}); + try std.testing.expectEqual(parse(\(dq1.2.99999\(dq), SemVer{.major = 1, .minor = 2, .patch = 99999}); + try std.testing.expectEqual(parse(\(dq1.a\(dq), null); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is a more complex example of using s\-tags with \fBYYFILL\fP to parse a file +with newline\-separated semantic versions. Tag variables are part of the lexer +state, and they are adjusted in \fBYYFILL\fP like other input positions. +Note that it is necessary for s\-tags because their values are invalidated after +shifting buffer contents. It may not be necessary in a custom implementation +where tag variables store offsets relative to the start of the input string +rather than the buffer, which may be the case with m\-tags. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +const bufsize = 4095; +const none = std.math.maxInt(usize); + +const err = error.SyntaxError; + +const SemVer = struct { + major: u32, + minor: u32, + patch: u32, +}; + +fn s2n(str: []const u8) u32 { // convert a pre\-parsed string to a number + var n: u32 = 0; + for (str) |c| { n = n * 10 + (c \- 48); } + return n; +} + +const State = struct { + yyinput: [bufsize + 1]u8, + yycursor: usize, + yymarker: usize, + yylimit: usize, + token: usize, + // Intermediate tag variables must be part of the lexer state passed to YYFILL. + // They don\(aqt correspond to tags and should be autogenerated by re2c. + %{stags format = \(dq@@: usize,\en\(dq; %} + eof: bool +}; + +fn fill(st: *State, file: anytype) i32 { + if (st.eof) { return \-1; } // unexpected EOF + + // Error: lexeme too long. In real life can reallocate a larger buffer. + if (st.token < 1) { return \-2; } + + // Shift buffer contents (discard everything up to the current token). + std.mem.copyBackwards( + u8, st.yyinput[0..st.yylimit \- st.token], st.yyinput[st.token..st.yylimit]); + st.yycursor \-= st.token; + st.yymarker = @subWithOverflow(st.yymarker, st.token)[0]; + st.yylimit \-= st.token; + // Tag variables need to be shifted like other input positions. The check + // for NONE is only needed if some tags are nested inside of alternative or + // repetition, so that they can have NONE value. + %{stags format = \(dqif (st.@@ != none) st.@@ = @subWithOverflow(st.@@, st.token)[0];\en\(dq; %} + st.token = 0; + + // Fill free space at the end of buffer with new data from file. + st.yylimit += file.read(st.yyinput[st.yylimit..bufsize]) catch 0; + st.yyinput[st.yylimit] = 0; // append sentinel symbol + + // If read less than expected, this is the end of input. 
+ st.eof = st.yylimit < bufsize; + + return 0; +} + +fn parse(st: *State, file: anytype) !std.ArrayList(SemVer) { + var vers = std.ArrayList(SemVer).init(std.testing.allocator); + + // Final tag variables available in semantic action. + %{svars format = \(dqvar @@: usize = 0;\en\(dq; %} + + loop: while (true) { + st.token = st.yycursor; + %{ + re2c:api = record; + re2c:eof = 0; + re2c:tags = 1; + re2c:variable:yyrecord = st; + re2c:define:YYFILL = \(dqfill(st, file) == 0\(dq; + + num = [0\-9]+; + + num @t1 \(dq.\(dq @t2 num @t3 (\(dq.\(dq @t4 num)? [\en] { + try vers.append(SemVer { + .major = s2n(st.yyinput[st.token..t1]), + .minor = s2n(st.yyinput[t2..t3]), + .patch = if (t4 == none) 0 else s2n(st.yyinput[t4..st.yycursor \- 1]), + }); + continue :loop; + } + $ { return vers; } + * { return error.SyntaxError; } + %} + } +} + +test { + const fname = \(dqinput\(dq; + const content = \(dq1.22.333\en\(dq ** bufsize; + + // Prepare input file: a few times the size of the buffer, containing + // strings with zeroes and escaped quotes. + var fw = try std.fs.cwd().createFile(fname, .{}); + try fw.writeAll(content); + fw.close(); + + // Prepare lexer state: all offsets are at the end of buffer. + var fr = try std.fs.cwd().openFile(fname, .{ .mode = .read_only}); + // Normally file would be part of the state struct, but BufferedReader type is unclear. + var br = std.io.bufferedReader(fr.reader()); + var st = State{ + .yyinput = undefined, + .yycursor = bufsize, + .yymarker = bufsize, + .yylimit = bufsize, + .token = bufsize, + %{stags format = \(dq.@@ = none,\en\(dq; %} + .eof = false, + }; + // Sentinel at \(gayylimit\(ga offset is set to zero, which triggers YYFILL. + st.yyinput[st.yylimit] = 0; + + // Manually construct expected result. + var expect = std.ArrayList(SemVer).init(std.testing.allocator); + for (0..bufsize) |_| try expect.append(SemVer{.major = 1, .minor = 22, .patch = 333}); + + // Run the lexer. 
+ var result = try parse(&st, &br); + try std.testing.expectEqualDeep(result, expect); + + // Cleanup: free memory and remove input file. + expect.deinit(); + result.deinit(); + fr.close(); + try std.fs.cwd().deleteFile(fname); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using capturing groups to parse semantic versions. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +const none = std.math.maxInt(usize); + +const SemVer = struct { + major: u32, + minor: u32, + patch: u32, +}; + +fn s2n(str: []const u8) u32 { // convert pre\-parsed string to a number + var n: u32 = 0; + for (str) |c| { n = n * 10 + (c \- 48); } + return n; +} + +fn parse(yyinput: [:0]const u8) ?SemVer { + var yycursor: usize = 0; + var yymarker: usize = 0; + + // Final tag variables available in semantic action. + %{svars format = \(dqvar @@: usize = none;\(dq; %} + + // Intermediate tag variables used by the lexer (must be autogenerated). + %{stags format = \(dqvar @@: usize = none;\(dq; %} + + %{ + re2c:yyfill:enable = 0; + re2c:captvars = 1; + + num = [0\-9]+; + + (num) \(dq.\(dq (num) (\(dq.\(dq num)? [\ex00] { + return SemVer { + .major = s2n(yyinput[yytl1..yytr1]), + .minor = s2n(yyinput[yytl2..yytr2]), + .patch = if (yytl3 == none) 0 else s2n(yyinput[yytl3 + 1..yytr3]) + }; + } + * { return null; } + %} +} + +test { + try std.testing.expectEqual(parse(\(dq23.34\(dq), SemVer{.major = 23, .minor = 34, .patch = 0}); + try std.testing.expectEqual(parse(\(dq1.2.99999\(dq), SemVer{.major = 1, .minor = 2, .patch = 99999}); + try std.testing.expectEqual(parse(\(dq1.a\(dq), null); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using m\-tags to parse a version with a variable number of +components. Tag variables are stored in a trie. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +const none = std.math.maxInt(usize); +const mtag_root = none \- 1; + +const err = error.SyntaxError; + +// An m\-tag tree is a way to store histories with an O(1) copy operation. +// Histories naturally form a tree, as they have common start and fork at some +// point. The tree is stored as an array of pairs (tag value, link to parent). +// An m\-tag is represented with a single link in the tree (array index). +const MtagElem = struct { + elem: usize, // tag value + pred: usize, // index of the predecessor node or root +}; + +// Append a single value to an m\-tag history. +fn add_mtag(trie: *std.ArrayList(MtagElem), mtag: usize, value: usize) !usize { + try trie.append(MtagElem{.elem = value, .pred = mtag}); + return trie.items.len \- 1; +} + +// Recursively unwind tag histories and collect version components. +fn unwind(trie: *std.ArrayList(MtagElem), + x: usize, + y: usize, + str: []const u8, + ver: *std.ArrayList(u32)) !void { + // Reached the root of the m\-tag tree, stop recursion. + if (x == mtag_root and y == mtag_root) return; + + // Unwind history further. + try unwind(trie, trie.items[x].pred, trie.items[y].pred, str, ver); + + // Get tag values. Tag histories must have equal length. + std.debug.assert(x != mtag_root and y != mtag_root); + var ex = trie.items[x].elem; + var ey = trie.items[y].elem; + + if (ex != none and ey != none) { + // Both tags are valid string indices, extract component. + try ver.append(s2n(str[ex..ey])); + } else { + // Both tags are none (this corresponds to zero repetitions). 
+ std.debug.assert(ex == none and ey == none); + } +} + +fn s2n(str: []const u8) u32 { // convert a pre\-parsed string to a number + var n: u32 = 0; + for (str) |c| { n = n * 10 + (c \- 48); } + return n; +} + +fn parse(yyinput: [:0]const u8) !std.ArrayList(u32) { + var yycursor: usize = 0; + var yymarker: usize = 0; + var mt = std.ArrayList(MtagElem).init(std.testing.allocator); + defer mt.deinit(); + + // Final tag variables available in semantic action. + %{svars format = \(dqvar @@: usize = none;\(dq; %} + %{mvars format = \(dqvar @@: usize = mtag_root;\(dq; %} + + // Intermediate tag variables used by the lexer (must be autogenerated). + %{stags format = \(dqvar @@: usize = none;\(dq; %} + %{mtags format = \(dqvar @@: usize = mtag_root;\(dq; %} + + %{ + re2c:define:YYMTAGP = \(dq@@ = add_mtag(&mt, @@, yycursor) catch none;\(dq; + re2c:define:YYMTAGN = \(dq@@ = add_mtag(&mt, @@, none) catch none;\(dq; + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0\-9]+; + + @t1 num @t2 (\(dq.\(dq #t3 num #t4)* [\ex00] { + var ver = std.ArrayList(u32).init(std.testing.allocator); + try ver.append(s2n(yyinput[t1..t2])); + try unwind(&mt, t3, t4, yyinput, &ver); + return ver; + } + * { return error.SyntaxError; } + %} +} + +test { + var result = try parse(\(dq1\(dq); + var expect = std.ArrayList(u32).init(std.testing.allocator); + try expect.appendSlice(&[_]u32{1}); + try std.testing.expectEqualDeep(result, expect); + expect.deinit(); + result.deinit(); +} + +test { + var result = try parse(\(dq1.2.3.4.5.6.7\(dq); + var expect = std.ArrayList(u32).init(std.testing.allocator); + try expect.appendSlice(&[_]u32{1, 2, 3, 4, 5, 6, 7}); + try std.testing.expectEqualDeep(result, expect); + expect.deinit(); + result.deinit(); +} + +test { + var result = parse(\(dq1.2.\(dq) catch null; + try std.testing.expectEqualDeep(result, null); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH ENCODING SUPPORT +.sp +It is necessary to understand the difference between \fBcode points\fP and 
+\fBcode units\fP\&. A code point is a numeric identifier of a symbol. A code unit is +the smallest unit of storage in the encoded text. A single code point may be +represented with one or more code units. In a fixed\-length encoding all code +points are represented with the same number of code units. In a variable\-length +encoding code points may be represented with a different number of code units. +Note that the \(dqany\(dq rule \fB[^]\fP matches any code point, but not necessarily +any code unit (the only way to match any code unit regardless of the encoding +is the default rule \fB*\fP). +The generated lexer works with a stream of code units: \fByych\fP stores a code +unit, and \fBYYCTYPE\fP is the code unit type. Regular expressions, on the other +hand, are specified in terms of code points. When re2c compiles regular +expressions to automata it translates code points to code units. This is +generally not a simple mapping: in variable\-length encodings a single code point +range may get translated to a complex code unit graph. +The following encodings are supported: +.INDENT 0.0 +.IP \(bu 2 +\fBASCII\fP (enabled by default). It is a fixed\-length encoding with code space +\fB[0\-255]\fP and 1\-byte code points and code units. +.IP \(bu 2 +\fBEBCDIC\fP (enabled with \fB\-\-ebcdic\fP or \fBre2c:encoding:ebcdic\fP). It is a +fixed\-length encoding with code space \fB[0\-255]\fP and 1\-byte code points and +code units. +.IP \(bu 2 +\fBUCS2\fP (enabled with \fB\-\-ucs2\fP or \fBre2c:encoding:ucs2\fP). It is a +fixed\-length encoding with code space \fB[0\-0xFFFF]\fP and 2\-byte code points +and code units. +.IP \(bu 2 +\fBUTF8\fP (enabled with \fB\-\-utf8\fP or \fBre2c:encoding:utf8\fP). It is a +variable\-length Unicode encoding. Code unit size is 1 byte. Code points are +represented with 1 \-\- 4 code units. +.IP \(bu 2 +\fBUTF16\fP (enabled with \fB\-\-utf16\fP or \fBre2c:encoding:utf16\fP). It is a +variable\-length Unicode encoding. 
Code unit size is 2 bytes. Code points are +represented with 1 \-\- 2 code units. +.IP \(bu 2 +\fBUTF32\fP (enabled with \fB\-\-utf32\fP or \fBre2c:encoding:utf32\fP). It is a +fixed\-length Unicode encoding with code space \fB[0\-0x10FFFF]\fP and 4\-byte code +points and code units. +.UNINDENT +.sp +Include file \fBinclude/unicode_categories.re\fP provides re2c definitions for the +standard Unicode categories. +.sp +Option \fB\-\-input\-encoding\fP specifies source file encoding, which can be used to +enable Unicode literals in regular expressions. For example +\fB\-\-input\-encoding utf8\fP tells re2c that the source file is in UTF8 (it differs +from \fB\-\-utf8\fP which sets input text encoding). Option \fB\-\-encoding\-policy\fP +specifies the way re2c handles Unicode surrogates (code points in range +\fB[0xD800\-0xDFFF]\fP). +.sp +Below is an example of a lexer for UTF8 encoded Unicode identifiers. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT \-\-utf8 + +const std = @import(\(dqstd\(dq); + +%{include \(dqunicode_categories.re\(dq %} + +fn lex(yyinput: [:0]const u8) bool { + var yycursor: u32 = 0; + var yymarker: u32 = 0; + + %{ + re2c:yyfill:enable = 0; + + // Simplified \(dqUnicode Identifier and Pattern Syntax\(dq + // (see https://unicode.org/reports/tr31) + id_start = L | Nl | [$_]; + id_continue = id_start | Mn | Mc | Nd | Pc | [\eu200D\eu05F3]; + identifier = id_start id_continue*; + + identifier { return true; } + * { return false; } + %} +} + +test { + try std.testing.expect(lex(\(dq_Ыдентификатор\(dq)); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH INCLUDE FILES +.sp +re2c allows one to include other files using directive \fB/*!include:re2c FILE */\fP +or \fB!include FILE ;\fP, where \fBFILE\fP is a path to the file to be included. +The first form should be used outside of re2c blocks, and the second form allows +one to include a file in the middle of a re2c block. 
re2c looks for included +files in the directory of the including file and in include locations, which +can be specified with \fB\-I\fP option. +Include directives in re2c work in the same way as C/C++ \fB#include\fP: the contents +of \fBFILE\fP are copy\-pasted verbatim in place of the directive. Include files +may have further includes of their own. Use \fB\-\-depfile\fP option to track build +dependencies of the output file on include files. +re2c provides some predefined include files that can be found in the +\fBinclude/\fP subdirectory of the project. These files contain definitions that +can be useful to other projects (such as Unicode categories) and form something +like a standard library for re2c. +Below is an example of using include directive. +.SS Include file 1 (definitions.zig) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +const Num = enum { integer, floating, nan }; + +%{ + number = [1\-9][0\-9]*; +%} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Include file 2 (extra_rules.re.inc) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// floating\-point numbers +frac = [0\-9]* \(dq.\(dq [0\-9]+ | [0\-9]+ \(dq.\(dq; +exp = \(aqe\(aq [+\-]? [0\-9]+; +float = frac exp? 
| [0\-9]+ exp; + +float { return Num.floating; } + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT + +const std = @import(\(dqstd\(dq); + +%{include \(dqdefinitions.zig\(dq %} + +fn lex(yyinput: [:0]const u8) Num { + var yycursor: u32 = 0; + var yymarker: u32 = 0; + %{ + re2c:yyfill:enable = 0; + + * { return Num.nan; } + number { return Num.integer; } + !include \(dqextra_rules.re.inc\(dq; + %} +} + +test { + try std.testing.expectEqual(lex(\(dq123\(dq), Num.integer); + try std.testing.expectEqual(lex(\(dq123.4567\(dq), Num.floating); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH HEADER FILES +.sp +re2c allows one to generate a header file from the input \fB\&.re\fP file using option +\fB\-t\fP, \fB\-\-type\-header\fP or configuration \fBre2c:flags:type\-header\fP and +directives \fB/*!header:re2c:on*/\fP and \fB/*!header:re2c:off*/\fP\&. The first directive +marks the beginning of the header file, and the second directive marks the end of +it. Everything between these directives is processed by re2c, and the generated +code is written to the file specified by the \fB\-t \-\-type\-header\fP option (or +\fBstdout\fP if this option was not used). An autogenerated header file may be needed +in cases when re2c is used to generate definitions of constants, variables and +structs that must be visible from other translation units. +.sp +Here is an example of generating a header file that contains the definition of the +lexer state with tag variables (the number of variables depends on the regular +grammar and is unknown to the programmer).
+.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2zig $INPUT \-o $OUTPUT \-\-header lexer/state.zig + +const std = @import(\(dqstd\(dq); +const state = @import(\(dqlexer/state.zig\(dq); // the module is generated by re2c + +%{header:on %} +pub const State = struct { + yyinput: [:0]const u8, + yycursor: usize, + %{stags format = \(dq@@: usize,\(dq; %} +}; +%{header:off %} + +fn lex(yyrecord: *state.State) usize { + var t: usize = 0; + %{ + re2c:header = \(dqlexer/state.zig\(dq; + re2c:api = record; + re2c:yyfill:enable = 0; + re2c:tags = 1; + + [a]* @t [b]* { return t; } + %} +} + +test { + var st = state.State { + .yyinput = \(dqab\(dq, + .yycursor = 0, + %{stags format = \(dq.@@ = 0,\(dq; %} + }; + try std.testing.expectEqual(lex(&st), 1); +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Header file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// Generated by re2c + +pub const State = struct { + yyinput: [:0]const u8, + yycursor: usize, + yyt1: usize, +}; + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SKELETON PROGRAMS +.sp +With the \fB\-S, \-\-skeleton\fP option, re2c ignores all non\-re2c code and generates +a self\-contained C program that can be further compiled and executed. The +program consists of lexer code and input data. For each constructed DFA (block +or condition) re2c generates a standalone lexer and two files: an \fB\&.input\fP +file with strings derived from the DFA and a \fB\&.keys\fP file with expected match +results. The program runs each lexer on the corresponding \fB\&.input\fP file and +compares results with the expectations. +Skeleton programs are very useful for a number of reasons: +.INDENT 0.0 +.IP \(bu 2 +They can check correctness of various re2c optimizations (the data is +generated early in the process, before any DFA transformations have taken +place). +.IP \(bu 2 +Generating a set of input data with good coverage may be useful for both +testing and benchmarking. 
+.IP \(bu 2 +Generating self\-contained executable programs allows one to get minimized test +cases (the original code may be large or have a lot of dependencies). +.UNINDENT +.sp +The difficulty with generating input data is that for all but the most trivial +cases the number of possible input strings is too large (even if the string +length is limited). re2c solves this difficulty by generating sufficiently +many strings to cover almost all DFA transitions. It uses the following +algorithm. First, it constructs a skeleton of the DFA. For encodings with 1\-byte +code unit size (such as ASCII, UTF\-8 and EBCDIC) skeleton is just an exact copy +of the original DFA. For encodings with multibyte code units skeleton is a copy +of DFA with certain transitions omitted: namely, re2c takes at most 256 code +units for each disjoint continuous range that corresponds to a DFA transition. +The chosen values are evenly distributed and include range bounds. Instead of +trying to cover all possible paths in the skeleton (which is infeasible) re2c +generates sufficiently many paths to cover all skeleton transitions, and thus +trigger the corresponding conditional jumps in the lexer. +The algorithm implementation is limited by ~1Gb of transitions and consumes +constant amount of memory (re2c writes data to file as soon as it is generated). +.SH VISUALIZATION AND DEBUG +.sp +With the \fB\-D, \-\-emit\-dot\fP option, re2c does not generate code. Instead, +it dumps the generated DFA in DOT format. +One can convert this dump to an image of the DFA using Graphviz or another library. +Note that this option shows the final DFA after it has gone through a number of +optimizations and transformations. Earlier stages can be dumped with various debug +options, such as \fB\-\-dump\-nfa\fP, \fB\-\-dump\-dfa\-raw\fP etc. (see the full list of options). +.SH SEE ALSO +.sp +You can find more information about re2c at the official website: \fI\%http://re2c.org\fP\&. 
+Similar programs are flex(1), lex(1), quex(\fI\%http://quex.sourceforge.net\fP). +.SH AUTHORS +.sp +re2c was originally written by Peter Bumbulis (\fI\%peter@csg.uwaterloo.ca\fP) in 1993. +Marcus Boerger and Dan Nuffer spent several years to turn the original idea into +a production ready code generator. Since then it has been maintained and +developed by multiple volunteers, most notably, +Brian Young (\fI\%bayoung@acm.org\fP), +\fI\%Marcus Boerger\fP, +Dan Nuffer (\fI\%nuffer@users.sourceforge.net\fP), +\fI\%Ulya Trofimovich\fP (\fI\%skvadrik@gmail.com\fP), +\fI\%Serghei Iakovlev\fP, +\fI\%Sergei Trofimovich\fP, +\fI\%Petr Skocik\fP, +\fI\%ligfx\fP +and \fI\%raekye\fP\&. +.\" Generated by docutils manpage writer. +. diff --git a/build/split_man.py b/build/split_man.py index 45d4a160b..c0e94cbbf 100644 --- a/build/split_man.py +++ b/build/split_man.py @@ -5,15 +5,18 @@ input manpage. """ +import re import sys -if len(sys.argv) != 4: - print('usage:', sys.argv[0], ' ') +if len(sys.argv) != 3: + print('usage:', sys.argv[0], ' ') exit(1) input = sys.argv[1] output = sys.argv[2] -lang = sys.argv[3].lower().encode('utf-8') + +# Extract language name from the output filename. 
+lang = re.search('re2([a-z]*)\.1', output).group(1).encode('utf-8') hdr_ext = None disclaimer = None @@ -28,7 +31,8 @@ elif lang == b'go': src_ext = b'go' lang_name = b'Go' -elif lang == b'haskell': +elif lang == b'hs': + lang = b'haskell' src_ext = b'hs' lang_name = b'Haskell' elif lang == b'java': @@ -40,7 +44,8 @@ elif lang == b'ocaml': src_ext = b'ml' lang_name = b'OCaml' -elif lang == b'python': +elif lang == b'py': + lang = b'python' src_ext = b'py' lang_name = b'Python' elif lang == b'rust': diff --git a/cmake/Re2cGenDocs.cmake b/cmake/Re2cGenDocs.cmake index e046b55b7..b6e7cb3cd 100644 --- a/cmake/Re2cGenDocs.cmake +++ b/cmake/Re2cGenDocs.cmake @@ -1,19 +1,21 @@ -function(re2c_gen_manpage source target bootstrap lang) +function(re2c_gen_manpage source target) if(RE2C_REBUILD_DOCS) get_filename_component(targetdir "${target}" DIRECTORY) - set(source_l "${source}.${lang}") + set(split_source "${target}.rst") + file(RELATIVE_PATH relative_target "${CMAKE_CURRENT_BINARY_DIR}" "${target}") + set(bootstrap "${CMAKE_CURRENT_SOURCE_DIR}/bootstrap/${relative_target}") add_custom_command( OUTPUT "${target}" COMMAND "${CMAKE_COMMAND}" -E make_directory ${targetdir} - COMMAND "${PYTHON}" "${re2c_splitman}" "${source}" "${source_l}" "${lang}" - COMMAND "${PYTHON}" "${re2c_rst2man}" --tab-width=4 "${source_l}" "${target}" + COMMAND "${PYTHON}" "${re2c_splitman}" "${source}" "${split_source}" + COMMAND "${PYTHON}" "${re2c_rst2man}" --tab-width=4 "${split_source}" "${target}" COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${target}" "${bootstrap}" DEPENDS "${source}" "${re2c_splitman}" "${re2c_rst2man}" "${re2c_docs_sources}" - BYPRODUCTS "${source_l}" + BYPRODUCTS "${split_source}" ) else() add_custom_command( diff --git a/doc/manpage.rst.in b/doc/manpage.rst.in index 329edd220..cf2c577ee 100644 --- a/doc/manpage.rst.in +++ b/doc/manpage.rst.in @@ -159,7 +159,7 @@ Submatch extraction :literal: :code: RE2C_LANG .. 
include:: @top_srcdir@/doc/manual/submatch/submatch_example_captures.rst_ -.. include:: @top_srcdir@/examples/RE2C_LANG/submatch/04_posix_captures.re +.. include:: @top_srcdir@/examples/RE2C_LANG/submatch/03_captures.re :literal: :code: RE2C_LANG .. include:: @top_srcdir@/doc/manual/submatch/submatch_example_mtags.rst_ diff --git a/doc/manual/api/api2_c.rst_ b/doc/manual/api/api2_c.rst_ index 6e772dd16..4dc14eb31 100644 --- a/doc/manual/api/api2_c.rst_ +++ b/doc/manual/api/api2_c.rst_ @@ -12,8 +12,8 @@ program: **Record API** (*added in version 4.0*) Record API is useful in cases when lexer state must be stored in a struct. - It is enabled with option ``--api record`` or configuration - ``re2c:api = record``. This API consists of a variable ``yyrecord`` (the + It is enabled with ``--api record`` option or ``re2c:api = record`` + configuration. This API consists of a variable ``yyrecord`` (the name can be overridden with ``re2c:variable:yyrecord``) that should be defined as a struct with fields ``yycursor``, ``yymarker``, ``yyctxmarker``, ``yylimit`` (only the fields used by the generated code need to be defined, @@ -23,8 +23,8 @@ program: **Generic API** (*added in version 0.14*) - This is the most flexible API provided by re2c. It is enabled with - ``--api generic`` option or ``re2c:api = generic`` configuration. + This is the most flexible API. It is enabled with ``--api generic`` option + or ``re2c:api = generic`` configuration. This API contains primitives for generic operations: ``YYPEEK``, ``YYSKIP``, diff --git a/doc/manual/api/api2_d.rst_ b/doc/manual/api/api2_d.rst_ new file mode 100644 index 000000000..c32622908 --- /dev/null +++ b/doc/manual/api/api2_d.rst_ @@ -0,0 +1,41 @@ + +re2d has three API flavors that define the core set of primitives used by a +program: + +**Simple API** + This is the default API for the D backend.
It consists of primitives + ``YYCURSOR``, ``YYMARKER``, ``YYCTXMARKER`` and ``YYLIMIT``, which + should be defined as pointers of type ``YYCTYPE*``. + + | + +**Record API** + Record API is useful in cases when lexer state must be stored in a struct. + It is enabled with ``--api record`` option or ``re2c:api = record`` + configuration. This API consists of a variable ``yyrecord`` (the + name can be overridden with ``re2c:variable:yyrecord``) that should be + defined as a struct with fields ``yycursor``, ``yymarker``, ``yyctxmarker``, + ``yylimit`` (only the fields used by the generated code need to be defined, + and their names can be configured). + + | + +**Generic API** + This is the most flexible API. It is enabled with ``--api generic`` option + or ``re2c:api = generic`` configuration. + It contains primitives for generic operations: + ``YYPEEK``, + ``YYSKIP``, + ``YYBACKUP``, + ``YYBACKUPCTX``, + ``YYSTAGP``, + ``YYSTAGN``, + ``YYMTAGP``, + ``YYMTAGN``, + ``YYRESTORE``, + ``YYRESTORECTX``, + ``YYRESTORETAG``, + ``YYSHIFT``, + ``YYSHIFTSTAG``, + ``YYSHIFTMTAG``, + ``YYLESSTHAN``. diff --git a/doc/manual/api/api2_go.rst_ b/doc/manual/api/api2_go.rst_ index 108d3c780..60a820130 100644 --- a/doc/manual/api/api2_go.rst_ +++ b/doc/manual/api/api2_go.rst_ @@ -4,8 +4,8 @@ program: **Simple API** (*added in version 4.0*) - This is a basic API that can be enabled with option ``--api simple`` or - configuration ``re2c:api = simple``. It consists of the following + This is a basic API that can be enabled with ``--api simple`` option or + ``re2c:api = simple`` configuration. It consists of the following primitives: ``YYINPUT`` (which should be defined as a sequence of code units, e.g. a string) and ``YYCURSOR``, ``YYMARKER``, ``YYCTXMARKER``, ``YYLIMIT`` (which should be defined as indices in ``YYINPUT``). @@ -15,8 +15,8 @@ program: **Record API** (*added in version 4.0*) Record API is useful in cases when lexer state must be stored in a struct. 
- It is enabled with option ``--api record`` or configuration - ``re2c:api = record``. This API consists of a variable ``yyrecord`` (the + It is enabled with ``--api record`` option or ``re2c:api = record`` + configuration. This API consists of a variable ``yyrecord`` (the name can be overridden with ``re2c:variable:yyrecord``) that should be defined as a struct with fields ``yyinput``, ``yycursor``, ``yymarker``, ``yyctxmarker``, ``yylimit`` (only the fields used by the generated code @@ -25,10 +25,8 @@ program: | **Generic API** - (*added in version 0.14*) - This is the default API for the Go backend. It is enabled with - ``--api generic`` option or ``re2c:api = generic`` configuration. - This API contains primitives for generic operations: + This is the most flexible API and the default API for the Go backend. + It contains primitives for generic operations: ``YYPEEK``, ``YYSKIP``, ``YYBACKUP``, diff --git a/doc/manual/api/api2_haskell.rst_ b/doc/manual/api/api2_haskell.rst_ new file mode 100644 index 000000000..35d3a1581 --- /dev/null +++ b/doc/manual/api/api2_haskell.rst_ @@ -0,0 +1,35 @@ + +re2hs has two API flavors that define the core set of primitives used by a +program: + +**Record API** + Record API is the default API for the Haskell backend. + This API consists of a binding ``yyrecord`` (the name can be overridden with + ``re2c:variable:yyrecord``) that should be defined as a record with fields + ``_yyinput``, ``_yycursor``, ``_yymarker``, ``_yyctxmarker``, ``_yylimit``. + Only the fields used by the generated code need to be defined, and their + names can be configured. + + | + +**Generic API** + This is the most flexible API. It is enabled with ``--api generic`` option + or ``re2c:api = generic`` configuration. 
+ It contains primitives for generic operations: + ``YYPEEK``, + ``YYSKIP``, + ``YYBACKUP``, + ``YYBACKUPCTX``, + ``YYSTAGP``, + ``YYSTAGN``, + ``YYMTAGP``, + ``YYMTAGN``, + ``YYRESTORE``, + ``YYRESTORECTX``, + ``YYRESTORETAG``, + ``YYCOPYSTAG``, + ``YYCOPYMTAG``, + ``YYSHIFT``, + ``YYSHIFTSTAG``, + ``YYSHIFTMTAG``, + ``YYLESSTHAN``. diff --git a/doc/manual/api/api2_java.rst_ b/doc/manual/api/api2_java.rst_ new file mode 100644 index 000000000..77da3d606 --- /dev/null +++ b/doc/manual/api/api2_java.rst_ @@ -0,0 +1,42 @@ + +re2java has three API flavors that define the core set of primitives used by a +program: + +**Simple API** + This is the default API for the Java backend. It consists of the following + primitives: ``YYINPUT`` (which should be defined as a sequence of code + units, e.g. a string) and ``YYCURSOR``, ``YYMARKER``, ``YYCTXMARKER``, + ``YYLIMIT`` (which should be defined as indices in ``YYINPUT``). + + | + +**Record API** + Record API is useful in cases when lexer state must be stored in a class. + It is enabled with ``--api record`` option or ``re2c:api = record`` + configuration. This API consists of a variable ``yyrecord`` (the + name can be overridden with ``re2c:variable:yyrecord``) that should be + defined as a class with fields ``yyinput``, ``yycursor``, ``yymarker``, + ``yyctxmarker``, ``yylimit`` (only the fields used by the generated code + need to be defined, and their names can be configured). + + | + +**Generic API** + This is the most flexible API. It is enabled with ``--api generic`` option + or ``re2c:api = generic`` configuration. + It contains primitives for generic operations: + ``YYPEEK``, + ``YYSKIP``, + ``YYBACKUP``, + ``YYBACKUPCTX``, + ``YYSTAGP``, + ``YYSTAGN``, + ``YYMTAGP``, + ``YYMTAGN``, + ``YYRESTORE``, + ``YYRESTORECTX``, + ``YYRESTORETAG``, + ``YYSHIFT``, + ``YYSHIFTSTAG``, + ``YYSHIFTMTAG``, + ``YYLESSTHAN``. 
diff --git a/doc/manual/api/api2_js.rst_ b/doc/manual/api/api2_js.rst_ new file mode 100644 index 000000000..48bf0cbfe --- /dev/null +++ b/doc/manual/api/api2_js.rst_ @@ -0,0 +1,42 @@ + +re2js has three API flavors that define the core set of primitives used by a +program: + +**Simple API** + This is the default API for the JavaScript backend. It consists of the + following primitives: ``YYINPUT`` (which should be defined as a sequence of + code units, e.g. a string) and ``YYCURSOR``, ``YYMARKER``, ``YYCTXMARKER``, + ``YYLIMIT`` (which should be defined as indices in ``YYINPUT``). + + | + +**Record API** + Record API is useful in cases when lexer state must be stored in an object. + It is enabled with ``--api record`` option or ``re2c:api = record`` + configuration. This API consists of a variable ``yyrecord`` (the + name can be overridden with ``re2c:variable:yyrecord``) that should be + defined as an object with properties ``yyinput``, ``yycursor``, + ``yymarker``, ``yyctxmarker``, ``yylimit`` (only the fields used by the + generated code need to be defined, and their names can be configured). + + | + +**Generic API** + This is the most flexible API. It is enabled with ``--api generic`` option + or ``re2c:api = generic`` configuration. + It contains primitives for generic operations: + ``YYPEEK``, + ``YYSKIP``, + ``YYBACKUP``, + ``YYBACKUPCTX``, + ``YYSTAGP``, + ``YYSTAGN``, + ``YYMTAGP``, + ``YYMTAGN``, + ``YYRESTORE``, + ``YYRESTORECTX``, + ``YYRESTORETAG``, + ``YYSHIFT``, + ``YYSHIFTSTAG``, + ``YYSHIFTMTAG``, + ``YYLESSTHAN``. diff --git a/doc/manual/api/api2_ocaml.rst_ b/doc/manual/api/api2_ocaml.rst_ new file mode 100644 index 000000000..07dbd7f92 --- /dev/null +++ b/doc/manual/api/api2_ocaml.rst_ @@ -0,0 +1,33 @@ + +re2ocaml has two API flavors that define the core set of primitives used by a +program: + +**Record API** + Record API is the default API for the OCaml backend. 
+ This API consists of a variable ``yyrecord`` (the name can be overridden with + ``re2c:variable:yyrecord``) that should be defined as a record with fields + ``_yyinput``, ``_yycursor``, ``_yymarker``, ``_yyctxmarker``, ``_yylimit``. + Only the fields used by the generated code need to be defined, and their + names can be configured. + + | + +**Generic API** + This is the most flexible API. It is enabled with ``--api generic`` option + or ``re2c:api = generic`` configuration. + It contains primitives for generic operations: + ``YYPEEK``, + ``YYSKIP``, + ``YYBACKUP``, + ``YYBACKUPCTX``, + ``YYSTAGP``, + ``YYSTAGN``, + ``YYMTAGP``, + ``YYMTAGN``, + ``YYRESTORE``, + ``YYRESTORECTX``, + ``YYRESTORETAG``, + ``YYSHIFT``, + ``YYSHIFTSTAG``, + ``YYSHIFTMTAG``, + ``YYLESSTHAN``. diff --git a/doc/manual/api/api2_python.rst_ b/doc/manual/api/api2_python.rst_ new file mode 100644 index 000000000..cd848f01c --- /dev/null +++ b/doc/manual/api/api2_python.rst_ @@ -0,0 +1,42 @@ + +re2py has three API flavors that define the core set of primitives used by a +program: + +**Simple API** + This is the default API for the Python backend. It consists of the + following primitives: ``YYINPUT`` (which should be defined as a sequence of + code units, e.g. a string) and ``YYCURSOR``, ``YYMARKER``, ``YYCTXMARKER``, + ``YYLIMIT`` (which should be defined as indices in ``YYINPUT``). + + | + +**Record API** + Record API is useful in cases when lexer state must be stored in a class. + It is enabled with ``--api record`` option or ``re2c:api = record`` + configuration. This API consists of a variable ``yyrecord`` (the + name can be overridden with ``re2c:variable:yyrecord``) that should be + defined as a class with attributes ``yyinput``, ``yycursor``, ``yymarker``, + ``yyctxmarker``, ``yylimit`` (only the fields used by the generated code + need to be defined, and their names can be configured). + + | + +**Generic API** + This is the most flexible API. 
It is enabled with ``--api generic`` option + or ``re2c:api = generic`` configuration. + It contains primitives for generic operations: + ``YYPEEK``, + ``YYSKIP``, + ``YYBACKUP``, + ``YYBACKUPCTX``, + ``YYSTAGP``, + ``YYSTAGN``, + ``YYMTAGP``, + ``YYMTAGN``, + ``YYRESTORE``, + ``YYRESTORECTX``, + ``YYRESTORETAG``, + ``YYSHIFT``, + ``YYSHIFTSTAG``, + ``YYSHIFTMTAG``, + ``YYLESSTHAN``. diff --git a/doc/manual/api/api2_rust.rst_ b/doc/manual/api/api2_rust.rst_ index a8a44efcf..6ad697aa6 100644 --- a/doc/manual/api/api2_rust.rst_ +++ b/doc/manual/api/api2_rust.rst_ @@ -4,8 +4,8 @@ program: **Simple API** (*added in version 4.0*) - This is a basic API that can be enabled with option ``--api simple`` or - configuration ``re2c:api = simple``. It consists of the following + This is a basic API that can be enabled with ``--api simple`` option or + ``re2c:api = simple`` configuration. It consists of the following primitives: ``YYINPUT`` (which should be defined as a sequence of code units, e.g. a string) and ``YYCURSOR``, ``YYMARKER``, ``YYCTXMARKER``, ``YYLIMIT`` (which should be defined as indices in ``YYINPUT``). @@ -15,8 +15,8 @@ program: **Record API** (*added in version 4.0*) Record API is useful in cases when lexer state must be stored in a struct. - It is enabled with option ``--api record`` or configuration - ``re2c:api = record``. This API consists of a variable ``yyrecord`` (the + It is enabled with ``--api record`` option or ``re2c:api = record`` + configuration. This API consists of a variable ``yyrecord`` (the name can be overridden with ``re2c:variable:yyrecord``) that should be defined as a struct with fields ``yyinput``, ``yycursor``, ``yymarker``, ``yyctxmarker``, ``yylimit`` (only the fields used by the generated code @@ -25,9 +25,7 @@ program: | **Generic API** - (*added in version 0.14*) - This is the default API for the Rust backend. It is enabled with - ``--api generic`` option or ``re2c:api = generic`` configuration. 
+ This is the most flexible API and the default API for the Rust backend. This API contains primitives for generic operations: ``YYPEEK``, ``YYSKIP``, diff --git a/doc/manual/api/api2_v.rst_ b/doc/manual/api/api2_v.rst_ new file mode 100644 index 000000000..81e167e09 --- /dev/null +++ b/doc/manual/api/api2_v.rst_ @@ -0,0 +1,42 @@ + +re2v has three API flavors that define the core set of primitives used by a +program: + +**Simple API** + This is the default API for the V backend. It consists of the following + primitives: ``YYINPUT`` (which should be defined as a sequence of code + units, e.g. a string) and ``YYCURSOR``, ``YYMARKER``, ``YYCTXMARKER``, + ``YYLIMIT`` (which should be defined as indices in ``YYINPUT``). + + | + +**Record API** + Record API is useful in cases when lexer state must be stored in a struct. + It is enabled with ``--api record`` option or ``re2c:api = record`` + configuration. This API consists of a variable ``yyrecord`` (the + name can be overridden with ``re2c:variable:yyrecord``) that should be + defined as a struct with fields ``yyinput``, ``yycursor``, ``yymarker``, + ``yyctxmarker``, ``yylimit`` (only the fields used by the generated code + need to be defined, and their names can be configured). + + | + +**Generic API** + This is the most flexible API. It is enabled with ``--api generic`` option + or ``re2c:api = generic`` configuration. + It contains primitives for generic operations: + ``YYPEEK``, + ``YYSKIP``, + ``YYBACKUP``, + ``YYBACKUPCTX``, + ``YYSTAGP``, + ``YYSTAGN``, + ``YYMTAGP``, + ``YYMTAGN``, + ``YYRESTORE``, + ``YYRESTORECTX``, + ``YYRESTORETAG``, + ``YYSHIFT``, + ``YYSHIFTSTAG``, + ``YYSHIFTMTAG``, + ``YYLESSTHAN``. 
diff --git a/doc/manual/api/api2_zig.rst_ b/doc/manual/api/api2_zig.rst_ new file mode 100644 index 000000000..83f3d2b87 --- /dev/null +++ b/doc/manual/api/api2_zig.rst_ @@ -0,0 +1,42 @@ + +re2zig has three API flavors that define the core set of primitives used by a +program: + +**Simple API** + This is the default API for the Zig backend. It consists of the following + primitives: ``YYINPUT`` (which should be defined as a sequence of code + units, e.g. a string) and ``YYCURSOR``, ``YYMARKER``, ``YYCTXMARKER``, + ``YYLIMIT`` (which should be defined as indices in ``YYINPUT``). + + | + +**Record API** + Record API is useful in cases when lexer state must be stored in a struct. + It is enabled with ``--api record`` option or ``re2c:api = record`` + configuration. This API consists of a variable ``yyrecord`` (the + name can be overridden with ``re2c:variable:yyrecord``) that should be + defined as a struct with fields ``yyinput``, ``yycursor``, ``yymarker``, + ``yyctxmarker``, ``yylimit`` (only the fields used by the generated code + need to be defined, and their names can be configured). + + | + +**Generic API** + This is the most flexible API. It is enabled with ``--api generic`` option + or ``re2c:api = generic`` configuration. + It contains primitives for generic operations: + ``YYPEEK``, + ``YYSKIP``, + ``YYBACKUP``, + ``YYBACKUPCTX``, + ``YYSTAGP``, + ``YYSTAGN``, + ``YYMTAGP``, + ``YYMTAGN``, + ``YYRESTORE``, + ``YYRESTORECTX``, + ``YYRESTORETAG``, + ``YYSHIFT``, + ``YYSHIFTSTAG``, + ``YYSHIFTMTAG``, + ``YYLESSTHAN``. diff --git a/doc/manual/submatch/submatch.rst_ b/doc/manual/submatch/submatch.rst_ index 14f181ee1..a886ac655 100644 --- a/doc/manual/submatch/submatch.rst_ +++ b/doc/manual/submatch/submatch.rst_ @@ -1,51 +1,60 @@ re2c has two options for submatch extraction. -The first option is ``-T --tags``. With this option one can use standalone tags -of the form ``@stag`` and ``#mtag``, where ``stag`` and ``mtag`` are arbitrary -used-defined names. 
Tags can be used anywhere inside of a regular expression; -semantically they are just position markers. Tags of the form ``@stag`` are -called s-tags: they denote a single submatch value (the last input position -where this tag matched). Tags of the form ``#mtag`` are called m-tags: they -denote multiple submatch values (the whole history of repetitions of this tag). -All tags should be defined by the user as variables with the corresponding -names. With standalone tags re2c uses leftmost greedy disambiguation: submatch -positions correspond to the leftmost matching path through the regular -expression. +**Tags** + The first option is to use standalone *tags* of the form ``@stag`` or + ``#mtag``, where ``stag`` and ``mtag`` are arbitrary user-defined names. + Tags are enabled with ``-T --tags`` option or ``re2c:tags = 1`` + configuration. Semantically tags are position markers: they can be + inserted anywhere in a regular expression, and they bind to the + corresponding position (or multiple positions) in the input string. + *S-tags* bind to the last matching position, and *m-tags* bind to a list of + positions (they may be used in repetition subexpressions, where a single + position in a regular expression corresponds to multiple positions in the + input string). All tags should be defined by the user, either manually or + with the help of ``svars:re2c`` and ``mvars:re2c`` directives. + If there is more than one way tags can be matched against the input, + ambiguity is resolved using leftmost greedy disambiguation strategy. -The second option is ``-P --posix-captures``: it enables POSIX-compliant -capturing groups. In this mode parentheses in regular expressions denote the -beginning and the end of capturing groups; the whole regular expression is group -number zero. The number of groups for the matching rule is stored in a variable -``yynmatch``, and submatch results are stored in ``yypmatch`` array.
Both -``yynmatch`` and ``yypmatch`` should be defined by the user, and ``yypmatch`` -size must be at least ``[yynmatch * 2]``. re2c provides a directive -``/*!maxnmatch:re2c*/`` that defines ``YYMAXNMATCH``: a constant equal to the -maximal value of ``yynmatch`` among all rules. Note that re2c implements -POSIX-compliant disambiguation: each subexpression matches as long as possible, -and subexpressions that start earlier in regular expression have priority over -those starting later. Capturing groups are translated into s-tags under the -hood, therefore we use the word "tag" to describe them as well. +**Captures** + The second option is to use *capturing groups*. They are enabled with + ``--captures`` option or ``re2c:captures = 1`` configuration. There are two + flavours for different disambiguation policies: ``--leftmost-captures`` + (the default) is for leftmost greedy policy, and ``--posix-captures`` is + for POSIX longest-match policy. In this mode all parenthesized + subexpressions are considered capturing groups, and a bang can be used to + mark non-capturing groups: ``(! ... )``. With ``--invert-captures`` option or + ``re2c:invert-captures = 1`` configuration the meaning of bang is inverted. + The number of groups for the matching rule is stored in a variable + ``yynmatch`` (the whole regular expression is group number zero), and + submatch results are stored in ``yypmatch`` array. Both ``yynmatch`` and + ``yypmatch`` should be defined by the user, and ``yypmatch`` size must be at + least ``[yynmatch * 2]``. re2c provides a directive ``maxnmatch:re2c`` + that defines ``YYMAXNMATCH``, a constant equal to the maximum value of + ``yynmatch`` among all rules. -With both ``-P --posix-captures`` and ``T --tags`` options re2c uses efficient -submatch extraction algorithm described in the -`Tagged Deterministic Finite Automata with Lookahead `_ -paper.
The overhead on submatch extraction in the generated lexer grows with the -number of tags --- if this number is moderate, the overhead is barely -noticeable. In the lexer tags are implemented using a number of tag variables -generated by re2c. There is no one-to-one correspondence between tag variables -and tags: a single variable may be reused for different tags, and one tag may -require multiple variables to hold all its ambiguous values. Eventually -ambiguity is resolved, and only one final variable per tag survives. When a rule -matches, all its tags are set to the values of the corresponding tag variables. -The exact number of tag variables is unknown to the user; this number is -determined by re2c. However, tag variables should be defined by the user as a -part of the lexer state and updated by ``YYFILL``, therefore re2c provides -directives ``/*!stags:re2c*/`` and ``/*!mtags:re2c*/`` that can be used to -declare, initialize and manipulate tag variables. These directives have two -optional configurations: ``format = "@@";`` (specifies the template where ``@@`` -is substituted with the name of each tag variable), and ``separator = "";`` -(specifies the piece of code used to join the generated pieces for different -tag variables). +**Captvars** + Another way to use capturing groups is the ``--captvars`` option or + ``re2c:captvars = 1`` configuration. The only difference with ``--captures`` + is in the way the generated code stores submatch results: instead of + ``yynmatch`` and ``yypmatch`` re2c generates variables ``yytl<k>`` and + ``yytr<k>`` for the *k*-th capturing group (the user should declare these with + ``svars:re2c`` directive). Captures with variables support two disambiguation + policies: ``--leftmost-captvars`` or ``re2c:leftmost-captvars = 1`` for + leftmost greedy policy (the default one) and ``--posix-captvars`` or + ``re2c:posix-captvars = 1`` for POSIX longest-match policy.
+ +Under the hood all these options translate into tags and +`Tagged Deterministic Finite Automata with Lookahead <https://arxiv.org/abs/1907.08837>`_. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there are no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable. The generated TDFA uses a number of *tag variables* which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using ``stags:re2c`` or ``mtags:re2c`` directives. +If the lexer state is stored, tag variables should be part of it. They also +need to be updated by ``YYFILL``. S-tags support the following operations: diff --git a/doc/manual/submatch/submatch_example_captures.rst_ b/doc/manual/submatch/submatch_example_captures.rst_ index 09b24f5e5..3bb3f7e8b 100644 --- a/doc/manual/submatch/submatch_example_captures.rst_ +++ b/doc/manual/submatch/submatch_example_captures.rst_ @@ -1,2 +1,2 @@ -Here is an example of using POSIX capturing groups to parse semantic versions. +Here is an example of using capturing groups to parse semantic versions.