diff --git a/.bazelrc b/.bazelrc index c8ff98fa2..0311dd3e0 100644 --- a/.bazelrc +++ b/.bazelrc @@ -8,10 +8,6 @@ build --features=layering_check build --features=parse_headers # Abseil requires C++14 at minimum. -# Previously, the flag was set via `BAZEL_CXXOPTS`. On macOS, we also had to set -# `BAZEL_USE_CPP_ONLY_TOOLCHAIN` since Bazel wouldn't respect the former without -# the latter. However, the latter stopped Bazel from using Xcode and `-framework -# Foundation`, which CCTZ (vendored into Abseil) requires. build --enable_platform_specific_config build:linux --cxxopt=-std=c++14 build:macos --cxxopt=-std=c++14 @@ -19,3 +15,6 @@ build:windows --cxxopt=/std:c++14 # Print test logs for failed tests. test --test_output=errors + +# https://bazel.build/configure/best-practices#bazelrc-file +try-import %workspace%/user.bazelrc diff --git a/.github/bazel.sh b/.github/bazel.sh index 7295ec6a8..1fe309fab 100755 --- a/.github/bazel.sh +++ b/.github/bazel.sh @@ -1,24 +1,25 @@ #!/bin/bash set -eux -bazel clean -bazel build --compilation_mode=dbg -- //:all -bazel test --compilation_mode=dbg -- //:all \ - -//:dfa_test \ - -//:exhaustive1_test \ - -//:exhaustive2_test \ - -//:exhaustive3_test \ - -//:exhaustive_test \ - -//:random_test +# Disable MSYS/MSYS2 path conversion, which interferes with Bazel. 
+export MSYS_NO_PATHCONV='1' +export MSYS2_ARG_CONV_EXCL='*' -bazel clean -bazel build --compilation_mode=opt -- //:all -bazel test --compilation_mode=opt -- //:all \ - -//:dfa_test \ - -//:exhaustive1_test \ - -//:exhaustive2_test \ - -//:exhaustive3_test \ - -//:exhaustive_test \ - -//:random_test +for compilation_mode in dbg opt +do + bazel clean + bazel build --compilation_mode=${compilation_mode} -- \ + //:re2 \ + //python:re2 + bazel test --compilation_mode=${compilation_mode} -- \ + //:all \ + -//:dfa_test \ + -//:exhaustive1_test \ + -//:exhaustive2_test \ + -//:exhaustive3_test \ + -//:exhaustive_test \ + -//:random_test \ + //python:all +done exit 0 diff --git a/.github/cmake.sh b/.github/cmake.sh index 782334e81..5e42d3703 100755 --- a/.github/cmake.sh +++ b/.github/cmake.sh @@ -1,12 +1,11 @@ #!/bin/bash set -eux -cmake . -D CMAKE_BUILD_TYPE=Debug -D RE2_BUILD_TESTING=ON "$@" -cmake --build . --config Debug --clean-first -ctest -C Debug --output-on-failure -E 'dfa|exhaustive|random' - -cmake . -D CMAKE_BUILD_TYPE=Release -D RE2_BUILD_TESTING=ON "$@" -cmake --build . --config Release --clean-first -ctest -C Release --output-on-failure -E 'dfa|exhaustive|random' +for CMAKE_BUILD_TYPE in Debug Release +do + cmake . -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -D RE2_BUILD_TESTING=ON "$@" + cmake --build . 
--config ${CMAKE_BUILD_TYPE} --clean-first + ctest -C ${CMAKE_BUILD_TYPE} --output-on-failure -E 'dfa|exhaustive|random' +done exit 0 diff --git a/.github/workflows/ci-bazel.yml b/.github/workflows/ci-bazel.yml index 174ed68a9..eb4657049 100644 --- a/.github/workflows/ci-bazel.yml +++ b/.github/workflows/ci-bazel.yml @@ -15,6 +15,8 @@ jobs: BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - uses: actions/checkout@v4 - - uses: bazelbuild/setup-bazelisk@v3 + - uses: bazel-contrib/setup-bazel@0.8.0 + with: + bazelisk-version: '1.x' - run: .github/bazel.sh shell: bash diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 445da78d6..a7107db8b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,7 +34,7 @@ jobs: strategy: fail-fast: false matrix: - ver: [15, 16, 17] + ver: [16, 17, 18] env: CC: clang-${{ matrix.ver }} CXX: clang++-${{ matrix.ver }} diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 0810aa8bd..7736a263a 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -18,7 +18,9 @@ jobs: USER: runner steps: - uses: actions/checkout@v4 - - uses: bazelbuild/setup-bazelisk@v3 + - uses: bazel-contrib/setup-bazel@0.8.0 + with: + bazelisk-version: '1.x' - run: app/build.sh shell: bash - uses: actions/upload-pages-artifact@v3 diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index c69081bda..ba7841ec8 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -35,7 +35,9 @@ jobs: # Stash the timestamp for the commit SHA that triggered the workflow. - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}" shell: bash - - uses: bazelbuild/setup-bazelisk@v3 + - uses: bazel-contrib/setup-bazel@0.8.0 + with: + bazelisk-version: '1.x' - name: Prepare Python ${{ matrix.ver }} environment run: | "${PYTHON}" -m pip install --upgrade pip @@ -86,7 +88,9 @@ jobs: # Stash the timestamp for the commit SHA that triggered the workflow. 
- run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}" shell: bash - - uses: bazelbuild/setup-bazelisk@v3 + - uses: bazel-contrib/setup-bazel@0.8.0 + with: + bazelisk-version: '1.x' - uses: actions/setup-python@v5 with: python-version: ${{ matrix.ver }} @@ -135,12 +139,9 @@ jobs: # Stash the timestamp for the commit SHA that triggered the workflow. - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}" shell: bash - # Avoid the Chocolatey install of Bazel getting in the way; - # `bazelbuild/setup-bazelisk` doesn't work for some reason. - - run: | - choco uninstall -y bazel - choco install -y bazelisk - shell: bash + - uses: bazel-contrib/setup-bazel@0.8.0 + with: + bazelisk-version: '1.x' # Lowercase the architecture name for `actions/setup-python`. - run: | ARCHITECTURE=${{ matrix.arch.name }} diff --git a/.gitignore b/.gitignore index a671fe2cf..56f0a3153 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ core obj/ benchlog.* +user.bazelrc diff --git a/BUILD.bazel b/BUILD.bazel index ffe56c0c5..169c4d754 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -76,17 +76,17 @@ cc_library( }), visibility = ["//visibility:public"], deps = [ - "@com_google_absl//absl/base", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:fixed_array", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/types:optional", - "@com_google_absl//absl/types:span", + "@abseil-cpp//absl/base", + "@abseil-cpp//absl/base:core_headers", + "@abseil-cpp//absl/container:fixed_array", + "@abseil-cpp//absl/container:flat_hash_map", + "@abseil-cpp//absl/container:flat_hash_set", + "@abseil-cpp//absl/container:inlined_vector", + "@abseil-cpp//absl/strings", + 
"@abseil-cpp//absl/strings:str_format", + "@abseil-cpp//absl/synchronization", + "@abseil-cpp//absl/types:optional", + "@abseil-cpp//absl/types:span", ], ) @@ -130,11 +130,11 @@ cc_library( visibility = [":__subpackages__"], deps = [ ":re2", - "@com_google_absl//absl/base", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/flags:flag", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", + "@abseil-cpp//absl/base", + "@abseil-cpp//absl/base:core_headers", + "@abseil-cpp//absl/flags:flag", + "@abseil-cpp//absl/strings", + "@abseil-cpp//absl/strings:str_format", "@googletest//:gtest", ], ) @@ -145,8 +145,8 @@ cc_test( srcs = ["re2/testing/charclass_test.cc"], deps = [ ":testing", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/strings:str_format", + "@abseil-cpp//absl/base:core_headers", + "@abseil-cpp//absl/strings:str_format", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -158,7 +158,7 @@ cc_test( srcs = ["re2/testing/compile_test.cc"], deps = [ ":testing", - "@com_google_absl//absl/base:core_headers", + "@abseil-cpp//absl/base:core_headers", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -171,7 +171,7 @@ cc_test( deps = [ ":re2", ":testing", - "@com_google_absl//absl/base:core_headers", + "@abseil-cpp//absl/base:core_headers", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -183,7 +183,7 @@ cc_test( srcs = ["re2/testing/mimics_pcre_test.cc"], deps = [ ":testing", - "@com_google_absl//absl/base:core_headers", + "@abseil-cpp//absl/base:core_headers", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -195,7 +195,7 @@ cc_test( srcs = ["re2/testing/parse_test.cc"], deps = [ ":testing", - "@com_google_absl//absl/base:core_headers", + "@abseil-cpp//absl/base:core_headers", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -208,8 +208,8 @@ cc_test( deps = [ ":re2", ":testing", - "@com_google_absl//absl/base:core_headers", - 
"@com_google_absl//absl/strings", + "@abseil-cpp//absl/base:core_headers", + "@abseil-cpp//absl/strings", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -222,7 +222,7 @@ cc_test( deps = [ ":re2", ":testing", - "@com_google_absl//absl/base:core_headers", + "@abseil-cpp//absl/base:core_headers", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -235,8 +235,8 @@ cc_test( deps = [ ":re2", ":testing", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/strings:str_format", + "@abseil-cpp//absl/base:core_headers", + "@abseil-cpp//absl/strings:str_format", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -259,7 +259,7 @@ cc_test( srcs = ["re2/testing/required_prefix_test.cc"], deps = [ ":testing", - "@com_google_absl//absl/base:core_headers", + "@abseil-cpp//absl/base:core_headers", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -271,7 +271,7 @@ cc_test( srcs = ["re2/testing/search_test.cc"], deps = [ ":testing", - "@com_google_absl//absl/base:core_headers", + "@abseil-cpp//absl/base:core_headers", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -295,7 +295,7 @@ cc_test( srcs = ["re2/testing/simplify_test.cc"], deps = [ ":testing", - "@com_google_absl//absl/base:core_headers", + "@abseil-cpp//absl/base:core_headers", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -319,9 +319,9 @@ cc_test( deps = [ ":re2", ":testing", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/flags:flag", - "@com_google_absl//absl/strings:str_format", + "@abseil-cpp//absl/base:core_headers", + "@abseil-cpp//absl/flags:flag", + "@abseil-cpp//absl/strings:str_format", "@googletest//:gtest", "@googletest//:gtest_main", ], @@ -377,8 +377,8 @@ cc_test( srcs = ["re2/testing/random_test.cc"], deps = [ ":testing", - "@com_google_absl//absl/flags:flag", - "@com_google_absl//absl/strings:str_format", + "@abseil-cpp//absl/flags:flag", + "@abseil-cpp//absl/strings:str_format", "@googletest//:gtest", 
"@googletest//:gtest_main", ], @@ -391,10 +391,10 @@ cc_binary( deps = [ ":re2", ":testing", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/flags:flag", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/synchronization", + "@abseil-cpp//absl/container:flat_hash_map", + "@abseil-cpp//absl/flags:flag", + "@abseil-cpp//absl/strings:str_format", + "@abseil-cpp//absl/synchronization", "@google_benchmark//:benchmark_main", ], ) diff --git a/MODULE.bazel b/MODULE.bazel index f99fcd259..423572761 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -6,26 +6,22 @@ module( name = "re2", - version = "2024-02-01", + version = "2024-03-01", compatibility_level = 1, ) bazel_dep(name = "platforms", version = "0.0.8") -bazel_dep(name = "apple_support", version = "1.11.1", repo_name = "build_bazel_apple_support") +bazel_dep(name = "apple_support", version = "1.14.0") bazel_dep(name = "rules_cc", version = "0.0.9") -bazel_dep(name = "abseil-cpp", version = "20240116.0", repo_name = "com_google_absl") -bazel_dep(name = "rules_python", version = "0.29.0") -bazel_dep(name = "pybind11_bazel", version = "2.11.1.bzl.1") +bazel_dep(name = "abseil-cpp", version = "20240116.1") +bazel_dep(name = "rules_python", version = "0.31.0") +bazel_dep(name = "pybind11_bazel", version = "2.11.1.bzl.3") # This is a temporary hack for `x64_x86_windows`. # TODO(junyer): Remove whenever no longer needed. cc_configure = use_extension("@bazel_tools//tools/cpp:cc_configure.bzl", "cc_configure_extension") use_repo(cc_configure, "local_config_cc") -python_configure = use_extension("@pybind11_bazel//:python_configure.bzl", "extension") -python_configure.toolchain(python_version = "3") # ignored when non-root module -use_repo(python_configure, "local_config_python", "pybind11") - # These dependencies will be ignored when the `re2` module is not # the root module (or when `--ignore_dev_dependency` is enabled). 
bazel_dep(name = "google_benchmark", version = "1.8.3", dev_dependency = True) diff --git a/python/BUILD.bazel b/python/BUILD.bazel index 48d7d3f58..74b7339a5 100644 --- a/python/BUILD.bazel +++ b/python/BUILD.bazel @@ -12,14 +12,14 @@ pybind_extension( srcs = ["_re2.cc"], deps = [ "//:re2", - "@com_google_absl//absl/strings", + "@abseil-cpp//absl/strings", ], ) py_library( name = "re2", srcs = ["re2.py"], - data = [":_re2.so"], + data = [":_re2"], imports = ["."], visibility = ["//visibility:public"], ) diff --git a/python/setup.py b/python/setup.py index df65415ee..93de80376 100644 --- a/python/setup.py +++ b/python/setup.py @@ -7,6 +7,7 @@ import setuptools.command.build_ext import shutil import sys +import sysconfig long_description = r"""A drop-in replacement for the re module. @@ -48,9 +49,6 @@ def build_extension(self, ext): if 'GITHUB_ACTIONS' not in os.environ: return super().build_extension(ext) - # For @pybind11_bazel's `python_configure()`. - os.environ['PYTHON_BIN_PATH'] = sys.executable - cmd = ['bazel', 'build'] try: cpu = os.environ['BAZEL_CPU'] @@ -63,8 +61,9 @@ def build_extension(self, ext): cmd.append(f'--extra_toolchains=@local_config_cc//:cc-toolchain-{cpu}') except KeyError: pass - # Register the local Python toolchain with highest priority. - cmd.append('--extra_toolchains=@local_config_python//:py_toolchain') + # Register the local Python toolchains with highest priority. + self.generate_python_toolchains() + cmd.append('--extra_toolchains=//python/toolchains:all') # Print debug information during toolchain resolution. 
cmd.append('--toolchain_resolution_debug=.*') cmd += ['--compilation_mode=opt', '--', ':all'] @@ -78,6 +77,88 @@ def build_extension(self, ext): cmd = ['bazel', 'clean', '--expunge'] self.spawn(cmd) + def generate_python_toolchains(self): + include = sysconfig.get_path('include') + libs = os.path.join(include, '../libs') + + os.makedirs('toolchains') + shutil.copytree(include, 'toolchains/include') + try: + shutil.copytree(libs, 'toolchains/libs') + except FileNotFoundError: + # We must not be running on Windows. :) + pass + + with open('toolchains/BUILD.bazel', 'x') as file: + file.write( + """\ +load("@rules_python//python/cc:py_cc_toolchain.bzl", "py_cc_toolchain") +load("@rules_python//python:py_runtime.bzl", "py_runtime") +load("@rules_python//python:py_runtime_pair.bzl", "py_runtime_pair") + +package(default_visibility = ["//visibility:public"]) + +toolchain( + name = "py", + toolchain = ":py_toolchain", + toolchain_type = "@rules_python//python:toolchain_type", +) + +py_runtime_pair( + name = "py_toolchain", + py3_runtime = ":interpreter", +) + +py_runtime( + name = "interpreter", + interpreter_path = "{interpreter_path}", + interpreter_version_info = {{ + "major": "{major}", + "minor": "{minor}", + }}, + python_version = "PY3", +) + +toolchain( + name = "py_cc", + toolchain = ":py_cc_toolchain", + toolchain_type = "@rules_python//python/cc:toolchain_type", +) + +py_cc_toolchain( + name = "py_cc_toolchain", + headers = ":headers", + libs = ":libraries", + python_version = "{major}.{minor}", +) + +cc_library( + name = "headers", + hdrs = glob(["include/**/*.h"]), + includes = ["include"], + deps = select({{ + "@platforms//os:windows": [":interface_library"], + "//conditions:default": [], + }}), +) + +cc_import( + name = "interface_library", + interface_library = select({{ + "@platforms//os:windows": "libs/python{major}{minor}.lib", + "//conditions:default": None, + }}), + system_provided = True, +) + +# Not actually necessary for our purposes. 
:) +cc_library( + name = "libraries", +) +""".format(interpreter_path=sys.executable.replace('\\', '/'), + major=sys.version_info.major, + minor=sys.version_info.minor)) + def options(): bdist_wheel = {} diff --git a/re2/parse.cc b/re2/parse.cc index 904599280..2558b2a2e 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -337,6 +337,20 @@ Rune CycleFoldRune(Rune r) { return ApplyFold(f, r); } +// Add lo-hi to the class, along with their fold-equivalent characters. +static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) { + while (lo <= hi) { + cc->AddRange(lo, lo); + if ('A' <= lo && lo <= 'Z') { + cc->AddRange(lo - 'A' + 'a', lo - 'A' + 'a'); + } + if ('a' <= lo && lo <= 'z') { + cc->AddRange(lo - 'a' + 'A', lo - 'a' + 'A'); + } + lo++; + } +} + // Add lo-hi to the class, along with their fold-equivalent characters. // If lo-hi is already in the class, assume that the fold-equivalent // chars are there too, so there's no work to do. @@ -394,17 +408,26 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { // Pushes the literal rune r onto the stack. bool Regexp::ParseState::PushLiteral(Rune r) { // Do case folding if needed. 
- if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { - Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); - re->ccb_ = new CharClassBuilder; - Rune r1 = r; - do { - if (!(flags_ & NeverNL) || r != '\n') { - re->ccb_->AddRange(r, r); - } - r = CycleFoldRune(r); - } while (r != r1); - return PushRegexp(re); + if (flags_ & FoldCase) { + if (flags_ & Latin1 && (('A' <= r && r <= 'Z') || + ('a' <= r && r <= 'z'))) { + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + AddFoldedRangeLatin1(re->ccb_, r, r); + return PushRegexp(re); + } + if (!(flags_ & Latin1) && CycleFoldRune(r) != r) { + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + Rune r1 = r; + do { + if (!(flags_ & NeverNL) || r != '\n') { + re->ccb_->AddRange(r, r); + } + r = CycleFoldRune(r); + } while (r != r1); + return PushRegexp(re); + } } // Exclude newline if applicable. @@ -776,7 +799,8 @@ Rune* Regexp::LeadingString(Regexp* re, int* nrune, while (re->op() == kRegexpConcat && re->nsub() > 0) re = re->sub()[0]; - *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase); + *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & + (Regexp::FoldCase | Regexp::Latin1)); if (re->op() == kRegexpLiteral) { *nrune = 1; @@ -1175,7 +1199,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub, if (re->op() == kRegexpCharClass) { CharClass* cc = re->cc(); for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) - ccb.AddRange(it->lo, it->hi); + ccb.AddRangeFlags(it->lo, it->hi, re->parse_flags()); } else if (re->op() == kRegexpLiteral) { if (re->parse_flags() & Regexp::FoldCase) { // AddFoldedRange() can terminate prematurely if the character class @@ -1194,7 +1218,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub, } re->Decref(); } - Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags); + Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags & ~Regexp::FoldCase);
splices->emplace_back(re, sub + start, i - start); } @@ -1622,10 +1646,15 @@ void CharClassBuilder::AddRangeFlags( } // If folding case, add fold-equivalent characters too. - if (parse_flags & Regexp::FoldCase) - AddFoldedRange(this, lo, hi, 0); - else + if (parse_flags & Regexp::FoldCase) { + if (parse_flags & Regexp::Latin1) { + AddFoldedRangeLatin1(this, lo, hi); + } else { + AddFoldedRange(this, lo, hi, 0); + } + } else { AddRange(lo, hi); + } } // Look for a group with the given name. diff --git a/re2/re2.h b/re2/re2.h index 68fbed1d8..7ea44e044 100644 --- a/re2/re2.h +++ b/re2/re2.h @@ -972,7 +972,7 @@ inline RE2::Arg RE2::Octal(T* ptr) { } // Silence warnings about missing initializers for members of LazyRE2. -#if !defined(__clang__) && defined(__GNUC__) +#if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #endif diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc index 5cddd2334..9e3c94a69 100644 --- a/re2/testing/dump.cc +++ b/re2/testing/dump.cc @@ -96,17 +96,25 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) { break; case kRegexpLiteral: { Rune r = re->rune(); - char buf[UTFmax+1]; - buf[runetochar(buf, &r)] = 0; - s->append(buf); + if (re->parse_flags() & Regexp::Latin1) { + s->push_back(r); + } else { + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + } break; } case kRegexpLiteralString: for (int i = 0; i < re->nrunes(); i++) { Rune r = re->runes()[i]; - char buf[UTFmax+1]; - buf[runetochar(buf, &r)] = 0; - s->append(buf); + if (re->parse_flags() & Regexp::Latin1) { + s->push_back(r); + } else { + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + } } break; case kRegexpConcat: diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc index 7684b62a4..95294d5ff 100644 --- a/re2/testing/parse_test.cc +++ b/re2/testing/parse_test.cc @@ -225,6 +225,29 @@ static Test tests[] = { // Bug in Regexp::ToString() that emitted [^], which // would (obviously) 
fail to parse when fed back in. { "[\\s\\S]", "cc{0-0x10ffff}" }, + + // As per https://github.com/google/re2/issues/477, + // there were long-standing bugs involving Latin-1. + // Here, we exercise it WITHOUT case folding... + { "\xa5\x64\xd1", "str{\xa5""d\xd1}", Regexp::Latin1 }, + { "\xa5\xd1\x64", "str{\xa5\xd1""d}", Regexp::Latin1 }, + { "\xa5\x64[\xd1\xd2]", "cat{str{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 }, + { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}lit{d}}", Regexp::Latin1 }, + { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 }, + { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 }, + { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 }, + { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 }, + // Here, we exercise it WITH case folding... + // 0x64 should fold to 0x44, but neither 0xD1 nor 0xD2 + // should fold to 0xF1 and 0xF2, respectively. + { "\xa5\x64\xd1", "strfold{\xa5""d\xd1}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\xd1\x64", "strfold{\xa5\xd1""d}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\x64[\xd1\xd2]", "cat{strfold{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}litfold{d}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase }, }; bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) { @@ -492,7 +515,7 @@ TEST(TestToString, EquivalentParse) { // << " t=" << t << " regexp=" << tests[i].regexp; // Test that if we parse the new regexp we get the same structure. 
- Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status); + Regexp* nre = Regexp::Parse(t, f, &status); ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text(); std::string ss = nre->Dump(); std::string tt = nre->ToString(); diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index 151525f2d..ddf8dbf8f 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -1658,4 +1658,23 @@ TEST(RE2, Issue310) { ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; } +TEST(RE2, Issue477) { + // Regexp::LeadingString didn't output Latin1 into flags. + // In the given pattern, 0xA5 should be factored out, but + // shouldn't lose its Latin1-ness in the process. Because + // that was happening, the prefix for accel was 0xC2 0xA5 + // instead of 0xA5. Note that the former doesn't occur in + // the given input and so replacements weren't occurring. + + const char bytes[] = { + (char)0xa5, (char)0xd1, (char)0xa5, (char)0xd1, + (char)0x61, (char)0x63, (char)0xa5, (char)0x64, + }; + std::string s(bytes, ABSL_ARRAYSIZE(bytes)); + RE2 re("\xa5\xd1|\xa5\x64", RE2::Latin1); + int n = RE2::GlobalReplace(&s, re, ""); + ASSERT_EQ(n, 3); + ASSERT_EQ(s, "\x61\x63"); +} + } // namespace re2 diff --git a/util/pcre.cc b/util/pcre.cc index f54cb28f8..27aee3dc4 100644 --- a/util/pcre.cc +++ b/util/pcre.cc @@ -21,7 +21,7 @@ #include "util/pcre.h" // Silence warnings about the wacky formatting in the operator() functions. -#if !defined(__clang__) && defined(__GNUC__) +#if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wmisleading-indentation" #endif