From 662f467821a92052d2216cd212e487fa5fa76be9 Mon Sep 17 00:00:00 2001 From: SirStendec Date: Wed, 22 Sep 2021 15:13:59 -0400 Subject: [PATCH 1/5] Add initial support for ICU MessageFormat. * Implements a fairly comprehensive checking class. * Adds documentation about the check to admin/checks.rst and user/checks.rst * Includes a new dependency on `pyicumessageformat`. --- docs/admin/checks.rst | 8 + docs/user/checks.rst | 67 +++- requirements.txt | 1 + weblate/checks/flags.py | 4 + weblate/checks/icu.py | 510 ++++++++++++++++++++++++ weblate/checks/models.py | 1 + weblate/checks/tests/test_checks.py | 5 +- weblate/checks/tests/test_icu_checks.py | 396 ++++++++++++++++++ weblate/settings_docker.py | 1 + weblate/settings_example.py | 1 + 10 files changed, 992 insertions(+), 2 deletions(-) create mode 100644 weblate/checks/icu.py create mode 100644 weblate/checks/tests/test_icu_checks.py diff --git a/docs/admin/checks.rst b/docs/admin/checks.rst index a89de11225a5..d8162bf889a6 100644 --- a/docs/admin/checks.rst +++ b/docs/admin/checks.rst @@ -62,6 +62,10 @@ Here is a list of flags currently accepted: Define font-size for rendering checks, see :ref:`fonts`. ``font-spacing:SPACING`` Define letter spacing for rendering checks, see :ref:`fonts`. +``icu-flags:FLAGS`` + Define flags for customizing the behavior of the :ref:`check-icu-message-format` quality check. +``icu-tag-prefix:PREFIX`` + Set a required prefix for XML tags for the :ref:`check-icu-message-format` quality check. ``placeholders:NAME:NAME2:...`` Placeholder strings expected in translation, see :ref:`check-placeholders`. ``replacements:FROM:TO:FROM2:TO2...`` @@ -89,6 +93,8 @@ Here is a list of flags currently accepted: Enable the :ref:`check-es-format` quality check. ``i18next-interpolation`` Enable the :ref:`check-i18next-interpolation` quality check. +``icu-message-format`` + Enable the :ref:`check-icu-message-format` quality check. ``java-format`` Enable the :ref:`check-java-format` quality check. ``java-messageformat`` @@ -145,6 +151,8 @@ Here is a list of flags currently accepted: Skip the :ref:`check-es-format` quality check. ``ignore-i18next-interpolation`` Skip the :ref:`check-i18next-interpolation` quality check. +``ignore-icu-message-format`` + Skip the :ref:`check-icu-message-format` quality check. ``ignore-java-format`` Skip the :ref:`check-java-format` quality check. ``ignore-java-messageformat`` diff --git a/docs/user/checks.rst b/docs/user/checks.rst index eab69d7b0f88..d7b8fc10db27 100644 --- a/docs/user/checks.rst +++ b/docs/user/checks.rst @@ -242,8 +242,73 @@ i18next interpolation :ref:`check-formats`, `i18next interpolation `_ -.. _check-java-format: +.. _check-icu-message-format: + +ICU MessageFormat +***************** + +.. versionadded:: 4.9 + +:Summary: There are inconsistencies or syntax errors within ICU MessageFormat placeholders. +:Scope: translated strings +:Check class: ``weblate.checks.icu.ICUMessageFormatCheck`` +:Flag to enable: ``icu-message-format`` +:Flag to ignore: ``ignore-icu-message-format`` +:Interpolation example: ``There {number, plural, one {is one apple} other {are # apples}}.`` + +This check has support for both pure ICU MessageFormat messages as well as ICU with simple +XML tags. You can configure the behavior of this check by using ``icu-flags:*``, either by +opting into XML support or by disabling certain sub-checks. For example, the following flag +enables XML support while disabling validation of plural sub-messages: + +.. code-block::text + + icu-message-format, icu-flags:xml:-plural_selectors + ++---------------------------+------------------------------------------------------------+ +| ``xml`` | Enable support for simple XML tags. By default, XML tags | +| | are parsed loosely. Stray ``<`` characters are ignored | +| | if they are not reasonably part of a tag. | ++---------------------------+------------------------------------------------------------+ +| ``strict-xml`` | Enable support for strict XML tags. All ``<`` characters | +| | must be escaped if they are not part of a tag. | ++---------------------------+------------------------------------------------------------+ +| ``-highlight`` | Disable highlighting placeholders in the editor. | ++---------------------------+------------------------------------------------------------+ +| ``-require_other`` | Disable requiring sub-messages to have an ``other`` | +| | selector. | ++---------------------------+------------------------------------------------------------+ +| ``-submessage_selectors`` | Skip checking that sub-message selectors match the source. | ++---------------------------+------------------------------------------------------------+ +| ``-types`` | Skip checking that placeholder types match the source. | ++---------------------------+------------------------------------------------------------+ +| ``-extra`` | Skip checking that no placeholders are present that were | +| | not present in the source string. | ++---------------------------+------------------------------------------------------------+ +| ``-missing`` | Skip checking that no placeholders are missing that were | +| | present in the source string. | ++---------------------------+------------------------------------------------------------+ + +Additionally, when ``strict-xml`` is not enabled but ``xml`` is enabled, you can use the +``icu-tag-prefix:PREFIX`` flag to require that all XML tags start with a specific string. +For example, the following flag will only allow XML tags to be matched if they start with +``click here`` but not ``this``. + +.. seealso:: + + :ref:`check-formats`, + `ICU: Formatting Messages `_, + `Format.JS: Message Syntax `_ + + +.. _check-java-format: Java format *********** diff --git a/requirements.txt b/requirements.txt index 20e5745c2c73..d547f95a6fc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,6 +24,7 @@ Pillow>=6.0.0,<8.4.0 pyahocorasick>=1.4,<1.5 pycairo>=1.15.3 pygobject>=3.27.0 +pyicumessageformat>=1.0.0 pyparsing>=2.4.0,<2.5.0 python-dateutil>=2.8.1 python-redis-lock>=3.6.0,<3.8.0 diff --git a/weblate/checks/flags.py b/weblate/checks/flags.py index b993c3e6b445..56e3ba0ad156 100644 --- a/weblate/checks/flags.py +++ b/weblate/checks/flags.py @@ -65,6 +65,10 @@ TYPED_FLAGS_ARGS["font-weight"] = single_value_flag(get_font_weight) TYPED_FLAGS["font-spacing"] = gettext_lazy("Font spacing") TYPED_FLAGS_ARGS["font-spacing"] = single_value_flag(int) +TYPED_FLAGS["icu-flags"] = gettext_lazy("ICU MessageFormat Flags") +TYPED_FLAGS_ARGS["icu-flags"] = multi_value_flag(str) +TYPED_FLAGS["icu-tag-prefix"] = gettext_lazy("ICU MessageFormat Tag Prefix") +TYPED_FLAGS_ARGS["icu-tag-prefix"] = single_value_flag(str) TYPED_FLAGS["priority"] = gettext_lazy("Priority") TYPED_FLAGS_ARGS["priority"] = single_value_flag(int) TYPED_FLAGS["max-length"] = gettext_lazy("Maximum length of translation") diff --git a/weblate/checks/icu.py b/weblate/checks/icu.py new file mode 100644 index 000000000000..2312985597ab --- /dev/null +++ b/weblate/checks/icu.py @@ -0,0 +1,510 @@ +# +# Copyright © 2012 - 2021 Michal Čihař +# +# This file is part of Weblate +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +from collections import defaultdict + +from django.utils.translation import gettext_lazy as _ +from pyicumessageformat import Parser + +from weblate.checks.format import BaseFormatCheck + +# Unique value for checking tags. Since types are +# always strings, this will never be encountered. +TAG_TYPE = -100 + +# These types are to be considered numeric. Numeric placeholders +# can be of any numeric type without triggering a warning from +# the checker. +NUMERIC_TYPES = ["number", "plural", "selectordinal"] + +# These types have their sub-messages checked to ensure that +# sub-message selectors are valid. +PLURAL_TYPES = ["plural", "selectordinal"] + +# ... and these are the valid selectors, along with selectors +# for specific values, formatted such as: =0, =1, etc. +PLURAL_SELECTORS = ["zero", "one", "two", "few", "many", "other"] + + +# We construct two Parser instances, one for tags and one without. +# Both parsers are configured to allow spaces inside formats, to not +# require other (which we can do better ourselves), and to be +# permissive about what types can have sub-messages. +standard_parser = Parser( + { + "include_indices": True, + "loose_submessages": True, + "allow_format_spaces": True, + "require_other": False, + "allow_tags": False, + } +) + +tag_parser = Parser( + { + "include_indices": True, + "loose_submessages": True, + "allow_format_spaces": True, + "require_other": False, + "allow_tags": True, + "strict_tags": False, + "tag_type": TAG_TYPE, + } +) + + +strict_tag_parser = Parser( + { + "include_indices": True, + "loose_submessages": True, + "allow_format_spaces": True, + "require_other": False, + "allow_tags": True, + "strict_tags": True, + "tag_type": TAG_TYPE, + } +) + + +def parse_icu( + source: str, + allow_tags: bool, + strict_tags: bool, + tag_prefix: str = None, + want_tokens=False, +): + """Parse an ICU MessageFormat message.""" + ast = None + err = None + tokens = [] if want_tokens else None + parser = standard_parser + if allow_tags: + parser = strict_tag_parser if strict_tags else tag_parser + + parser.options["tag_prefix"] = tag_prefix + + try: + ast = parser.parse(source, tokens) + except SyntaxError as e: + err = e + + parser.options["tag_prefix"] = None + + return ast, err, tokens + + +def check_bad_plural_selector(selector): + if selector in PLURAL_SELECTORS: + return False + return selector[0] != "=" + + +def update_maybe_value(value, old): + """ + Certain values on placeholders can have four values. + + They will be one of: `None`, `True`, `False`, or `0`. + + `None` represents a value that was never set. + `True` or `False` represents a value that was set. + `0` represents a value that was set with conflicting values. + + This is useful in case there are multiple placeholders with + conflicting type information. + """ + if old is None or old == value: + return value + return 0 + + +def extract_highlights(token, source, out=None): + """Extract all placeholders from an AST that we would want to highlight.""" + if out is None: + out = [] + + if isinstance(token, str): + return out + + if isinstance(token, list): + for tok in token: + extract_highlights(tok, source, out) + + return out + + # Sanity check the token. They should always have + # start and end. + if "start" not in token or "end" not in token: + return out + + start = token["start"] + end = token["end"] + usable = start < len(source) + + if "hash" in token and token["hash"]: + usable = False + + if "options" in token: + usable = False + for subast in token["options"].values(): + extract_highlights(subast, source, out) + + if "contents" in token: + usable = False + extract_highlights(token["contents"], source, out) + + if usable: + out.append((start, end, source[start:end])) + + return out + + +def extract_placeholders(token, variables=None): + """Extract all placeholders from an AST and summarize their types.""" + if variables is None: + variables = {} + + if isinstance(token, str): + # Skip strings. Those aren't interesting. + return variables + + if isinstance(token, list): + # If we have a list, then we have a list of tokens so iterate + # over the entire list. + for tok in token: + extract_placeholders(tok, variables) + + return variables + + if "name" not in token: + # There should always be a name. This is highly suspicious. + # Should this raise an exception? + return variables + + name = token["name"] + ttype = token.get("type") + data = variables.setdefault( + name, + { + "name": name, + "types": set(), + "formats": set(), + "is_number": None, + "is_tag": None, + "is_empty": None, + }, + ) + + if ttype: + is_tag = ttype is TAG_TYPE + data["is_tag"] = update_maybe_value(is_tag, data["is_tag"]) + + if is_tag: + data["is_empty"] = update_maybe_value( + "contents" not in token or not token["contents"], data["is_empty"] + ) + else: + data["types"].add(ttype) + data["is_number"] = update_maybe_value( + ttype in NUMERIC_TYPES, data["is_number"] + ) + if "format" in token: + data["formats"].add(token["format"]) + + if "options" in token: + choices = data.setdefault("choices", set()) + + # We need to do three things with options: + for selector, subast in token["options"].items(): + # First, we log the selector for later comparison. + choices.add(selector) + + # Second, we make sure the selector is valid if we're working + # with a plural/selectordinal type. + if ttype in PLURAL_TYPES: + if check_bad_plural_selector(selector): + data.setdefault("bad_plural", set()).add(selector) + + # Finally, we process the sub-ast for this option. + extract_placeholders(subast, variables) + + # Make sure we process the contents sub-ast if one exists. + if "contents" in token: + extract_placeholders(token["contents"], variables) + + return variables + + +class ICUMessageFormatCheck(BaseFormatCheck): + """Check for ICU MessageFormat string.""" + + check_id = "icu_message_format" + name = _("ICU MessageFormat") + description = _( + "Syntax errors and/or placeholder mismatches in ICU MessageFormat strings." + ) + source = True + + def get_flags(self, unit): + if unit and unit.all_flags.has_value("icu-flags"): + return unit.all_flags.get_value("icu-flags") + return [] + + def get_tag_prefix(self, unit): + if unit and unit.all_flags.has_value("icu-tag-prefix"): + return unit.all_flags.get_value("icu-tag-prefix") + return None + + def check_source_unit(self, source, unit): + """Checker for source strings. Only check for syntax issues.""" + if not source or not source[0]: + return False + + flags = self.get_flags(unit) + strict_tags = "strict-xml" in flags + allow_tags = strict_tags or "xml" in flags + tag_prefix = self.get_tag_prefix(unit) + + _, src_err, _ = parse_icu(source[0], allow_tags, strict_tags, tag_prefix) + if src_err: + return True + return False + + def check_format(self, source, target, ignore_missing, unit): + """Checker for ICU MessageFormat strings.""" + if not target or not source: + return False + + flags = self.get_flags(unit) + strict_tags = "strict-xml" in flags + allow_tags = strict_tags or "xml" in flags + tag_prefix = self.get_tag_prefix(unit) + + result = defaultdict(list) + src_ast, src_err, _ = parse_icu(source, allow_tags, strict_tags, tag_prefix) + + # Check to see if we're running on a source string only. + # If we are, then we can only run a syntax check on the + # source and be done. + if unit and unit.is_source: + if src_err: + result["syntax"].append(src_err) + return result + return False + + tgt_ast, tgt_err, _ = parse_icu(target, allow_tags, strict_tags, tag_prefix) + if tgt_err: + result["syntax"].append(tgt_err) + + if tgt_err: + return result + if src_err: + # We cannot run any further checks if the source + # string isn't valid, so just accept that the target + # string is valid for now. + return False + + # Both strings are valid! Congratulations. Let's extract + # information on all the placeholders in both strings, and + # compare them to see if anything is wrong. + src_vars = extract_placeholders(src_ast) + tgt_vars = extract_placeholders(tgt_ast) + + # First, we check all the variables in the target. + for name, data in tgt_vars.items(): + self.check_for_other(result, name, data, flags) + + if name in src_vars: + src_data = src_vars[name] + + self.check_bad_plural(result, name, data, src_data, flags) + self.check_bad_submessage(result, name, data, src_data, flags) + self.check_wrong_type(result, name, data, src_data, flags) + + if allow_tags: + self.check_tags(result, name, data, src_data, flags) + + else: + self.check_bad_submessage(result, name, data, None, flags) + + # The variable does not exist in the source, + # which suggests a mistake. + if "-extra" not in flags: + result["extra"].append(name) + + # We also want to check for variables used in the + # source but not in the target. + self.check_missing(result, src_vars, tgt_vars, flags) + + if result: + return result + return False + + def check_missing(self, result, src_vars, tgt_vars, flags): + """Detect any variables in the target not in the source.""" + if "-missing" in flags: + return + + for name in src_vars: + if name not in tgt_vars: + result["missing"].append(name) + + def check_for_other(self, result, name, data, flags): + """Ensure that types with sub-messages have other.""" + if "-require_other" in flags: + return + + choices = data.get("choices") + if choices and "other" not in choices: + result["no_other"].append(name) + + def check_bad_plural(self, result, name, data, src_data, flags): + """Forward bad plural selectors detected during extraction.""" + if "-plural_selectors" in flags: + return + + if "bad_plural" in data: + result["bad_plural"].append([name, data["bad_plural"]]) + + def check_bad_submessage(self, result, name, data, src_data, flags): + """Detect any bad sub-message selectors.""" + if "-submessage_selectors" in flags: + return + + bad = set() + + # We also want to check individual select choices. + if src_data and "select" in data["types"] and "select" in src_data["types"]: + if "choices" in data and "choices" in src_data: + choices = data["choices"] + src_choices = src_data["choices"] + + for selector in choices: + if selector not in src_choices: + bad.add(selector) + + if bad: + result["bad_submessage"].append([name, bad]) + + def check_wrong_type(self, result, name, data, src_data, flags): + """Ensure that types match, when possible.""" + if "-types" in flags: + return + + # If we're dealing with a number, we want to use + # special number logic, since numbers work with + # multiple types. + if isinstance(src_data["is_number"], bool) and src_data["is_number"]: + if src_data["is_number"] != data["is_number"]: + result["wrong_type"].append(name) + + else: + for ttype in data["types"]: + if ttype not in src_data["types"]: + result["wrong_type"].append(name) + break + + def check_tags(self, result, name, data, src_data, flags): + """Check for errors with XML tags.""" + if "-tags" in flags: + return + + if isinstance(src_data["is_tag"], bool) or data["is_tag"] is not None: + if src_data["is_tag"]: + if not data["is_tag"]: + result["should_be_tag"].append(name) + + elif ( + isinstance(src_data["is_empty"], bool) + and src_data["is_empty"] != data["is_empty"] + ): + if src_data["is_empty"]: + result["tag_not_empty"].append(name) + else: + result["tag_empty"].append(name) + + elif data["is_tag"]: + result["not_tag"].append(name) + + def format_result(self, result): + if result.get("syntax"): + yield _("Syntax error: %s") % ", ".join(err.msg for err in result["syntax"]) + + if result.get("extra"): + yield _("Unknown placeholder in translation: %s") % ", ".join( + result["extra"] + ) + + if result.get("missing"): + yield _("Placeholder missing in translation: %s") % ", ".join( + result["missing"] + ) + + if result.get("wrong_type"): + yield _("Placeholder has wrong type: %s") % ", ".join(result["wrong_type"]) + + if result.get("no_other"): + yield _("Missing other sub-message for: %s") % ", ".join(result["no_other"]) + + if result.get("bad_plural"): + yield _("Bad plural selectors for: %s") % ", ".join( + f"{x[0]} ({', '.join(x[1])})" for x in result["bad_plural"] + ) + + if result.get("bad_submessage"): + yield _("Bad sub-message selectors for: %s") % ", ".join( + f"{x[0]} ({', '.join(x[1])})" for x in result["bad_submessage"] + ) + + if result.get("should_be_tag"): + yield _("Placeholder should be XML tag in translation: %s") % ", ".join( + result["should_be_tag"] + ) + + if result.get("not_tag"): + yield _("Placeholder should not be XML tag in translation: %s") % ", ".join( + result["not_tag"] + ) + + if result.get("tag_not_empty"): + yield _("XML Tag has unexpected contents in translation: %s") % ", ".join( + result["tag_not_empty"] + ) + + if result.get("tag_empty"): + yield _("XML Tag missing contents in translation: %s") % ", ".join( + result["tag_empty"] + ) + + def check_highlight(self, source, unit): + if self.should_skip(unit): + return [] + + flags = self.get_flags(unit) + if "-highlight" in flags: + return [] + + strict_tags = "strict-xml" in flags + allow_tags = strict_tags or "xml" in flags + tag_prefix = self.get_tag_prefix(unit) + + ast, _, _ = parse_icu(source, allow_tags, strict_tags, tag_prefix, True) + if not ast: + return [] + + return extract_highlights(ast, source) diff --git a/weblate/checks/models.py b/weblate/checks/models.py index 496936e1711a..89d99dafd8aa 100644 --- a/weblate/checks/models.py +++ b/weblate/checks/models.py @@ -76,6 +76,7 @@ class WeblateChecksConf(AppConf): "weblate.checks.format.I18NextInterpolationCheck", "weblate.checks.format.ESTemplateLiteralsCheck", "weblate.checks.angularjs.AngularJSInterpolationCheck", + "weblate.checks.icu.ICUMessageFormatCheck", "weblate.checks.qt.QtFormatCheck", "weblate.checks.qt.QtPluralCheck", "weblate.checks.ruby.RubyFormatCheck", diff --git a/weblate/checks/tests/test_checks.py b/weblate/checks/tests/test_checks.py index ec7c2bf888fd..ccd204e9a1c6 100644 --- a/weblate/checks/tests/test_checks.py +++ b/weblate/checks/tests/test_checks.py @@ -75,7 +75,9 @@ def log_debug(text, *args): class MockUnit: """Mock unit object.""" - def __init__(self, id_hash=None, flags="", code="cs", source="", note=""): + def __init__( + self, id_hash=None, flags="", code="cs", source="", note="", is_source=None + ): if id_hash is None: id_hash = random.randint(0, 65536) self.id_hash = id_hash @@ -89,6 +91,7 @@ def __init__(self, id_hash=None, flags="", code="cs", source="", note=""): self.note = note self.check_cache = {} self.machinery = {"best": -1} + self.is_source = is_source @property def all_flags(self): diff --git a/weblate/checks/tests/test_icu_checks.py b/weblate/checks/tests/test_icu_checks.py new file mode 100644 index 000000000000..7c84cc26b5ce --- /dev/null +++ b/weblate/checks/tests/test_icu_checks.py @@ -0,0 +1,396 @@ +# +# Copyright © 2012 - 2021 Michal Čihař +# +# This file is part of Weblate +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +"""Tests for ICU MessageFormat checks.""" + +from weblate.checks.icu import ICUMessageFormatCheck +from weblate.checks.tests.test_checks import CheckTestCase, MockUnit + + +class ICUMessageFormatCheckTest(CheckTestCase): + check = ICUMessageFormatCheck() + + id_hash = "icu_message_format" + flag = "icu-message-format" + flags = None + + def get_mock(self, source=None, flags=None): + if not flags and self.flags: + flags = self.flags + elif flags and self.flags: + flags = f"{self.flags}:{flags}" + + if flags: + flags = f"{self.flag}, icu-flags:{flags}" + else: + flags = self.flag + + return MockUnit( + self.id_hash, flags=flags, source=source, is_source=source is not None + ) + + def test_plain(self): + self.assertFalse( + self.check.check_format("string", "string", False, self.get_mock()) + ) + + def test_plain_source(self): + self.assertFalse( + self.check.check_format("string", "string", False, self.get_mock("string")) + ) + + def test_malformed(self): + result = self.check.check_format( + "Hello, {name}!", "Hello, {name!", False, self.get_mock() + ) + + self.assertTrue(result) + self.assertTrue("syntax" in result) + syntax = result["syntax"] + self.assertTrue(isinstance(syntax, list) and len(syntax) == 1) + self.assertTrue("Expected , or }" in syntax[0].msg) + + def test_malformed_source(self): + # When dealing with a translation and not the source, + # any source issue is silently discarded. + self.assertFalse( + self.check.check_format( + "Hello, {name!", "Hello, {name}!", False, self.get_mock() + ) + ) + + # However, if the unit is_source, we return syntax errors. + result = self.check.check_format( + "Hello, {name!", "Hello, {name}!", False, self.get_mock("Hello, {name!") + ) + + self.assertTrue(result) + self.assertTrue("syntax" in result) + syntax = result["syntax"] + self.assertTrue(isinstance(syntax, list) and len(syntax) == 1) + self.assertTrue("Expected , or }" in syntax[0].msg) + + def test_source(self): + self.assertFalse(self.check.check_source_unit([""], self.get_mock())) + self.assertFalse( + self.check.check_source_unit(["Hello, {name}!"], self.get_mock()) + ) + + def test_bad_source(self): + self.assertTrue( + self.check.check_source_unit(["Hello, {name!"], self.get_mock()) + ) + + def test_no_formats(self): + self.assertFalse( + self.check.check_format( + "Hello, {name}!", "Hallo, {name}!", False, self.get_mock() + ) + ) + + def test_whitespace(self): + self.assertFalse( + self.check.check_format( + "Hello, { \t name\n \n}!", "Hallo, {name}!", False, self.get_mock() + ) + ) + + def test_missing_placeholder(self): + result = self.check.check_format( + "Hello, {name}!", "Hallo, Fred!", False, self.get_mock() + ) + + self.assertDictEqual(result, {"missing": ["name"]}) + + def test_extra_placeholder(self): + result = self.check.check_format( + "Hello, {firstName}!", + "Hallo, {firstName} {lastName}!", + False, + self.get_mock(), + ) + + self.assertDictEqual(result, {"extra": ["lastName"]}) + + def test_types(self): + self.assertFalse( + self.check.check_format( + "Cost: {value, number, ::currency/USD}", + "Kosten: {value, number, ::currency/USD}", + False, + self.get_mock(), + ) + ) + + def test_wrong_types(self): + result = self.check.check_format( + "Cost: {value, number, ::currency/USD}", + "Kosten: {value}", + False, + self.get_mock(), + ) + + self.assertDictEqual(result, {"wrong_type": ["value"]}) + + def test_flag_wrong_types(self): + self.assertFalse( + self.check.check_format( + "{value, number}", "{value}", False, self.get_mock(None, "-types") + ) + ) + + def test_more_wrong_types(self): + result = self.check.check_format( + "Cost: {value, foo}", "Kosten: {value, bar}", False, self.get_mock() + ) + + self.assertDictEqual(result, {"wrong_type": ["value"]}) + + def test_plural_types(self): + self.assertFalse( + self.check.check_format( + "You have {count, plural, one {# message} other {# messages}}. " + "Yes. {count, number}.", + "Sie haben {count, plural, one {# Nachricht} other " + "{# Nachrichten}}. Ja. {count, number}.", + False, + self.get_mock(), + ) + ) + + def test_no_other(self): + result = self.check.check_format( + "{count, number}", "{count, plural, one {typo}}", False, self.get_mock() + ) + + self.assertDictEqual(result, {"no_other": ["count"]}) + + def test_flag_no_other(self): + inp = "{count, plural, one {#}}" + self.assertFalse( + self.check.check_format( + inp, inp, False, self.get_mock(inp, "-require_other") + ) + ) + + def test_random_submessage(self): + inp = "{count, test, one {yes} other {no}}" + self.assertFalse(self.check.check_format(inp, inp, False, self.get_mock())) + + def test_bad_select(self): + result = self.check.check_format( + "{pronoun,select,hehim{}sheher{}other{}}", + "{pronoun,select,he{}sheeher{}other{}}", + False, + self.get_mock(), + ) + + self.assertDictEqual( + result, {"bad_submessage": [["pronoun", {"he", "sheeher"}]]} + ) + + def test_flag_bad_select(self): + # This also checks multiple flags. + self.assertFalse( + self.check.check_format( + "{n,select,one{}two{}}", + "{n,select,three{}four{}}", + False, + self.get_mock(None, "-require_other:-submessage_selectors"), + ) + ) + + def test_bad_plural(self): + result = self.check.check_format( + "{count, number}", + "{count, plural, bad {typo} other {okay}}", + False, + self.get_mock(), + ) + + self.assertDictEqual(result, {"bad_plural": [["count", {"bad"}]]}) + + def test_flag_bad_plural(self): + self.assertFalse( + self.check.check_format( + "{n,number}", + "{n,plural,bad{}other{}}", + False, + self.get_mock(None, "-plural_selectors"), + ) + ) + + def test_good_plural(self): + self.assertFalse( + self.check.check_format( + "{count, number}", + "{count, plural, zero{#} one{#} two{#} few{#} many{#} " + "other{#} =0{#} =-12{#} =391.5{#}}", + False, + self.get_mock(), + ) + ) + + def test_check_highlight(self): + highlights = self.check.check_highlight( + "Hello, {na<>me} . You have {count, plural, one " + "{# message} other {# messages}}.", + self.get_mock(), + ) + + self.assertListEqual( + highlights, + [ + (14, 22, "{na<>me}"), + ], + ) + + def test_check_error_highlight(self): + highlights = self.check.check_highlight( + "Hello, {name}! You have {count,number", self.get_mock() + ) + + self.assertListEqual(highlights, []) + + def test_check_flag_highlight(self): + highlights = self.check.check_highlight( + "Hello, {name}! You have {count,number", self.get_mock(None, "-highlight") + ) + + self.assertListEqual(highlights, []) + + def test_check_no_highlight(self): + highlights = self.check.check_highlight( + "Hello, {name}!", MockUnit("java_format", flags="java-format") + ) + + self.assertListEqual(highlights, []) + + +# This is a sub-class of our existing test set because this format is an extension +# of the other format and it should handle all existing syntax properly. +class ICUXMLFormatCheckTest(ICUMessageFormatCheckTest): + + flags = "xml" + + def test_tags(self): + self.assertFalse( + self.check.check_format( + "Hello ! Click here!", + "Hallo ! Klicke hier!", + False, + None, + ) + ) + + def test_empty_tags(self): + self.assertFalse( + self.check.check_format( + "", "", False, self.get_mock() + ) + ) + + def test_incorrectly_full_tags(self): + result = self.check.check_format( + "tag", + "tag", + False, + self.get_mock(), + ) + + self.assertDictEqual( + result, {"tag_not_empty": ["empty"], "tag_empty": ["full"]} + ) + + def test_tag_vs_placeholder(self): + result = self.check.check_format( + "Hello, {firstName}.", + "Hello {bold} .", + False, + self.get_mock(), + ) + + self.assertDictEqual( + result, + { + "should_be_tag": ["bold"], + "not_tag": ["firstName"], + }, + ) + + def test_flag_tags(self): + self.assertFalse( + self.check.check_format( + "Hello, {firstName}.", + "Hello {bold} .", + False, + self.get_mock(None, "-tags"), + ) + ) + + def test_check_highlight(self): + highlights = self.check.check_highlight( + "Hello, {na<>me} . You have {count, plural, " + "one {# message} other {# messages}}.", + self.get_mock(), + ) + + self.assertListEqual( + highlights, + [ + (14, 22, "{na<>me}"), + ], + ) + + def test_not_a_tag(self): + self.assertFalse( + self.check.check_format( + "I <3 Software", "I <3 Open Source", False, self.get_mock() + ) + ) + + def test_tag_prefix(self): + self.assertFalse( + self.check.check_format( + "test", + "test", + False, + self.get_mock(None, 'xml, icu-tag-prefix:"x:"'), + ) + ) + + +class ICUXMLStrictFormatCheckTest(ICUXMLFormatCheckTest): + + flags = "strict-xml" + + def test_tag_prefix(self): + # Tag Prefix is ignored with strict tags. + pass + + def test_not_a_tag(self): + result = self.check.check_format( + "I <3 Software", "I <3 Open Source", False, self.get_mock() + ) + + self.assertTrue(result) + self.assertTrue("syntax" in result) + syntax = result["syntax"] + self.assertTrue(isinstance(syntax, list) and len(syntax) == 1) + self.assertTrue("Expected > or />" in syntax[0].msg) diff --git a/weblate/settings_docker.py b/weblate/settings_docker.py index 4942f93bea91..cfbfd200057b 100644 --- a/weblate/settings_docker.py +++ b/weblate/settings_docker.py @@ -979,6 +979,7 @@ "weblate.checks.format.I18NextInterpolationCheck", "weblate.checks.format.ESTemplateLiteralsCheck", "weblate.checks.angularjs.AngularJSInterpolationCheck", + "weblate.checks.icu.ICUMessageFormatCheck", "weblate.checks.qt.QtFormatCheck", "weblate.checks.qt.QtPluralCheck", "weblate.checks.ruby.RubyFormatCheck", diff --git a/weblate/settings_example.py b/weblate/settings_example.py index 0b06e418441d..54dcfc250d62 100644 --- a/weblate/settings_example.py +++ b/weblate/settings_example.py @@ -722,6 +722,7 @@ # "weblate.checks.format.I18NextInterpolationCheck", # "weblate.checks.format.ESTemplateLiteralsCheck", # "weblate.checks.angularjs.AngularJSInterpolationCheck", +# "weblate.checks.icu.ICUMessageFormatCheck", # "weblate.checks.qt.QtFormatCheck", # "weblate.checks.qt.QtPluralCheck", # "weblate.checks.ruby.RubyFormatCheck", From 477c7deddadba033a69b0bda19b024daae4bb503 Mon Sep 17 00:00:00 2001 From: SirStendec Date: Fri, 24 Sep 2021 11:58:07 -0400 Subject: [PATCH 2/5] Add PrismJS syntax highlighting for ICU MessageFormat messages. --- scripts/yarn-update | 1 + .../vendor/prism/prism-icu-message-format.js | 148 ++++++++++++++++++ weblate/templates/base.html | 1 + weblate/trans/models/unit.py | 2 + 4 files changed, 152 insertions(+) create mode 100644 weblate/static/vendor/prism/prism-icu-message-format.js diff --git a/scripts/yarn-update b/scripts/yarn-update index 1e9569b0d2be..cebbf1ae8bde 100755 --- a/scripts/yarn-update +++ b/scripts/yarn-update @@ -35,6 +35,7 @@ cp node_modules/prismjs/components/prism-core.js ../../weblate/static/vendor/pri cp node_modules/prismjs/components/prism-markup.js ../../weblate/static/vendor/prism/ cp node_modules/prismjs/components/prism-rest.js ../../weblate/static/vendor/prism/ cp node_modules/prismjs/components/prism-markdown.js ../../weblate/static/vendor/prism/ +cp node_modules/prismjs/components/prism-icu-message-format.js ../../weblate/static/vendor/prism/ # Clipboard cp node_modules/clipboard/dist/clipboard.js ../../weblate/static/vendor/ diff --git a/weblate/static/vendor/prism/prism-icu-message-format.js b/weblate/static/vendor/prism/prism-icu-message-format.js new file mode 100644 index 000000000000..7b9de169eb95 --- /dev/null +++ b/weblate/static/vendor/prism/prism-icu-message-format.js @@ -0,0 +1,148 @@ +// https://unicode-org.github.io/icu/userguide/format_parse/messages/ +// https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/text/MessageFormat.html + +(function (Prism) { + + /** + * @param {string} source + * @param {number} level + * @returns {string} + */ + function nested(source, level) { + if (level <= 0) { + return /[]/.source; + } else { + return source.replace(//g, function () { return nested(source, level - 1); }); + } + } + + var stringPattern = /'[{}:=,](?:[^']|'')*'(?!')/; + + var escape = { + pattern: /''/, + greedy: true, + alias: 'operator' + }; + var string = { + pattern: stringPattern, + greedy: true, + inside: { + 'escape': escape + } + }; + + var argumentSource = nested( + /\{(?:[^{}']|'(?![{},'])|''||)*\}/.source + .replace(//g, function () { return stringPattern.source; }), + 8 + ); + + var nestedMessage = { + pattern: RegExp(argumentSource), + inside: { + 'message': { + pattern: /^(\{)[\s\S]+(?=\}$)/, + lookbehind: true, + inside: null // see below + }, + 'message-delimiter': { + pattern: /./, + alias: 'punctuation' + } + } + }; + + Prism.languages['icu-message-format'] = { + 'argument': { + pattern: RegExp(argumentSource), + greedy: true, + inside: { + 'content': { + pattern: /^(\{)[\s\S]+(?=\}$)/, + lookbehind: true, + inside: { + 'argument-name': { + pattern: /^(\s*)[^{}:=,\s]+/, + lookbehind: true + }, + 'choice-style': { + // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1ChoiceFormat.html#details + pattern: /^(\s*,\s*choice\s*,\s*)\S(?:[\s\S]*\S)?/, + lookbehind: true, + inside: { + 'punctuation': /\|/, + 'range': { + pattern: /^(\s*)[+-]?(?:\d+(?:\.\d*)?|\u221e)\s*[<#\u2264]/, + lookbehind: true, + inside: { + 'operator': /[<#\u2264]/, + 'number': /\S+/ + } + }, + rest: null // see below + } + }, + 'plural-style': { + // https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/text/PluralFormat.html#:~:text=Patterns%20and%20Their%20Interpretation + pattern: /^(\s*,\s*(?:plural|selectordinal)\s*,\s*)\S(?:[\s\S]*\S)?/, + lookbehind: true, + inside: { + 'offset': /^offset:\s*\d+/, + 'nested-message': nestedMessage, + 'selector': { + pattern: /=\d+|[^{}:=,\s]+/, + inside: { + 'keyword': /^(?:zero|one|two|few|many|other)$/ + } + } + } + }, + 'select-style': { + // https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/text/SelectFormat.html#:~:text=Patterns%20and%20Their%20Interpretation + pattern: /^(\s*,\s*select\s*,\s*)\S(?:[\s\S]*\S)?/, + lookbehind: true, + inside: { + 'nested-message': nestedMessage, + 'selector': { + pattern: /[^{}:=,\s]+/, + inside: { + 'keyword': /^other$/ + } + } + } + }, + 'keyword': /\b(?:choice|plural|select|selectordinal)\b/, + 'arg-type': { + pattern: /\b(?:number|date|time|spellout|ordinal|duration)\b/, + alias: 'keyword' + }, + 'arg-skeleton': { + pattern: /(,\s*)::[^{}:=,\s]+/, + lookbehind: true + }, + 'arg-style': { + pattern: /(,\s*)(?:short|medium|long|full|integer|currency|percent)(?=\s*$)/, + lookbehind: true + }, + 'arg-style-text': { + pattern: RegExp(/(^\s*,\s*(?=\S))/.source + nested(/(?:[^{}']|'[^']*'|\{(?:)?\})+/.source, 8) + '$'), + lookbehind: true, + alias: 'string' + }, + 'punctuation': /,/ + } + }, + 'argument-delimiter': { + pattern: /./, + alias: 'operator' + } + } + }, + 'escape': escape, + 'string': string + }; + + nestedMessage.inside.message.inside = Prism.languages['icu-message-format']; + Prism.languages['icu-message-format'].argument.inside.content.inside['choice-style'].inside.rest = Prism.languages['icu-message-format']; + +}(Prism)); diff --git a/weblate/templates/base.html b/weblate/templates/base.html index 5bd1b0f761a6..e3547147d0b6 100644 --- a/weblate/templates/base.html +++ b/weblate/templates/base.html @@ -70,6 +70,7 @@ + {% endcompress %} diff --git a/weblate/trans/models/unit.py b/weblate/trans/models/unit.py index 074171c760a8..89dfd093cf26 100644 --- a/weblate/trans/models/unit.py +++ b/weblate/trans/models/unit.py @@ -1314,6 +1314,8 @@ def all_flags(self): def edit_mode(self): """Returns syntax higlighting mode for Prismjs.""" flags = self.all_flags + if "icu-message-format" in flags: + return "icu-message-format" if "rst-text" in flags: return "rest" if "md-text" in flags: From be7b552828c451b71a7eef3d8104b562ad201dda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= Date: Wed, 27 Oct 2021 09:47:33 +0200 Subject: [PATCH 3/5] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d547f95a6fc2..a0d842ecc2a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,7 +24,7 @@ Pillow>=6.0.0,<8.4.0 pyahocorasick>=1.4,<1.5 pycairo>=1.15.3 pygobject>=3.27.0 -pyicumessageformat>=1.0.0 +pyicumessageformat>=1.0.0,<1.1.0 pyparsing>=2.4.0,<2.5.0 python-dateutil>=2.8.1 python-redis-lock>=3.6.0,<3.8.0 From 1a56def17faf5594359b90a251da156c23b90cf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= Date: Wed, 27 Oct 2021 10:01:11 +0200 Subject: [PATCH 4/5] Apply suggestions from code review --- weblate/checks/icu.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/weblate/checks/icu.py b/weblate/checks/icu.py index 2312985597ab..97685fad2994 100644 --- a/weblate/checks/icu.py +++ b/weblate/checks/icu.py @@ -236,9 +236,8 @@ def extract_placeholders(token, variables=None): # Second, we make sure the selector is valid if we're working # with a plural/selectordinal type. - if ttype in PLURAL_TYPES: - if check_bad_plural_selector(selector): - data.setdefault("bad_plural", set()).add(selector) + if ttype in PLURAL_TYPES and check_bad_plural_selector(selector): + data.setdefault("bad_plural", set()).add(selector) # Finally, we process the sub-ast for this option. extract_placeholders(subast, variables) @@ -389,14 +388,13 @@ def check_bad_submessage(self, result, name, data, src_data, flags): bad = set() # We also want to check individual select choices. - if src_data and "select" in data["types"] and "select" in src_data["types"]: - if "choices" in data and "choices" in src_data: - choices = data["choices"] - src_choices = src_data["choices"] - - for selector in choices: - if selector not in src_choices: - bad.add(selector) + if src_data and "select" in data["types"] and "select" in src_data["types"] and "choices" in data and "choices" in src_data: + choices = data["choices"] + src_choices = src_data["choices"] + + for selector in choices: + if selector not in src_choices: + bad.add(selector) if bad: result["bad_submessage"].append([name, bad]) From 0a002749cba9ee0f905dec5794cc9fe024038b91 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Oct 2021 08:02:33 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- weblate/checks/icu.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/weblate/checks/icu.py b/weblate/checks/icu.py index 97685fad2994..a9e87186c836 100644 --- a/weblate/checks/icu.py +++ b/weblate/checks/icu.py @@ -388,7 +388,13 @@ def check_bad_submessage(self, result, name, data, src_data, flags): bad = set() # We also want to check individual select choices. - if src_data and "select" in data["types"] and "select" in src_data["types"] and "choices" in data and "choices" in src_data: + if ( + src_data + and "select" in data["types"] + and "select" in src_data["types"] + and "choices" in data + and "choices" in src_data + ): choices = data["choices"] src_choices = src_data["choices"]