diff --git a/encoding/htmlindex/gen.go b/encoding/htmlindex/gen.go index 80a52f0d9..ac6b4a77f 100644 --- a/encoding/htmlindex/gen.go +++ b/encoding/htmlindex/gen.go @@ -133,7 +133,10 @@ var consts = map[string]string{ // locales is taken from // https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm. var locales = []struct{ tag, name string }{ - {"und", "windows-1252"}, // The default value. + // The default value. Explicitly state latin to benefit from the exact + // script option, while still making 1252 the default encoding for languages + // written in Latin script. + {"und_Latn", "windows-1252"}, {"ar", "windows-1256"}, {"ba", "windows-1251"}, {"be", "windows-1251"}, diff --git a/encoding/htmlindex/tables.go b/encoding/htmlindex/tables.go index cbf4ba923..9d6b4315c 100644 --- a/encoding/htmlindex/tables.go +++ b/encoding/htmlindex/tables.go @@ -313,7 +313,7 @@ var nameMap = map[string]htmlEncoding{ } var localeMap = []htmlEncoding{ - windows1252, // und + windows1252, // und_Latn windows1256, // ar windows1251, // ba windows1251, // be @@ -349,4 +349,4 @@ var localeMap = []htmlEncoding{ big5, // zh-hant } -const locales = "und ar ba be bg cs el et fa he hr hu ja kk ko ku ky lt lv mk pl ru sah sk sl sr tg th tr tt uk vi zh-hans zh-hant" +const locales = "und_Latn ar ba be bg cs el et fa he hr hu ja kk ko ku ky lt lv mk pl ru sah sk sl sr tg th tr tt uk vi zh-hans zh-hant" diff --git a/language/data_test.go b/language/data_test.go index 738df4678..a1203f2f5 100644 --- a/language/data_test.go +++ b/language/data_test.go @@ -40,7 +40,9 @@ var matchTests = []matchTest{ {"en", "sh"}, {"en", "hr"}, {"en", "bs"}, - {"en", "nl-Cyrl"}, + // TODO: consider if the following match is a good one. + // Due to new script first rule, which maybe should be an option. 
+ {"sr", "nl-Cyrl"}, }, }, { @@ -228,6 +230,14 @@ var matchTests = []matchTest{ {"nl-NL", "nl-NL"}, }, }, + { + "region may replace matched if matched is enclosing", + "es-419,es", + []struct{ match, desired string }{ + {"es-MX", "es-MX"}, + {"es", "es-SG"}, + }, + }, { "more specific region wins over more specific script", "nl, nl-Latn, nl-NL, nl-BE", @@ -265,7 +275,7 @@ var matchTests = []matchTest{ "en, en-GB, es-ES, es-419", []struct{ match, desired string }{ {"en-GB", "en-AU"}, - {"es-419", "es-MX"}, + {"es-MX", "es-MX"}, {"es-ES", "es-PT"}, }, }, @@ -355,6 +365,7 @@ var matchTests = []matchTest{ "en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK", []struct{ match, desired string }{ {"en-GB", "en-150"}, + // {"en-GB", "en-001"}, // TODO: currently en, should probably be en-GB {"en-GB", "en-AU"}, {"en-GB", "en-BE"}, {"en-GB", "en-GG"}, @@ -370,26 +381,27 @@ var matchTests = []matchTest{ {"en-GB", "en-SG"}, {"en-GB", "en-DE"}, {"en-GB", "en-MT"}, - {"es-419", "es-AR"}, - {"es-419", "es-BO"}, - {"es-419", "es-CL"}, - {"es-419", "es-CO"}, - {"es-419", "es-CR"}, - {"es-419", "es-CU"}, - {"es-419", "es-DO"}, - {"es-419", "es-EC"}, - {"es-419", "es-GT"}, - {"es-419", "es-HN"}, - {"es-419", "es-MX"}, - {"es-419", "es-NI"}, - {"es-419", "es-PA"}, - {"es-419", "es-PE"}, - {"es-419", "es-PR"}, - {"es-419", "es-PY"}, - {"es-419", "es-SV"}, - {"es-419", "es-US"}, - {"es-419", "es-UY"}, - {"es-419", "es-VE"}, + {"es-AR", "es-AR"}, + {"es-BO", "es-BO"}, + {"es-CL", "es-CL"}, + {"es-CO", "es-CO"}, + {"es-CR", "es-CR"}, + {"es-CU", "es-CU"}, + {"es-DO", "es-DO"}, + {"es-EC", "es-EC"}, + {"es-GT", "es-GT"}, + {"es-HN", "es-HN"}, + {"es-MX", "es-MX"}, + {"es-NI", "es-NI"}, + {"es-PA", "es-PA"}, + {"es-PE", "es-PE"}, + {"es-PR", "es-PR"}, + {"es", "es-PT"}, + {"es-PY", "es-PY"}, + {"es-SV", "es-SV"}, + {"es-419", "es-US"}, // US is not in Latin America, so don't make more specific. 
+ {"es-UY", "es-UY"}, + {"es-VE", "es-VE"}, {"pt-PT", "pt-AO"}, {"pt-PT", "pt-CV"}, {"pt-PT", "pt-GW"}, @@ -397,9 +409,6 @@ var matchTests = []matchTest{ {"pt-PT", "pt-MZ"}, {"pt-PT", "pt-ST"}, {"pt-PT", "pt-TL"}, - // TODO for CLDR 24+ - // - en-001 - // - {"zh-Hant-HK", "zh-Hant-MO"}, }, }, // Options and variants are inherited from user-defined settings. diff --git a/language/language.go b/language/language.go index a2d037836..f1012c952 100644 --- a/language/language.go +++ b/language/language.go @@ -129,8 +129,15 @@ const ( // specific language or locale. All language tag values are guaranteed to be // well-formed. type Tag struct { - lang langID - region regionID + lang langID + region regionID + // TODO: we will soon run out of positions for script. Idea: instead of + // storing lang, region, and script codes, store only the compact index and + // have a lookup table from this code to its expansion. This greatly speeds + // up table lookup, speed up common variant cases. + // This will also immediately free up 3 extra bytes. Also, the pVariant + // field can now be moved to the lookup table, as the compact index uniquely + // determines the offset of a possible variant. script scriptID pVariant byte // offset in str, includes preceding '-' pExt uint16 // offset of first extension, includes preceding '-' diff --git a/language/match.go b/language/match.go index bb4fff24d..71f1258dc 100644 --- a/language/match.go +++ b/language/match.go @@ -42,17 +42,36 @@ func NewMatcher(t []Tag) Matcher { func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) { match, w, c := m.getBest(want...) - if match == nil { - t = m.default_.tag - } else { + if match != nil { t, index = match.tag, match.index + } else { + // TODO: this should be an option + t = m.default_.tag + outer: + for _, w := range want { + script, _ := w.Script() + if script.scriptID == 0 { + // Don't do anything if there is no script, such as with + // private subtags. 
+ continue + } + for i, h := range m.supported { + if script.scriptID == h.maxScript { + t, index = h.tag, i + break outer + } + } + } + // TODO: select first language tag based on script. + } + if w.region != 0 && t.region != 0 && t.region.contains(w.region) { + t, _ = Raw.Compose(t, Region{w.region}) } // Copy options from the user-provided tag into the result tag. This is hard // to do after the fact, so we do it here. - // TODO: consider also adding in variants that are compatible with the - // matched language. - // TODO: Add back region if it is non-ambiguous? Or create another tag to - // preserve the region? + // TODO: add in alternative variants to -u-va-. + // TODO: add preferred region to -u-rg-. + // TODO: add other extensions. Merge with existing extensions. if u, ok := w.Extension('u'); ok { t, _ = Raw.Compose(t, u) } @@ -389,6 +408,7 @@ func minimizeTags(t Tag) (Tag, error) { // matcher keeps a set of supported language tags, indexed by language. type matcher struct { default_ *haveTag + supported []*haveTag index map[langID]*matchHeader passSettings bool } @@ -514,6 +534,7 @@ func newMatcher(supported []Tag) *matcher { for i, tag := range supported { pair, _ := makeHaveTag(tag, i) m.header(tag.lang).addIfNew(pair, true) + m.supported = append(m.supported, &pair) } m.default_ = m.header(supported[0].lang).exact[0] for i, tag := range supported { @@ -523,6 +544,9 @@ func newMatcher(supported []Tag) *matcher { } } + // TODO: include alt script. + // - don't replace regions, but allow regions to be made more specific. + // update is used to add indexes in the map for equivalent languages. // If force is true, the update will also apply to derived entries. To // avoid applying a "transitive closure", use false. @@ -648,11 +672,12 @@ type bestMatch struct { want Tag conf Confidence // Cached results from applying tie-breaking rules. 
- origLang bool - origReg bool - regDist uint8 - origScript bool - parentDist uint8 // 255 if have is not an ancestor of want tag. + origLang bool + origReg bool + regGroupDist uint8 + regDist uint8 + origScript bool + parentDist uint8 // 255 if have is not an ancestor of want tag. } // update updates the existing best match if the new pair is considered to be a @@ -706,6 +731,14 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion beaten = true } + regGroupDist := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.lang) + if !beaten && m.regGroupDist != regGroupDist { + if regGroupDist > m.regGroupDist { + return + } + beaten = true + } + // We prefer if the pre-maximized region was specified and identical. origReg := have.tag.region == tag.region && tag.region != 0 if !beaten && m.origReg != origReg { @@ -715,8 +748,22 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion beaten = true } - // Next we prefer smaller distances between regions, as defined by regionDist. - regDist := regionDist(have.maxRegion, maxRegion, tag.lang) + // TODO: remove the region distance rule. Region distance has been replaced + // by the region grouping rule. For now we leave it as it still seems to + // have a net positive effect when applied after the grouping rule. + // Possible solutions: + // - apply the primary locale rule first to effectively disable region + // region distance if groups are defined. + // - express the following errors in terms of grouping (if possible) + // - find another method of handling the following cases. + // maximization of legacy: find mo in + // "sr-Cyrl, sr-Latn, ro, ro-MD": have ro; want ro-MD (High) + // region distance French: find fr-US in + // "en, fr, fr-CA, fr-CH": have fr; want fr-CA (High) + + // Next we prefer smaller distances between regions, as defined by + // regionDist. 
+ regDist := uint8(regionDistance(have.maxRegion, maxRegion)) if !beaten && m.regDist != regDist { if regDist > m.regDist { return @@ -734,6 +781,9 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion } // Finally we prefer tags which have a closer parent relationship. + // TODO: the parent relationship no longer seems necessary. It doesn't hurt + // to leave it in as the final tie-breaker, though, especially until the + // grouping data has further matured. parentDist := parentDistance(have.tag.region, tag) if !beaten && m.parentDist != parentDist { if parentDist > m.parentDist { @@ -750,6 +800,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion m.origLang = origLang m.origReg = origReg m.origScript = origScript + m.regGroupDist = regGroupDist m.regDist = regDist m.parentDist = parentDist } @@ -772,15 +823,27 @@ func parentDistance(haveRegion regionID, tag Tag) uint8 { return d } -// regionDist wraps regionDistance with some exceptions to the algorithmic distance. -func regionDist(a, b regionID, lang langID) uint8 { - if lang == _en { - // Two variants of non-US English are close to each other, regardless of distance. - if a != _US && b != _US { - return 2 +// regionGroupDist computes the distance between two regions based on their +// CLDR grouping. +func regionGroupDist(a, b regionID, script scriptID, lang langID) uint8 { + aGroup := uint(regionToGroups[a]) << 1 + bGroup := uint(regionToGroups[b]) << 1 + for _, ri := range matchRegion { + if langID(ri.lang) == lang && (ri.script == 0 || scriptID(ri.script) == script) { + group := uint(1 << (ri.group &^ 0x80)) + if 0x80&ri.group == 0 { + if aGroup&bGroup&group != 0 { // Both regions are in the group. + return ri.distance + } + } else { + if (aGroup|bGroup)&group == 0 { // Both regions are not in the group. 
+ return ri.distance + } + } } } - return uint8(regionDistance(a, b)) + const defaultDistance = 4 + return defaultDistance } // regionDistance computes the distance between two regions based on the diff --git a/language/match_test.go b/language/match_test.go index 26cc2af2b..cbb847cdd 100644 --- a/language/match_test.go +++ b/language/match_test.go @@ -8,14 +8,109 @@ import ( "bytes" "flag" "fmt" + "os" + "path" "strings" "testing" "golang.org/x/text/internal/testtext" + "golang.org/x/text/internal/ucd" ) var verbose = flag.Bool("verbose", false, "set to true to print the internal tables of matchers") +func TestCLDRCompliance(t *testing.T) { + r, err := os.Open("testdata/localeMatcherTest.txt") + if err != nil { + t.Fatal(err) + } + ucd.Parse(r, func(p *ucd.Parser) { + name := strings.Replace(path.Join(p.String(0), p.String(1)), " ", "", -1) + if skip[name] { + return + } + t.Run(name, func(t *testing.T) { + supported := makeTagList(p.String(0)) + desired := makeTagList(p.String(1)) + gotCombined, index, _ := NewMatcher(supported).Match(desired...) + + gotMatch := supported[index] + wantMatch := Make(p.String(2)) + if gotMatch != wantMatch { + t.Fatalf("match: got %q; want %q", gotMatch, wantMatch) + } + wantCombined, err := Parse(p.String(3)) + if err == nil && gotCombined != wantCombined { + t.Errorf("combined: got %q; want %q", gotCombined, wantCombined) + } + }) + }) +} + +var skip = map[string]bool{ + // TODO: bugs + // und- is not expanded to the appropriate language. + "en-Hant-TW,und-TW/zh-Hant": true, // match: got "en-Hant-TW"; want "und-TW" + "en-Hant-TW,und-TW/zh": true, // match: got "en-Hant-TW"; want "und-TW" + // Honor the wildcard match. This may only be useful to select non-exact + // stuff. + "mul,af/nl": true, // match: got "af"; want "mul" + + // TODO: include other extensions. 
+ // combined: got "en-GB-u-ca-buddhist-nu-arab"; want "en-GB-fonipa-t-m0-iso-i0-pinyin-u-ca-buddhist-nu-arab" + "und,en-GB-u-sd-gbsct/en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin": true, + + // Inconsistencies with Mark Davis' implementation where it is not clear + // which is better. + + // Go prefers exact matches over less exact preferred ones. + // Preferring desired ones might be better. + "en,de,fr,ja/de-CH,fr": true, // match: got "fr"; want "de" + "en-GB,en,de,fr,ja/de-CH,fr": true, // match: got "fr"; want "de" + "pt-PT,pt-BR,es,es-419/pt-US,pt-PT": true, // match: got "pt-PT"; want "pt-BR" + "pt-PT,pt,es,es-419/pt-US,pt-PT,pt": true, // match: got "pt-PT"; want "pt" + "en,sv/en-GB,sv": true, // match: got "sv"; want "en" + "en-NZ,en-IT/en-US": true, // match: got "en-IT"; want "en-NZ" + + // Inconsistencies in combined. I think the Go approach is more appropriate. + // We could use -u-rg- and -u-va- as alternative. + "und,fr/fr-BE-fonipa": true, // combined: got "fr"; want "fr-BE-fonipa" + "und,fr-CA/fr-BE-fonipa": true, // combined: got "fr-CA"; want "fr-BE-fonipa" + "und,fr-fonupa/fr-BE-fonipa": true, // combined: got "fr-fonupa"; want "fr-BE-fonipa" + "und,no/nn-BE-fonipa": true, // combined: got "no"; want "no-BE-fonipa" + "50,und,fr-CA-fonupa/fr-BE-fonipa": true, // combined: got "fr-CA-fonupa"; want "fr-BE-fonipa" + + // Spec says prefer primary locales. But what is the benefit? Shouldn't + // the developer just not specify the primary locale first in the list? + // TODO: consider adding a SortByPreferredLocale function to ensure tags + // are ordered such that the preferred locale rule is observed. + // TODO: most of these cases are solved by getting rid of the region + // distance tie-breaker rule (see comments there). 
+ "und,es,es-MA,es-MX,es-419/es-EA": true, // match: got "es-MA"; want "es" + "und,es-MA,es,es-419,es-MX/es-EA": true, // match: got "es-MA"; want "es" + "und,en,en-GU,en-IN,en-GB/en-ZA": true, // match: got "en-IN"; want "en-GB" + "und,en,en-GU,en-IN,en-GB/en-VI": true, // match: got "en-GU"; want "en" + "und,en-GU,en,en-GB,en-IN/en-VI": true, // match: got "en-GU"; want "en" + + // Falling back to the default seems more appropriate than falling back + // on a language with the same script. + "50,und,fr-Cyrl-CA-fonupa/fr-BE-fonipa": true, + // match: got "und"; want "fr-Cyrl-CA-fonupa" + // combined: got "und"; want "fr-Cyrl-BE-fonipa" + + // Other interesting cases to test: + // - Should same language or same script have the preference if there is + // usually no understanding of the other script? + // - More specific region in desired may replace enclosing supported. +} + +func makeTagList(s string) (tags []Tag) { + for _, s := range strings.Split(s, ",") { + tags = append(tags, Make(strings.TrimSpace(s))) + } + return tags +} + func TestAddLikelySubtags(t *testing.T) { tests := []struct{ in, out string }{ {"aa", "aa-Latn-ET"}, @@ -164,6 +259,31 @@ func TestMinimize(t *testing.T) { } } +func TestRegionGroups(t *testing.T) { + testCases := []struct { + a, b string + distance uint8 + }{ + {"zh-TW", "zh-HK", 5}, + {"zh-MO", "zh-HK", 4}, + } + for _, tc := range testCases { + a := MustParse(tc.a) + aScript, _ := a.Script() + b := MustParse(tc.b) + bScript, _ := b.Script() + + if aScript != bScript { + t.Errorf("scripts differ: %q vs %q", aScript, bScript) + continue + } + d := regionGroupDist(a.region, b.region, aScript.scriptID, a.lang) + if d != tc.distance { + t.Errorf("got %q; want %q", d, tc.distance) + } + } +} + func TestRegionDistance(t *testing.T) { tests := []struct { a, b string @@ -259,17 +379,20 @@ func parseSupported(list string) (out []Tag) { // The test set for TestBestMatch is defined in data_test.go. 
func TestBestMatch(t *testing.T) { - for i, tt := range matchTests { + for _, tt := range matchTests { supported := parseSupported(tt.supported) m := newMatcher(supported) if *verbose { fmt.Printf("%s:\n%v\n", tt.comment, m) } for _, tm := range tt.test { - tag, _, conf := m.Match(parseSupported(tm.desired)...) - if tag.String() != tm.match { - t.Errorf("%d:%s: find %s in %q: have %s; want %s (%v)\n", i, tt.comment, tm.desired, tt.supported, tag, tm.match, conf) - } + t.Run(path.Join(tt.comment, tt.supported, tm.desired), func(t *testing.T) { + tag, _, conf := m.Match(parseSupported(tm.desired)...) + if tag.String() != tm.match { + t.Errorf("find %s in %q: have %s; want %s (%v)", tm.desired, tt.supported, tag, tm.match, conf) + } + }) + } } } diff --git a/language/testdata/localeMatcherTest.txt b/language/testdata/localeMatcherTest.txt new file mode 100644 index 000000000..6568f2d3c --- /dev/null +++ b/language/testdata/localeMatcherTest.txt @@ -0,0 +1,389 @@ +# TODO: this file has not yet been included in the main CLDR release. +# The intent is to verify this file against the Go implementation and then +# correct the cases and add merge in other interesting test cases. +# See TestCLDRCompliance in match_test.go, as well as the list of exceptions +# defined in the map skip below it, for the work in progress. + +# Data-driven test for the XLocaleMatcher. +# Format +# • Everything after "#" is a comment +# • Arguments are separated by ";". They are: + +# supported ; desired ; expected + +# • The supported may have the threshold distance reset as a first item, eg 50, en, fr +# A line starting with @debug will reach a statement in the test code where you can put a breakpoint for debugging +# The test code also supports reformatting this file, by setting the REFORMAT flag. 
+ +################################################## +# testParentLocales + +# es-419, es-AR, and es-MX are in a cluster; es is in a different one + +es-419, es-ES ; es-AR ; es-419 +es-ES, es-419 ; es-AR ; es-419 + +es-419, es ; es-AR ; es-419 +es, es-419 ; es-AR ; es-419 + +es-MX, es ; es-AR ; es-MX +es, es-MX ; es-AR ; es-MX + +# en-GB, en-AU, and en-NZ are in a cluster; en in a different one + +en-GB, en-US ; en-AU ; en-GB +en-US, en-GB ; en-AU ; en-GB + +en-GB, en ; en-AU ; en-GB +en, en-GB ; en-AU ; en-GB + +en-NZ, en-US ; en-AU ; en-NZ +en-US, en-NZ ; en-AU ; en-NZ + +en-NZ, en ; en-AU ; en-NZ +en, en-NZ ; en-AU ; en-NZ + +# pt-AO and pt-PT in one cluster; pt-BR in another + +pt-PT, pt-BR ; pt-AO ; pt-PT +pt-BR, pt-PT ; pt-AO ; pt-PT + +pt-PT, pt ; pt-AO ; pt-PT +pt, pt-PT ; pt-AO ; pt-PT + +zh-MO, zh-TW ; zh-HK ; zh-MO +zh-TW, zh-MO ; zh-HK ; zh-MO + +zh-MO, zh-TW ; zh-HK ; zh-MO +zh-TW, zh-MO ; zh-HK ; zh-MO + +zh-MO, zh-CN ; zh-HK ; zh-MO +zh-CN, zh-MO ; zh-HK ; zh-MO + +zh-MO, zh ; zh-HK ; zh-MO +zh, zh-MO ; zh-HK ; zh-MO + +################################################## +# testChinese + +zh-CN, zh-TW, iw ; zh-Hant-TW ; zh-TW +zh-CN, zh-TW, iw ; zh-Hant ; zh-TW +zh-CN, zh-TW, iw ; zh-TW ; zh-TW +zh-CN, zh-TW, iw ; zh-Hans-CN ; zh-CN +zh-CN, zh-TW, iw ; zh-CN ; zh-CN +zh-CN, zh-TW, iw ; zh ; zh-CN + +################################################## +# testenGB + +fr, en, en-GB, es-419, es-MX, es ; en-NZ ; en-GB +fr, en, en-GB, es-419, es-MX, es ; es-ES ; es +fr, en, en-GB, es-419, es-MX, es ; es-AR ; es-419 +fr, en, en-GB, es-419, es-MX, es ; es-MX ; es-MX + +################################################## +# testFallbacks + +91, en, hi ; sa ; hi + +################################################## +# testBasics + +fr, en-GB, en ; en-GB ; en-GB +fr, en-GB, en ; en ; en +fr, en-GB, en ; fr ; fr +fr, en-GB, en ; ja ; fr # return first if no match + +################################################## +# testFallback + +# check that script fallbacks 
are handled right + +zh-CN, zh-TW, iw ; zh-Hant ; zh-TW +zh-CN, zh-TW, iw ; zh ; zh-CN +zh-CN, zh-TW, iw ; zh-Hans-CN ; zh-CN +zh-CN, zh-TW, iw ; zh-Hant-HK ; zh-TW +zh-CN, zh-TW, iw ; he-IT ; iw + +################################################## +# testSpecials + +# check that nearby languages are handled + +en, fil, ro, nn ; tl ; fil +en, fil, ro, nn ; mo ; ro +en, fil, ro, nn ; nb ; nn + +# make sure default works + +en, fil, ro, nn ; ja ; en + +################################################## +# testRegionalSpecials + +# verify that en-AU is closer to en-GB than to en (which is en-US) + +en, en-GB, es, es-419 ; es-MX ; es-419 +en, en-GB, es, es-419 ; en-AU ; en-GB +en, en-GB, es, es-419 ; es-ES ; es + +################################################## +# testHK + +# HK and MO are closer to each other for Hant than to TW + +zh, zh-TW, zh-MO ; zh-HK ; zh-MO +zh, zh-TW, zh-HK ; zh-MO ; zh-HK + +################################################## +# testMatch-exact + +# see localeDistance.txt + +################################################## +# testMatch-none + +# see localeDistance.txt + +################################################## +# testMatch-matchOnMazimized + +zh, zh-Hant ; und-TW ; zh-Hant # und-TW should be closer to zh-Hant than to zh +en-Hant-TW, und-TW ; zh-Hant ; und-TW # zh-Hant should be closer to und-TW than to en-Hant-TW +en-Hant-TW, und-TW ; zh ; und-TW # zh should be closer to und-TW than to en-Hant-TW + +################################################## +# testMatchGrandfatheredCode + +fr, i-klingon, en-Latn-US ; en-GB-oed ; en-Latn-US + +################################################## +# testGetBestMatchForList-exactMatch +fr, en-GB, ja, es-ES, es-MX ; ja, de ; ja + +################################################## +# testGetBestMatchForList-simpleVariantMatch +fr, en-GB, ja, es-ES, es-MX ; de, en-US ; en-GB # Intentionally avoiding a perfect-match or two candidates for variant matches. + +# Fallback. 
+ +fr, en-GB, ja, es-ES, es-MX ; de, zh ; fr + +################################################## +# testGetBestMatchForList-matchOnMaximized +# Check that if the preference is maximized already, it works as well. + +en, ja ; ja-Jpan-JP, en-AU ; ja # Match for ja-Jpan-JP (maximized already) + +# ja-JP matches ja on likely subtags, and it's listed first, thus it wins over the second preference en-GB. + +en, ja ; ja-JP, en-US ; ja # Match for ja-Jpan-JP (maximized already) + +# Check that if the preference is maximized already, it works as well. + +en, ja ; ja-Jpan-JP, en-US ; ja # Match for ja-Jpan-JP (maximized already) + +################################################## +# testGetBestMatchForList-noMatchOnMaximized +# Regression test for http://b/5714572 . +# de maximizes to de-DE. Pick the exact match for the secondary language instead. +en, de, fr, ja ; de-CH, fr ; de + +################################################## +# testBestMatchForTraditionalChinese + +# Scenario: An application that only supports Simplified Chinese (and some other languages), +# but does not support Traditional Chinese. zh-Hans-CN could be replaced with zh-CN, zh, or +# zh-Hans, it wouldn't make much of a difference. + +# The script distance (simplified vs. traditional Han) is considered small enough +# to be an acceptable match. The regional difference is considered almost insignificant. + +fr, zh-Hans-CN, en-US ; zh-TW ; zh-Hans-CN +fr, zh-Hans-CN, en-US ; zh-Hant ; zh-Hans-CN + +# For geo-political reasons, you might want to avoid a zh-Hant -> zh-Hans match. +# In this case, if zh-TW, zh-HK or a tag starting with zh-Hant is requested, you can +# change your call to getBestMatch to include a 2nd language preference. +# "en" is a better match since its distance to "en-US" is closer than the distance +# from "zh-TW" to "zh-CN" (script distance). 
+ +fr, zh-Hans-CN, en-US ; zh-TW, en ; en-US +fr, zh-Hans-CN, en-US ; zh-Hant-CN, en, en ; en-US +fr, zh-Hans-CN, en-US ; zh-Hans, en ; zh-Hans-CN + +################################################## +# testUndefined +# When the undefined language doesn't match anything in the list, +# getBestMatch returns the default, as usual. + +it, fr ; und ; it + +# When it *does* occur in the list, bestMatch returns it, as expected. +it, und ; und ; und + +# The unusual part: max("und") = "en-Latn-US", and since matching is based on maximized +# tags, the undefined language would normally match English. But that would produce the +# counterintuitive results that getBestMatch("und", XLocaleMatcher("it,en")) would be "en", and +# getBestMatch("en", XLocaleMatcher("it,und")) would be "und". + +# To avoid that, we change the matcher's definitions of max +# so that max("und")="und". That produces the following, more desirable +# results: + +it, en ; und ; it +it, und ; en ; it + +################################################## +# testGetBestMatch-regionDistance + +es-AR, es ; es-MX ; es-AR +fr, en, en-GB ; en-CA ; en-GB +de-AT, de-DE, de-CH ; de ; de-DE + +################################################## +# testAsymmetry + +mul, nl ; af ; nl # af => nl +mul, af ; nl ; mul # but nl !=> af + +################################################## +# testGetBestMatchForList-matchOnMaximized2 + +# ja-JP matches ja on likely subtags, and it's listed first, thus it wins over the second preference en-GB. + +fr, en-GB, ja, es-ES, es-MX ; ja-JP, en-GB ; ja # Match for ja-JP, with likely region subtag + +# Check that if the preference is maximized already, it works as well. 
+ +fr, en-GB, ja, es-ES, es-MX ; ja-Jpan-JP, en-GB ; ja # Match for ja-Jpan-JP (maximized already) + +################################################## +# testGetBestMatchForList-closeEnoughMatchOnMaximized + +en-GB, en, de, fr, ja ; de-CH, fr ; de +en-GB, en, de, fr, ja ; en-US, ar, nl, de, ja ; en + +################################################## +# testGetBestMatchForPortuguese + +# pt might be supported and not pt-PT + +# European user who prefers Spanish over Brazilian Portuguese as a fallback. + +pt-PT, pt-BR, es, es-419 ; pt-PT, es, pt ; pt-PT +pt-PT, pt, es, es-419 ; pt-PT, es, pt ; pt-PT # pt implicit + +# Brazilian user who prefers South American Spanish over European Portuguese as a fallback. +# The asymmetry between this case and above is because it's "pt-PT" that's missing between the +# matchers as "pt-BR" is a much more common language. + +pt-PT, pt-BR, es, es-419 ; pt, es-419, pt-PT ; pt-BR +pt-PT, pt-BR, es, es-419 ; pt-PT, es, pt ; pt-PT +pt-PT, pt, es, es-419 ; pt-PT, es, pt ; pt-PT +pt-PT, pt, es, es-419 ; pt, es-419, pt-PT ; pt + +pt-BR, es, es-419 ; pt, es-419, pt-PT ; pt-BR + +# Code that adds the user's country can get "pt-US" for a user's language. +# That should fall back to "pt-BR". 
+ +pt-PT, pt-BR, es, es-419 ; pt-US, pt-PT ; pt-BR +pt-PT, pt, es, es-419 ; pt-US, pt-PT, pt ; pt # pt-BR implicit + +################################################## +# testVariantWithScriptMatch 1 and 2 + +fr, en, sv ; en-GB ; en +fr, en, sv ; en-GB ; en +en, sv ; en-GB, sv ; en + +################################################## +# testLongLists + +en, sv ; sv ; sv +af, am, ar, az, be, bg, bn, bs, ca, cs, cy, cy, da, de, el, en, en-GB, es, es-419, et, eu, fa, fi, fil, fr, ga, gl, gu, hi, hr, hu, hy, id, is, it, iw, ja, ka, kk, km, kn, ko, ky, lo, lt, lv, mk, ml, mn, mr, ms, my, ne, nl, no, pa, pl, pt, pt-PT, ro, ru, si, sk, sl, sq, sr, sr-Latn, sv, sw, ta, te, th, tr, uk, ur, uz, vi, zh-CN, zh-TW, zu ; sv ; sv +af, af-NA, af-ZA, agq, agq-CM, ak, ak-GH, am, am-ET, ar, ar-001, ar-AE, ar-BH, ar-DJ, ar-DZ, ar-EG, ar-EH, ar-ER, ar-IL, ar-IQ, ar-JO, ar-KM, ar-KW, ar-LB, ar-LY, ar-MA, ar-MR, ar-OM, ar-PS, ar-QA, ar-SA, ar-SD, ar-SO, ar-SS, ar-SY, ar-TD, ar-TN, ar-YE, as, as-IN, asa, asa-TZ, ast, ast-ES, az, az-Cyrl, az-Cyrl-AZ, az-Latn, az-Latn-AZ, bas, bas-CM, be, be-BY, bem, bem-ZM, bez, bez-TZ, bg, bg-BG, bm, bm-ML, bn, bn-BD, bn-IN, bo, bo-CN, bo-IN, br, br-FR, brx, brx-IN, bs, bs-Cyrl, bs-Cyrl-BA, bs-Latn, bs-Latn-BA, ca, ca-AD, ca-ES, ca-ES-VALENCIA, ca-FR, ca-IT, ce, ce-RU, cgg, cgg-UG, chr, chr-US, ckb, ckb-IQ, ckb-IR, cs, cs-CZ, cu, cu-RU, cy, cy-GB, da, da-DK, da-GL, dav, dav-KE, de, de-AT, de-BE, de-CH, de-DE, de-LI, de-LU, dje, dje-NE, dsb, dsb-DE, dua, dua-CM, dyo, dyo-SN, dz, dz-BT, ebu, ebu-KE, ee, ee-GH, ee-TG, el, el-CY, el-GR, en, en-001, en-150, en-AG, en-AI, en-AS, en-AT, en-AU, en-BB, en-BE, en-BI, en-BM, en-BS, en-BW, en-BZ, en-CA, en-CC, en-CH, en-CK, en-CM, en-CX, en-CY, en-DE, en-DG, en-DK, en-DM, en-ER, en-FI, en-FJ, en-FK, en-FM, en-GB, en-GD, en-GG, en-GH, en-GI, en-GM, en-GU, en-GY, en-HK, en-IE, en-IL, en-IM, en-IN, en-IO, en-JE, en-JM, en-KE, en-KI, en-KN, en-KY, en-LC, en-LR, en-LS, en-MG, en-MH, en-MO, en-MP, en-MS, en-MT, en-MU, 
en-MW, en-MY, en-NA, en-NF, en-NG, en-NL, en-NR, en-NU, en-NZ, en-PG, en-PH, en-PK, en-PN, en-PR, en-PW, en-RW, en-SB, en-SC, en-SD, en-SE, en-SG, en-SH, en-SI, en-SL, en-SS, en-SX, en-SZ, en-TC, en-TK, en-TO, en-TT, en-TV, en-TZ, en-UG, en-UM, en-US, en-US-POSIX, en-VC, en-VG, en-VI, en-VU, en-WS, en-ZA, en-ZM, en-ZW, eo, eo-001, es, es-419, es-AR, es-BO, es-CL, es-CO, es-CR, es-CU, es-DO, es-EA, es-EC, es-ES, es-GQ, es-GT, es-HN, es-IC, es-MX, es-NI, es-PA, es-PE, es-PH, es-PR, es-PY, es-SV, es-US, es-UY, es-VE, et, et-EE, eu, eu-ES, ewo, ewo-CM, fa, fa-AF, fa-IR, ff, ff-CM, ff-GN, ff-MR, ff-SN, fi, fi-FI, fil, fil-PH, fo, fo-DK, fo-FO, fr, fr-BE, fr-BF, fr-BI, fr-BJ, fr-BL, fr-CA, fr-CD, fr-CF, fr-CG, fr-CH, fr-CI, fr-CM, fr-DJ, fr-DZ, fr-FR, fr-GA, fr-GF, fr-GN, fr-GP, fr-GQ, fr-HT, fr-KM, fr-LU, fr-MA, fr-MC, fr-MF, fr-MG, fr-ML, fr-MQ, fr-MR, fr-MU, fr-NC, fr-NE, fr-PF, fr-PM, fr-RE, fr-RW, fr-SC, fr-SN, fr-SY, fr-TD, fr-TG, fr-TN, fr-VU, fr-WF, fr-YT, fur, fur-IT, fy, fy-NL, ga, ga-IE, gd, gd-GB, gl, gl-ES, gsw, gsw-CH, gsw-FR, gsw-LI, gu, gu-IN, guz, guz-KE, gv, gv-IM, ha, ha-GH, ha-NE, ha-NG, haw, haw-US, he, he-IL, hi, hi-IN, hr, hr-BA, hr-HR, hsb, hsb-DE, hu, hu-HU, hy, hy-AM, id, id-ID, ig, ig-NG, ii, ii-CN, is, is-IS, it, it-CH, it-IT, it-SM, ja, ja-JP, jgo, jgo-CM, jmc, jmc-TZ, ka, ka-GE, kab, kab-DZ, kam, kam-KE, kde, kde-TZ, kea, kea-CV, khq, khq-ML, ki, ki-KE, kk, kk-KZ, kkj, kkj-CM, kl, kl-GL, kln, kln-KE, km, km-KH, kn, kn-IN, ko, ko-KP, ko-KR, kok, kok-IN, ks, ks-IN, ksb, ksb-TZ, ksf, ksf-CM, ksh, ksh-DE, kw, kw-GB, ky, ky-KG, lag, lag-TZ, lb, lb-LU, lg, lg-UG, lkt, lkt-US, ln, ln-AO, ln-CD, ln-CF, ln-CG, lo, lo-LA, lrc, lrc-IQ, lrc-IR, lt, lt-LT, lu, lu-CD, luo, luo-KE, luy, luy-KE, lv, lv-LV, mas, mas-KE, mas-TZ, mer, mer-KE, mfe, mfe-MU, mg, mg-MG, mgh, mgh-MZ, mgo, mgo-CM, mk, mk-MK, ml, ml-IN, mn, mn-MN, mr, mr-IN, ms, ms-BN, ms-MY, ms-SG, mt, mt-MT, mua, mua-CM, my, my-MM, mzn, mzn-IR, naq, naq-NA, nb, nb-NO, nb-SJ, nd, nd-ZW, ne, ne-IN, 
ne-NP, nl, nl-AW, nl-BE, nl-BQ, nl-CW, nl-NL, nl-SR, nl-SX, nmg, nmg-CM, nn, nn-NO, nnh, nnh-CM, nus, nus-SS, nyn, nyn-UG, om, om-ET, om-KE, or, or-IN, os, os-GE, os-RU, pa, pa-Arab, pa-Arab-PK, pa-Guru, pa-Guru-IN, pl, pl-PL, prg, prg-001, ps, ps-AF, pt, pt-AO, pt-BR, pt-CV, pt-GW, pt-MO, pt-MZ, pt-PT, pt-ST, pt-TL, qu, qu-BO, qu-EC, qu-PE, rm, rm-CH, rn, rn-BI, ro, ro-MD, ro-RO, rof, rof-TZ, root, ru, ru-BY, ru-KG, ru-KZ, ru-MD, ru-RU, ru-UA, rw, rw-RW, rwk, rwk-TZ, sah, sah-RU, saq, saq-KE, sbp, sbp-TZ, se, se-FI, se-NO, se-SE, seh, seh-MZ, ses, ses-ML, sg, sg-CF, shi, shi-Latn, shi-Latn-MA, shi-Tfng, shi-Tfng-MA, si, si-LK, sk, sk-SK, sl, sl-SI, smn, smn-FI, sn, sn-ZW, so, so-DJ, so-ET, so-KE, so-SO, sq, sq-AL, sq-MK, sq-XK, sr, sr-Cyrl, sr-Cyrl-BA, sr-Cyrl-ME, sr-Cyrl-RS, sr-Cyrl-XK, sr-Latn, sr-Latn-BA, sr-Latn-ME, sr-Latn-RS, sr-Latn-XK, sv, sv-AX, sv-FI, sv-SE, sw, sw-CD, sw-KE, sw-TZ, sw-UG, ta, ta-IN, ta-LK, ta-MY, ta-SG, te, te-IN, teo, teo-KE, teo-UG, th, th-TH, ti, ti-ER, ti-ET, tk, tk-TM, to, to-TO, tr, tr-CY, tr-TR, twq, twq-NE, tzm, tzm-MA, ug, ug-CN, uk, uk-UA, ur, ur-IN, ur-PK, uz, uz-Arab, uz-Arab-AF, uz-Cyrl, uz-Cyrl-UZ, uz-Latn, uz-Latn-UZ, vai, vai-Latn, vai-Latn-LR, vai-Vaii, vai-Vaii-LR, vi, vi-VN, vo, vo-001, vun, vun-TZ, wae, wae-CH, xog, xog-UG, yav, yav-CM, yi, yi-001, yo, yo-BJ, yo-NG, zgh, zgh-MA, zh, zh-Hans, zh-Hans-CN, zh-Hans-HK, zh-Hans-MO, zh-Hans-SG, zh-Hant, zh-Hant-HK, zh-Hant-MO, zh-Hant-TW, zu, zu-ZA ; sv ; sv + +################################################## +# test8288 + +it, en ; und ; it +it, en ; und, en ; en + +# examples from +# http://unicode.org/repos/cldr/tags/latest/common/bcp47/ +# http://unicode.org/repos/cldr/tags/latest/common/validity/variant.xml + +################################################## +# testUnHack + +en-NZ, en-IT ; en-US ; en-NZ + +################################################## +# testEmptySupported => null + ; en ; null + +################################################## +# 
testVariantsAndExtensions +################################################## +# tests the .combine() method + +und, fr ; fr-BE-fonipa ; fr ; fr-BE-fonipa +und, fr-CA ; fr-BE-fonipa ; fr-CA ; fr-BE-fonipa +und, fr-fonupa ; fr-BE-fonipa ; fr-fonupa ; fr-BE-fonipa +und, no ; nn-BE-fonipa ; no ; no-BE-fonipa +und, en-GB-u-sd-gbsct ; en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin ; en-GB-u-sd-gbsct ; en-GB-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin + +en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr-PSCRACK ; fr-PSCRACK +en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; fr-PSCRACK +en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; de-PSCRACK + +################################################## +# testClusters +# we favor es-419 over others in cluster. Clusters: es- {ES, MA, EA} {419, AR, MX} + +und, es, es-MA, es-MX, es-419 ; es-AR ; es-419 +und, es-MA, es, es-419, es-MX ; es-AR ; es-419 +und, es, es-MA, es-MX, es-419 ; es-EA ; es +und, es-MA, es, es-419, es-MX ; es-EA ; es + +# of course, fall back to within cluster + +und, es, es-MA, es-MX ; es-AR ; es-MX +und, es-MA, es, es-MX ; es-AR ; es-MX +und, es-MA, es-MX, es-419 ; es-EA ; es-MA +und, es-MA, es-419, es-MX ; es-EA ; es-MA + +# we favor en-GB over others in cluster. 
Clusters: en- {US, GU, VI} {GB, IN, ZA} + +und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB +und, en-GU, en, en-GB, en-IN ; en-ZA ; en-GB +und, en, en-GU, en-IN, en-GB ; en-VI ; en +und, en-GU, en, en-GB, en-IN ; en-VI ; en + +# of course, fall back to within cluster + +und, en, en-GU, en-IN ; en-ZA ; en-IN +und, en-GU, en, en-IN ; en-ZA ; en-IN +und, en-GU, en-IN, en-GB ; en-VI ; en-GU +und, en-GU, en-GB, en-IN ; en-VI ; en-GU + +################################################## +# testThreshold +@Threshold=60 + +50, und, fr-CA-fonupa ; fr-BE-fonipa ; fr-CA-fonupa ; fr-BE-fonipa +50, und, fr-Cyrl-CA-fonupa ; fr-BE-fonipa ; fr-Cyrl-CA-fonupa ; fr-Cyrl-BE-fonipa + +@Threshold=-1 # restore + +################################################## +# testScriptFirst +@DistanceOption=SCRIPT_FIRST +@debug + +ru, fr ; zh, pl ; fr +ru, fr ; zh-Cyrl, pl ; ru +hr, en-Cyrl; sr ; en-Cyrl +da, ru, hr; sr ; ru \ No newline at end of file