Skip to content

Commit

Permalink
language: update matching algorithm
Browse files Browse the repository at this point in the history
Using the new data:
- region grouping data to ultimately replace
  region distance
- allow making a region more specific if it falls
  within an enclosing region
- added CLDR test file (work in progress)
- added same-script rule

Change-Id: Ib2f279aefec871d9a0c13c105749623a93bb911a
Reviewed-on: https://go-review.googlesource.com/47346
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
  • Loading branch information
mpvl committed Aug 10, 2017
1 parent 3bd178b commit f6122dd
Show file tree
Hide file tree
Showing 7 changed files with 650 additions and 56 deletions.
5 changes: 4 additions & 1 deletion encoding/htmlindex/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,10 @@ var consts = map[string]string{
// locales is taken from
// https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm.
var locales = []struct{ tag, name string }{
{"und", "windows-1252"}, // The default value.
// The default value. Explicitly specify the Latin script to benefit from the
// exact script option, while still making 1252 the default encoding for
// languages written in Latin script.
{"und_Latn", "windows-1252"},
{"ar", "windows-1256"},
{"ba", "windows-1251"},
{"be", "windows-1251"},
Expand Down
4 changes: 2 additions & 2 deletions encoding/htmlindex/tables.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

59 changes: 34 additions & 25 deletions language/data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ var matchTests = []matchTest{
{"en", "sh"},
{"en", "hr"},
{"en", "bs"},
{"en", "nl-Cyrl"},
// TODO: consider whether the following match is a good one. It results
// from the new script-first rule, which should perhaps be an option.
{"sr", "nl-Cyrl"},
},
},
{
Expand Down Expand Up @@ -228,6 +230,14 @@ var matchTests = []matchTest{
{"nl-NL", "nl-NL"},
},
},
{
"region may replace matched if matched is enclosing",
"es-419,es",
[]struct{ match, desired string }{
{"es-MX", "es-MX"},
{"es", "es-SG"},
},
},
{
"more specific region wins over more specific script",
"nl, nl-Latn, nl-NL, nl-BE",
Expand Down Expand Up @@ -265,7 +275,7 @@ var matchTests = []matchTest{
"en, en-GB, es-ES, es-419",
[]struct{ match, desired string }{
{"en-GB", "en-AU"},
{"es-419", "es-MX"},
{"es-MX", "es-MX"},
{"es-ES", "es-PT"},
},
},
Expand Down Expand Up @@ -355,6 +365,7 @@ var matchTests = []matchTest{
"en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK",
[]struct{ match, desired string }{
{"en-GB", "en-150"},
// {"en-GB", "en-001"}, // TODO: currently en, should probably be en-GB
{"en-GB", "en-AU"},
{"en-GB", "en-BE"},
{"en-GB", "en-GG"},
Expand All @@ -370,36 +381,34 @@ var matchTests = []matchTest{
{"en-GB", "en-SG"},
{"en-GB", "en-DE"},
{"en-GB", "en-MT"},
{"es-419", "es-AR"},
{"es-419", "es-BO"},
{"es-419", "es-CL"},
{"es-419", "es-CO"},
{"es-419", "es-CR"},
{"es-419", "es-CU"},
{"es-419", "es-DO"},
{"es-419", "es-EC"},
{"es-419", "es-GT"},
{"es-419", "es-HN"},
{"es-419", "es-MX"},
{"es-419", "es-NI"},
{"es-419", "es-PA"},
{"es-419", "es-PE"},
{"es-419", "es-PR"},
{"es-419", "es-PY"},
{"es-419", "es-SV"},
{"es-419", "es-US"},
{"es-419", "es-UY"},
{"es-419", "es-VE"},
{"es-AR", "es-AR"},
{"es-BO", "es-BO"},
{"es-CL", "es-CL"},
{"es-CO", "es-CO"},
{"es-CR", "es-CR"},
{"es-CU", "es-CU"},
{"es-DO", "es-DO"},
{"es-EC", "es-EC"},
{"es-GT", "es-GT"},
{"es-HN", "es-HN"},
{"es-MX", "es-MX"},
{"es-NI", "es-NI"},
{"es-PA", "es-PA"},
{"es-PE", "es-PE"},
{"es-PR", "es-PR"},
{"es", "es-PT"},
{"es-PY", "es-PY"},
{"es-SV", "es-SV"},
{"es-419", "es-US"}, // US is not in Latin America, so don't make more specific.
{"es-UY", "es-UY"},
{"es-VE", "es-VE"},
{"pt-PT", "pt-AO"},
{"pt-PT", "pt-CV"},
{"pt-PT", "pt-GW"},
{"pt-PT", "pt-MO"},
{"pt-PT", "pt-MZ"},
{"pt-PT", "pt-ST"},
{"pt-PT", "pt-TL"},
// TODO for CLDR 24+
// - en-001
// - {"zh-Hant-HK", "zh-Hant-MO"},
},
},
// Options and variants are inherited from user-defined settings.
Expand Down
11 changes: 9 additions & 2 deletions language/language.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,15 @@ const (
// specific language or locale. All language tag values are guaranteed to be
// well-formed.
type Tag struct {
lang langID
region regionID
lang langID
region regionID
// TODO: we will soon run out of positions for script. Idea: instead of
// storing lang, region, and script codes, store only the compact index and
// have a lookup table from this code to its expansion. This greatly speeds
// up table lookup and speeds up common variant cases.
// This will also immediately free up 3 extra bytes. Also, the pVariant
// field can now be moved to the lookup table, as the compact index uniquely
// determines the offset of a possible variant.
script scriptID
pVariant byte // offset in str, includes preceding '-'
pExt uint16 // offset of first extension, includes preceding '-'
Expand Down
105 changes: 84 additions & 21 deletions language/match.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,36 @@ func NewMatcher(t []Tag) Matcher {

func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
match, w, c := m.getBest(want...)
if match == nil {
t = m.default_.tag
} else {
if match != nil {
t, index = match.tag, match.index
} else {
// TODO: this should be an option
t = m.default_.tag
outer:
for _, w := range want {
script, _ := w.Script()
if script.scriptID == 0 {
// Don't do anything if there is no script, such as with
// private subtags.
continue
}
for i, h := range m.supported {
if script.scriptID == h.maxScript {
t, index = h.tag, i
break outer
}
}
}
// TODO: select first language tag based on script.
}
if w.region != 0 && t.region != 0 && t.region.contains(w.region) {
t, _ = Raw.Compose(t, Region{w.region})
}
// Copy options from the user-provided tag into the result tag. This is hard
// to do after the fact, so we do it here.
// TODO: consider also adding in variants that are compatible with the
// matched language.
// TODO: Add back region if it is non-ambiguous? Or create another tag to
// preserve the region?
// TODO: add in alternative variants to -u-va-.
// TODO: add preferred region to -u-rg-.
// TODO: add other extensions. Merge with existing extensions.
if u, ok := w.Extension('u'); ok {
t, _ = Raw.Compose(t, u)
}
Expand Down Expand Up @@ -389,6 +408,7 @@ func minimizeTags(t Tag) (Tag, error) {
// matcher keeps a set of supported language tags, indexed by language.
type matcher struct {
// default_ is the fallback entry; newMatcher sets it to the first exact
// entry of the header for the first supported tag's language.
default_ *haveTag
// supported holds one entry per supported tag, appended by newMatcher in
// the order the tags were passed in. Match scans it for a tag with a
// matching maximized script when no regular match is found.
supported []*haveTag
// index maps a language to its match header.
// NOTE(review): presumably populated through m.header — confirm.
index map[langID]*matchHeader
passSettings bool
}
Expand Down Expand Up @@ -514,6 +534,7 @@ func newMatcher(supported []Tag) *matcher {
for i, tag := range supported {
pair, _ := makeHaveTag(tag, i)
m.header(tag.lang).addIfNew(pair, true)
m.supported = append(m.supported, &pair)
}
m.default_ = m.header(supported[0].lang).exact[0]
for i, tag := range supported {
Expand All @@ -523,6 +544,9 @@ func newMatcher(supported []Tag) *matcher {
}
}

// TODO: include alt script.
// - don't replace regions, but allow regions to be made more specific.

// update is used to add indexes in the map for equivalent languages.
// If force is true, the update will also apply to derived entries. To
// avoid applying a "transitive closure", use false.
Expand Down Expand Up @@ -648,11 +672,12 @@ type bestMatch struct {
want Tag
conf Confidence
// Cached results from applying tie-breaking rules.
origLang bool
origReg bool
regDist uint8
origScript bool
parentDist uint8 // 255 if have is not an ancestor of want tag.
origLang bool
origReg bool
regGroupDist uint8
regDist uint8
origScript bool
parentDist uint8 // 255 if have is not an ancestor of want tag.
}

// update updates the existing best match if the new pair is considered to be a
Expand Down Expand Up @@ -706,6 +731,14 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
beaten = true
}

regGroupDist := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.lang)
if !beaten && m.regGroupDist != regGroupDist {
if regGroupDist > m.regGroupDist {
return
}
beaten = true
}

// We prefer if the pre-maximized region was specified and identical.
origReg := have.tag.region == tag.region && tag.region != 0
if !beaten && m.origReg != origReg {
Expand All @@ -715,8 +748,22 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
beaten = true
}

// Next we prefer smaller distances between regions, as defined by regionDist.
regDist := regionDist(have.maxRegion, maxRegion, tag.lang)
// TODO: remove the region distance rule. Region distance has been replaced
// by the region grouping rule. For now we leave it as it still seems to
// have a net positive effect when applied after the grouping rule.
// Possible solutions:
// - apply the primary locale rule first to effectively disable
// region distance if groups are defined.
// - express the following errors in terms of grouping (if possible)
// - find another method of handling the following cases.
// maximization of legacy: find mo in
// "sr-Cyrl, sr-Latn, ro, ro-MD": have ro; want ro-MD (High)
// region distance French: find fr-US in
// "en, fr, fr-CA, fr-CH": have fr; want fr-CA (High)

// Next we prefer smaller distances between regions, as defined by
// regionDistance.
regDist := uint8(regionDistance(have.maxRegion, maxRegion))
if !beaten && m.regDist != regDist {
if regDist > m.regDist {
return
Expand All @@ -734,6 +781,9 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
}

// Finally we prefer tags which have a closer parent relationship.
// TODO: the parent relationship no longer seems necessary. It doesn't hurt
// to leave it in as the final tie-breaker, though, especially until the
// grouping data has further matured.
parentDist := parentDistance(have.tag.region, tag)
if !beaten && m.parentDist != parentDist {
if parentDist > m.parentDist {
Expand All @@ -750,6 +800,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
m.origLang = origLang
m.origReg = origReg
m.origScript = origScript
m.regGroupDist = regGroupDist
m.regDist = regDist
m.parentDist = parentDist
}
Expand All @@ -772,15 +823,27 @@ func parentDistance(haveRegion regionID, tag Tag) uint8 {
return d
}

// regionDist wraps regionDistance with some exceptions to the algorithmic distance.
func regionDist(a, b regionID, lang langID) uint8 {
if lang == _en {
// Two variants of non-US English are close to each other, regardless of distance.
if a != _US && b != _US {
return 2
// regionGroupDist computes the distance between two regions based on their
// CLDR grouping.
func regionGroupDist(a, b regionID, script scriptID, lang langID) uint8 {
aGroup := uint(regionToGroups[a]) << 1
bGroup := uint(regionToGroups[b]) << 1
for _, ri := range matchRegion {
if langID(ri.lang) == lang && (ri.script == 0 || scriptID(ri.script) == script) {
group := uint(1 << (ri.group &^ 0x80))
if 0x80&ri.group == 0 {
if aGroup&bGroup&group != 0 { // Both regions are in the group.
return ri.distance
}
} else {
if (aGroup|bGroup)&group == 0 { // Both regions are not in the group.
return ri.distance
}
}
}
}
return uint8(regionDistance(a, b))
const defaultDistance = 4
return defaultDistance
}

// regionDistance computes the distance between two regions based on the
Expand Down
Loading

0 comments on commit f6122dd

Please sign in to comment.