Skip to content

Commit

Permalink
chain conversion (#715)
Browse files Browse the repository at this point in the history
  • Loading branch information
groverlynn authored Feb 9, 2024
1 parent 1ae7cd8 commit 5b0660f
Showing 1 changed file with 47 additions and 19 deletions.
66 changes: 47 additions & 19 deletions src/rime/gear/simplifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,30 @@ class Opencc {
opencc::Optional<const opencc::DictEntry*> item =
dict->Match(original_word);
if (item.IsNull()) {
// Current dictionary doesn't convert the word. We need to keep it for
// other dicts in the chain. e.g. s2t.json expands 里 to 里 and 裏,
// then t2tw.json passes 里 as-is and converts 裏 to 裡.
if (word_set.insert(original_word).second) {
converted_words.push_back(original_word);
// There is no exact match, but in a chain conversion we still need to
// convert any partially matched segments. Apply the default (maximum
// segmentation) prefix match to get the most probable conversion result.
std::ostringstream buffer;
for (const char* wstr = original_word.c_str(); *wstr != '\0';) {
opencc::Optional<const opencc::DictEntry*> matched =
dict->MatchPrefix(wstr);
size_t matched_length;
if (matched.IsNull()) {
matched_length = opencc::UTF8Util::NextCharLength(wstr);
buffer << opencc::UTF8Util::FromSubstr(wstr, matched_length);
} else {
matched_length = matched.Get()->KeyLength();
buffer << matched.Get()->GetDefault();
}
wstr += matched_length;
}
const string& converted_word = buffer.str();
// Even if current dictionary doesn't convert the word
// (converted_word == original_word), we still need to keep it for
// subsequent dicts in the chain. e.g. s2t.json expands 里 to 里 and
// 裏, then t2tw.json passes 里 as-is and converts 裏 to 裡.
if (word_set.insert(converted_word).second) {
converted_words.push_back(converted_word);
}
continue;
}
Expand All @@ -94,23 +113,32 @@ class Opencc {
bool RandomConvertText(const string& text, string* simplified) {
if (dict_ == nullptr)
return false;
const list<opencc::ConversionPtr> conversions =
converter_->GetConversionChain()->GetConversions();
const char* phrase = text.c_str();
std::ostringstream buffer;
for (const char* pstr = phrase; *pstr != '\0';) {
opencc::Optional<const opencc::DictEntry*> matched =
dict_->MatchPrefix(pstr);
size_t matchedLength;
if (matched.IsNull()) {
matchedLength = opencc::UTF8Util::NextCharLength(pstr);
buffer << opencc::UTF8Util::FromSubstr(pstr, matchedLength);
} else {
matchedLength = matched.Get()->KeyLength();
size_t i = rand() % (matched.Get()->NumValues());
buffer << matched.Get()->Values().at(i);
for (auto conversion : conversions) {
opencc::DictPtr dict = conversion->GetDict();
if (dict == nullptr) {
return false;
}
std::ostringstream buffer;
for (const char* pstr = phrase; *pstr != '\0';) {
opencc::Optional<const opencc::DictEntry*> matched =
dict->MatchPrefix(pstr);
size_t matched_length;
if (matched.IsNull()) {
matched_length = opencc::UTF8Util::NextCharLength(pstr);
buffer << opencc::UTF8Util::FromSubstr(pstr, matched_length);
} else {
matched_length = matched.Get()->KeyLength();
size_t i = rand() % (matched.Get()->NumValues());
buffer << matched.Get()->Values().at(i);
}
pstr += matched_length;
}
pstr += matchedLength;
*simplified = buffer.str();
phrase = simplified->c_str();
}
*simplified = buffer.str();
return *simplified != text;
}

Expand Down

0 comments on commit 5b0660f

Please sign in to comment.