Skip to content

Commit

Permalink
ICU-22100 Incorporate BudouX into ICU (Java)
Browse files Browse the repository at this point in the history
  • Loading branch information
allensu05 authored and markusicu committed Dec 20, 2022
1 parent 44480c4 commit 90caafb
Show file tree
Hide file tree
Showing 8 changed files with 554 additions and 7 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/icu_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,38 @@ jobs:
[ -d icu4j/out/junit-results ] && cd icu4j && cat `find out/junit-results -name "*.txt" -exec grep -l FAILED {} \;`;
if: ${{ failure() }}

# ICU4J build and unit test under adaboost (ML phrase breaking).
# Rebuilds the ICU4J data jar from an ICU4C tree configured with
# UCONFIG_USE_ML_PHRASE_BREAKING=1 and the .github/adaboost.json data filter,
# then runs the ICU4J checks with the
# com.ibm.icu.impl.breakiter.useMLPhraseBreaking property set to true.
adaboost-icu4j-build-and-test:
runs-on: ubuntu-latest
steps:
# Check out the repository with Git LFS enabled, then pull the LFS objects.
- name: Checkout and setup
uses: actions/checkout@v2
with:
lfs: true
- name: Checkout lfs objects
run: git lfs pull
# Set up Temurin Java 11.
- uses: actions/setup-java@v3
with:
distribution: 'temurin'
java-version: '11'
# Configure ICU4C (debug, non-release, layoutex disabled) with the adaboost
# data filter and the ML phrase breaking flag, then run the
# icu4j-data-install make target with ICU4J_ROOT pointing at the icu4j tree.
- name: Config Adaboost and Rebuild data jar
run: |
cd icu4c/source;
ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data ICU_DATA_FILTER_FILE=../../.github/adaboost.json CPPFLAGS=-DUCONFIG_USE_ML_PHRASE_BREAKING=1 ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
make clean;
make -j2 ICU4J_ROOT=../../../icu4j icu4j-data-install;
cd ../..
# Run the ICU4J ant targets; only the `check` target is passed the
# useMLPhraseBreaking=true property.
- name: ICU4J
run: |
cd icu4j;
ant init;
ant -Dcom.ibm.icu.impl.breakiter.useMLPhraseBreaking=true check;
ant localespiCheck
# Runs only when an earlier step failed: prints every JUnit result .txt file
# under icu4j/out/junit-results that contains the string FAILED.
- name: List failures (if any)
run: |
[ -d icu4j/out/junit-results ] && cd icu4j && cat `find out/junit-results -name "*.txt" -exec grep -l FAILED {} \;`;
if: ${{ failure() }}

# gcc debug build.
# Includes dependency checker.
# Note - the dependency checker needs to be run on both a debug and an optimized build.
Expand Down
2 changes: 2 additions & 0 deletions icu4j/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -338,11 +338,13 @@
<!-- Set default values for these properties; a property that was already set earlier is not overridden. -->
<property name="user-jvm-options" value=""/>
<property name="internal-jvm-options" value=""/>
<property name="com.ibm.icu.impl.breakiter.useMLPhraseBreaking" value=""/>
<delete dir="${junit.out.dir}/@{test-name}"/>
<mkdir dir="${junit.out.dir}/@{test-name}"/>

<junit fork="yes" forkmode="once" printsummary="yes" haltonfailure="no"
failureproperty="@{failure-status}" tempdir="${junit.out.dir}">
<sysproperty key="com.ibm.icu.impl.breakiter.useMLPhraseBreaking" value="${com.ibm.icu.impl.breakiter.useMLPhraseBreaking}" />
<jvmarg value="-Xss4m"/>
<jvmarg value="-ea"/>
<jvmarg value="-Djava.awt.headless=true"/>
Expand Down
6 changes: 6 additions & 0 deletions icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,9 @@ com.ibm.icu.impl.ICUResourceBundle.skipRuntimeLocaleResourceScan = false
# LocaleDisplayNames implementation class
# @internal
# com.ibm.icu.text.LocaleDisplayNames.impl = com.ibm.icu.impl.LocaleDisplayNamesImpl

#
# [Internal Use Only]
# Enable ML (machine-learning based) phrase breaking; set to true to enable.
# @internal
com.ibm.icu.impl.breakiter.useMLPhraseBreaking = false
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ public String run() {
val = System.getProperty(name);
}

if (val == null) {
if (val == null || val.equals("")) {
val = CONFIG_PROPS.getProperty(name, def);
}
return val;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import java.util.HashSet;

import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.ICUConfig;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UnicodeSet;
Expand All @@ -31,6 +32,8 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
private UnicodeSet fClosePunctuationSet;
private DictionaryMatcher fDictionary = null;
private HashSet<String> fSkipSet;
private MlBreakEngine fMlBreakEngine;
private boolean isCj = false;

public CjkBreakEngine(boolean korean) throws IOException {
fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
Expand All @@ -47,9 +50,16 @@ public CjkBreakEngine(boolean korean) throws IOException {
if (korean) {
setCharacters(fHangulWordSet);
} else { //Chinese and Japanese
isCj = true;
UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
setCharacters(cjSet);
initializeJapanesePhraseParamater();
if (Boolean.parseBoolean(
ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
fClosePunctuationSet);
} else {
initializeJapanesePhraseParamater();
}
}
}

Expand Down Expand Up @@ -151,6 +161,15 @@ public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int e
charPositions[numCodePts] = index;
}
}
// Use ML phrase breaking
if (Boolean.parseBoolean(
ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
// PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
if (isPhraseBreaking && isCj) {
return fMlBreakEngine.divideUpRange(inText, startPos, endPos, text,
numCodePts, charPositions, foundBreaks);
}
}

// From here on out, do the algorithm. Note that our indices
// refer to indices within the normalized string.
Expand Down Expand Up @@ -276,10 +295,11 @@ public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int e
// In phrase breaking, there has to be a breakpoint between Cj character and close
// punctuation.
// E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
inText.setIndex(pos);
if (pos > previous) {
if (pos != startPos
|| (isPhraseBreaking && pos > 0
&& fClosePunctuationSet.contains(inText.setIndex(pos - 1)))) {
&& fClosePunctuationSet.contains(previous32(inText)))) {
foundBreaks.push(charPositions[t_boundary[i]] + startPos);
correctedNumBreaks++;
}
Expand All @@ -294,7 +314,9 @@ public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int e
// E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
// E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
if (isPhraseBreaking) {
if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(inText.setIndex(endPos))) {
inText.setIndex(endPos);
int current = current32(inText);
if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) {
foundBreaks.pop();
correctedNumBreaks--;
}
Expand Down
Loading

0 comments on commit 90caafb

Please sign in to comment.