ICU-22100 Incorporate BudouX into ICU (Java)

See unicode-org#2214
chbxue · Dec 20, 2022 · 90caafb · 90caafb
1 parent 44480c4
commit 90caafb
Show file tree

Hide file tree

Showing 8 changed files with 554 additions and 7 deletions.
diff --git a/.github/workflows/icu_ci.yml b/.github/workflows/icu_ci.yml
@@ -190,6 +190,38 @@ jobs:
           [ -d icu4j/out/junit-results ] && cd icu4j && cat `find out/junit-results -name "*.txt" -exec grep -l FAILED {} \;`;
         if: ${{ failure() }}
 
+  # ICU4J build and unit test with the AdaBoost-based ML phrase breaking enabled
+  adaboost-icu4j-build-and-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout and setup
+        uses: actions/checkout@v2
+        with:
+          lfs: true
+      - name: Checkout lfs objects
+        run: git lfs pull
+      - uses: actions/setup-java@v3
+        with:
+          distribution: 'temurin'
+          java-version: '11'
+      - name: Config Adaboost and Rebuild data jar
+        run: |
+          cd icu4c/source;
+          ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data ICU_DATA_FILTER_FILE=../../.github/adaboost.json CPPFLAGS=-DUCONFIG_USE_ML_PHRASE_BREAKING=1 ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
+          make clean;
+          make -j2 ICU4J_ROOT=../../../icu4j icu4j-data-install;
+          cd ../..
+      - name: ICU4J
+        run: |
+          cd icu4j;
+          ant init;
+          ant -Dcom.ibm.icu.impl.breakiter.useMLPhraseBreaking=true check;
+          ant localespiCheck
+      - name: List failures (if any)
+        run: |
+          [ -d icu4j/out/junit-results ] && cd icu4j && cat `find out/junit-results -name "*.txt" -exec grep -l FAILED {} \;`;
+        if: ${{ failure() }}
+
   # gcc debug build.
   # Includes dependency checker.
   # Note - the dependency checker needs to be run on both a debug and an optimized build.

diff --git a/icu4j/build.xml b/icu4j/build.xml
@@ -338,11 +338,13 @@
             <!--set the property - if it was set before it won't override-->
             <property name="user-jvm-options" value=""/>
             <property name="internal-jvm-options" value=""/>
+            <property name="com.ibm.icu.impl.breakiter.useMLPhraseBreaking" value=""/>
             <delete dir="${junit.out.dir}/@{test-name}"/>
             <mkdir  dir="${junit.out.dir}/@{test-name}"/>
 
             <junit fork="yes" forkmode="once" printsummary="yes" haltonfailure="no"
                 failureproperty="@{failure-status}" tempdir="${junit.out.dir}">
+                <sysproperty key="com.ibm.icu.impl.breakiter.useMLPhraseBreaking" value="${com.ibm.icu.impl.breakiter.useMLPhraseBreaking}" />
                 <jvmarg value="-Xss4m"/>
                 <jvmarg value="-ea"/>
                 <jvmarg value="-Djava.awt.headless=true"/>

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties b/icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties
@@ -63,3 +63,9 @@ com.ibm.icu.impl.ICUResourceBundle.skipRuntimeLocaleResourceScan = false
 # LocaleDisplayNames implementation class
 # @internal
 # com.ibm.icu.text.LocaleDisplayNames.impl = com.ibm.icu.impl.LocaleDisplayNamesImpl
+
+#
+# [Internal Use Only]
+# Enable ML phrase breaking
+# @internal
+com.ibm.icu.impl.breakiter.useMLPhraseBreaking = false
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java
@@ -76,7 +76,7 @@ public String run() {
             val = System.getProperty(name);
         }
 
-        if (val == null) {
+        if (val == null || val.equals("")) {
             val = CONFIG_PROPS.getProperty(name, def);
         }
         return val;

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java
@@ -18,6 +18,7 @@
 import java.util.HashSet;
 
 import com.ibm.icu.impl.Assert;
+import com.ibm.icu.impl.ICUConfig;
 import com.ibm.icu.impl.ICUData;
 import com.ibm.icu.text.Normalizer;
 import com.ibm.icu.text.UnicodeSet;
@@ -31,6 +32,8 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
     private UnicodeSet fClosePunctuationSet;
     private DictionaryMatcher fDictionary = null;
     private HashSet<String> fSkipSet;
+    private MlBreakEngine fMlBreakEngine;
+    private boolean isCj = false;
 
     public CjkBreakEngine(boolean korean) throws IOException {
         fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
@@ -47,9 +50,16 @@ public CjkBreakEngine(boolean korean) throws IOException {
         if (korean) {
             setCharacters(fHangulWordSet);
         } else { //Chinese and Japanese
+            isCj = true;
             UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
             setCharacters(cjSet);
-            initializeJapanesePhraseParamater();
+            if (Boolean.parseBoolean(
+                    ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
+                fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
+                        fClosePunctuationSet);
+            } else {
+                initializeJapanesePhraseParamater();
+            }
         }
     }
 
@@ -151,6 +161,15 @@ public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int e
                 charPositions[numCodePts] = index;
             }
         }
+        // Delegate to the ML phrase-breaking engine when it is enabled via ICUConfig.
+        if (Boolean.parseBoolean(
+                ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
+            // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
+            if (isPhraseBreaking && isCj) {
+                return fMlBreakEngine.divideUpRange(inText, startPos, endPos, text,
+                        numCodePts, charPositions, foundBreaks);
+            }
+        }
 
         // From here on out, do the algorithm. Note that our indices
         // refer to indices within the normalized string.
@@ -276,10 +295,11 @@ public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int e
             // In phrase breaking, there has to be a breakpoint between Cj character and close
             // punctuation.
             // E.g.［携帯電話］正しい選択 -> ［携帯▁電話］▁正しい▁選択 -> breakpoint between ］ and 正
+            inText.setIndex(pos);
             if (pos > previous) {
                 if (pos != startPos
                         || (isPhraseBreaking && pos > 0
-                        && fClosePunctuationSet.contains(inText.setIndex(pos - 1)))) {
+                        && fClosePunctuationSet.contains(previous32(inText)))) {
                     foundBreaks.push(charPositions[t_boundary[i]] + startPos);
                     correctedNumBreaks++;
                 }
@@ -294,7 +314,9 @@ public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int e
             // E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９
             // E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！-> breakpoint between が and Ｕ
             if (isPhraseBreaking) {
-                if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(inText.setIndex(endPos))) {
+                inText.setIndex(endPos);
+                int current = current32(inText);
+                if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) {
                     foundBreaks.pop();
                     correctedNumBreaks--;
                 }