Updates to desired/forbidden characters to include Arabic diacritics, …

…extra Devanagari characters, vertical Chinese, Japanese, some more sublang defaults
crosindh · Jan 13, 2017 · 3ab6581 · 3ab6581
1 parent 3000efe
commit 3ab6581
Show file tree

Hide file tree

Showing 42 changed files with 994 additions and 20,859 deletions.
diff --git a/ara/desired_characters b/ara/desired_characters
@@ -0,0 +1,8 @@
+ّ
+َ
+ً
+ُ
+ٌ
+ِ
+ٍ
+ْ
diff --git a/ara/forbidden_characters b/ara/forbidden_characters
@@ -13,7 +13,6 @@
 0x2015
 0x2043
 0x0600-0x0615
-0x064b-0x065f
 0x0670-0x06ff
 0x0000-0x001f
 0x0080-0x00a0

diff --git a/chi_sim/chi_sim.config b/chi_sim/chi_sim.config
@@ -1,3 +1,4 @@
+tessedit_load_sublangs chi_sim_vert
 # Important configurations for CJK mode
 
 # New Segmentation search params

diff --git a/chi_sim_vert/chi_sim_vert.config b/chi_sim_vert/chi_sim_vert.config
@@ -0,0 +1,46 @@
+# Important configurations for CJK mode
+
+# New Segmentation search params
+language_model_ngram_on             1
+segsearch_max_char_wh_ratio         1.3
+language_model_ngram_space_delimited_language F
+language_model_use_sigmoidal_certainty T
+language_model_ngram_nonmatch_score -20
+
+# Chinese symbols are more complex than Latin.  Adjust the blob filtering
+# thresholds so they do not get filtered out as noise or containers.
+# Also force word segmentation to reduce the length of blob sequences.
+edges_use_new_outline_complexity    T
+edges_children_fix                  T
+edges_max_children_per_outline     10
+edges_max_children_layers           5
+edges_children_count_limit       1000
+tosp_force_wordbreak_on_punct       F
+textord_force_make_prop_words       T
+textord_noise_sizelimit             0.2
+textord_noise_normratio             6
+textord_max_noise_size              7
+
+# Make use of the fact that Chinese is monospaced, and incorporate that
+# into character segmentation cost for search, and inject this cost to
+# word rating.
+classify_integer_matcher_multiplier 4
+assume_fixed_pitch_char_segment     T
+tessedit_char_blacklist             氵宀灬丿丶ˇ幺扌亻〆‰囗
+segment_nonalphabetic_script        1
+allow_blob_division                 F
+
+# Since we do not have a dictionary for Chinese yet, none of the results
+# will be found in dawg.  We need to relax the stopping condition and turn
+# off dictionary based penalties.  Instead, we get additional constraints
+# from script consistency.
+stopper_nondict_certainty_base   -2
+
+# Force use of a single x-height mode when the text is written horizontally.
+# This information could come from the unicharset (script_has_xheight), but
+# it is better in the config file so as to be available when training.
+textord_single_height_mode        T
+
+# Use character height as x-height, and estimate it from character pitch and
+# kerning width.
+textord_use_cjk_fp_model T
diff --git a/chi_sim_vert/desired_characters b/chi_sim_vert/desired_characters
@@ -0,0 +1,93 @@
+!
+"
+#
+$
+%
+&
+*
+'
+(
+)
++
+,
+-
+.
+/
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+<
+=
+>
+?
+@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+[
+\
+]
+_
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+{
+|
+}
+…
+，
diff --git a/chi_sim_vert/forbidden_characters b/chi_sim_vert/forbidden_characters
@@ -0,0 +1,28 @@
+0x0000-0x001f
+0x0080-0x00ff
+0x2010
+0x2012
+0x2013
+0x2014
+0x2015
+0x201a
+0x2022
+0x2043
+0x20d0-0x214f
+0x2160-0x2bff
+0x2e80-0x2eff
+0x2f00-0x2fdf
+0x3000
+0x3003
+0x3013
+0x3004-0x3006
+0x3020-0x303f
+0x3094-0x309f
+0x30fb-0x30fe
+0x3100-0x312f
+0x3200-0x33ff
+0xd800-0xf8ff
+0xf900-0xfaff
+0xfb50-0xfe0f
+0xfe70-0xfeff
+0xff00-0xffef
diff --git a/chi_tra/chi_tra.config b/chi_tra/chi_tra.config
@@ -1,3 +1,4 @@
+tessedit_load_sublangs chi_tra_vert
 # Important configurations for CJK mode
 
 # New Segmentation search params

diff --git a/chi_tra_vert/chi_tra_vert.config b/chi_tra_vert/chi_tra_vert.config
@@ -0,0 +1,48 @@
+# Important configurations for CJK mode
+
+# New Segmentation search params
+language_model_ngram_on             1
+segsearch_max_char_wh_ratio         1.3
+language_model_ngram_space_delimited_language F
+language_model_use_sigmoidal_certainty T
+language_model_ngram_nonmatch_score -20
+
+# Chinese symbols are more complex than Latin.  Adjust the blob filtering
+# thresholds so they do not get filtered out as noise or containers.
+# Also force word segmentation to reduce the length of blob sequences.
+edges_use_new_outline_complexity    T
+edges_children_fix                  T
+edges_max_children_per_outline     20
+edges_max_children_layers           5
+edges_children_count_limit       10000
+tosp_force_wordbreak_on_punct       F
+textord_force_make_prop_words       T
+textord_noise_sizelimit             0.2
+textord_noise_normratio             5
+textord_max_noise_size              7
+
+# Make use of the fact that Chinese is monospaced, and incorporate that
+# into character segmentation cost for search, and inject this cost to
+# word rating.
+classify_integer_matcher_multiplier 6
+assume_fixed_pitch_char_segment     T
+tessedit_char_blacklist             氵宀灬丿丶ˇ幺扌亻〆‰囗
+segment_nonalphabetic_script        1
+allow_blob_division                 F
+
+# Since we do not have a dictionary for Chinese yet, none of the results
+# will be found in dawg.  We need to relax the stopping condition and turn
+# off dictionary based penalties.  Instead, we get additional constraints
+# from script consistency.
+stopper_nondict_certainty_base   -2
+segment_penalty_dict_nonword      1.0
+segment_penalty_garbage           1.0
+
+# Force use of a single x-height mode when the text is written horizontally.
+# This information could come from the unicharset (script_has_xheight), but
+# it is better in the config file so as to be available when training.
+textord_single_height_mode        T
+
+# Use character height as x-height, and estimate it from character pitch and
+# kerning width.
+textord_use_cjk_fp_model T
diff --git a/chi_tra_vert/desired_characters b/chi_tra_vert/desired_characters
@@ -0,0 +1,93 @@
+!
+"
+#
+$
+%
+&
+*
+'
+(
+)
++
+,
+-
+.
+/
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+<
+=
+>
+?
+@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+[
+\
+]
+_
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+{
+|
+}
+…
+，
diff --git a/chi_tra_vert/forbidden_characters b/chi_tra_vert/forbidden_characters
@@ -0,0 +1,28 @@
+0x0000-0x001f
+0x0080-0x00ff
+0x2010
+0x2012
+0x2013
+0x2014
+0x2015
+0x201a
+0x2022
+0x2043
+0x20d0-0x214f
+0x2160-0x2bff
+0x2e80-0x2eff
+0x2f00-0x2fdf
+0x3000
+0x3003
+0x3013
+0x3004-0x3006
+0x3020-0x303f
+0x3094-0x309f
+0x30fb-0x30fe
+0x3100-0x312f
+0x3200-0x33ff
+0xd800-0xf8ff
+0xf900-0xfaff
+0xfb50-0xfe0f
+0xfe70-0xfeff
+0xff00-0xffef
diff --git a/div/desired_characters b/div/desired_characters
@@ -0,0 +1,8 @@
+ّ
+َ
+ً
+ُ
+ٌ
+ِ
+ٍ
+ْ