Skip to content

Commit

Permalink
Updates to desired/forbidden characters to include Arabic diacritics, …
Browse files Browse the repository at this point in the history
…extra Devanagari characters, vertical Chinese, Japanese, some more sublang defaults
  • Loading branch information
theraysmith committed Jan 13, 2017
1 parent 3000efe commit 3ab6581
Show file tree
Hide file tree
Showing 42 changed files with 994 additions and 20,859 deletions.
8 changes: 8 additions & 0 deletions ara/desired_characters
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
ّ
َ
ً
ُ
ٌ
ِ
ٍ
ْ
1 change: 0 additions & 1 deletion ara/forbidden_characters
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
0x2015
0x2043
0x0600-0x0615
0x064b-0x065f
0x0670-0x06ff
0x0000-0x001f
0x0080-0x00a0
Expand Down
1 change: 1 addition & 0 deletions chi_sim/chi_sim.config
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
tessedit_load_sublangs chi_sim_vert
# Important configurations for CJK mode

# New Segmentation search params
Expand Down
46 changes: 46 additions & 0 deletions chi_sim_vert/chi_sim_vert.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Important configurations for CJK mode

# New Segmentation search params
language_model_ngram_on 1
segsearch_max_char_wh_ratio 1.3
language_model_ngram_space_delimited_language F
language_model_use_sigmoidal_certainty T
language_model_ngram_nonmatch_score -20

# Chinese symbols are more complex than Latin. Adjust the blob filtering
# thresholds so they do not get filtered out as noise or containers.
# Also force word segmentation to reduce the length of blob sequences.
edges_use_new_outline_complexity T
edges_children_fix T
edges_max_children_per_outline 10
edges_max_children_layers 5
edges_children_count_limit 1000
tosp_force_wordbreak_on_punct F
textord_force_make_prop_words T
textord_noise_sizelimit 0.2
textord_noise_normratio 6
textord_max_noise_size 7

# Make use of the fact that Chinese is monospaced: incorporate that
# into the character segmentation cost for search, and inject this cost
# into the word rating.
classify_integer_matcher_multiplier 4
assume_fixed_pitch_char_segment T
tessedit_char_blacklist 氵宀灬丿丶ˇ幺扌亻〆‰囗
segment_nonalphabetic_script 1
allow_blob_division F

# Since we do not have a dictionary for Chinese yet, none of the results
# will be found in dawg. We need to relax the stopping condition and turn
# off dictionary based penalties. Instead, we get additional constraints
# from script consistency.
stopper_nondict_certainty_base -2

# Force use of a single x-height mode when the text is written horizontally.
# This information could come from the unicharset (script_has_xheight), but
# it is better in the config file so as to be available when training.
textord_single_height_mode T

# Use character height as x-height, and estimate it from character pitch and
# kerning width.
textord_use_cjk_fp_model T
93 changes: 93 additions & 0 deletions chi_sim_vert/desired_characters
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
!
"
#
$
%
&
*
'
(
)
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
\
]
_
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
{
|
}
28 changes: 28 additions & 0 deletions chi_sim_vert/forbidden_characters
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
0x0000-0x001f
0x0080-0x00ff
0x2010
0x2012
0x2013
0x2014
0x2015
0x201a
0x2022
0x2043
0x20d0-0x214f
0x2160-0x2bff
0x2e80-0x2eff
0x2f00-0x2fdf
0x3000
0x3003
0x3013
0x3004-0x3006
0x3020-0x303f
0x3094-0x309f
0x30fb-0x30fe
0x3100-0x312f
0x3200-0x33ff
0xd800-0xf8ff
0xf900-0xfaff
0xfb50-0xfe0f
0xfe70-0xfeff
0xff00-0xffef
1 change: 1 addition & 0 deletions chi_tra/chi_tra.config
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
tessedit_load_sublangs chi_tra_vert
# Important configurations for CJK mode

# New Segmentation search params
Expand Down
48 changes: 48 additions & 0 deletions chi_tra_vert/chi_tra_vert.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Important configurations for CJK mode

# New Segmentation search params
language_model_ngram_on 1
segsearch_max_char_wh_ratio 1.3
language_model_ngram_space_delimited_language F
language_model_use_sigmoidal_certainty T
language_model_ngram_nonmatch_score -20

# Chinese symbols are more complex than Latin. Adjust the blob filtering
# thresholds so they do not get filtered out as noise or containers.
# Also force word segmentation to reduce the length of blob sequences.
edges_use_new_outline_complexity T
edges_children_fix T
edges_max_children_per_outline 20
edges_max_children_layers 5
edges_children_count_limit 10000
tosp_force_wordbreak_on_punct F
textord_force_make_prop_words T
textord_noise_sizelimit 0.2
textord_noise_normratio 5
textord_max_noise_size 7

# Make use of the fact that Chinese is monospaced: incorporate that
# into the character segmentation cost for search, and inject this cost
# into the word rating.
classify_integer_matcher_multiplier 6
assume_fixed_pitch_char_segment T
tessedit_char_blacklist 氵宀灬丿丶ˇ幺扌亻〆‰囗
segment_nonalphabetic_script 1
allow_blob_division F

# Since we do not have a dictionary for Chinese yet, none of the results
# will be found in dawg. We need to relax the stopping condition and turn
# off dictionary based penalties. Instead, we get additional constraints
# from script consistency.
stopper_nondict_certainty_base -2
segment_penalty_dict_nonword 1.0
segment_penalty_garbage 1.0

# Force use of a single x-height mode when the text is written horizontally.
# This information could come from the unicharset (script_has_xheight), but
# it is better in the config file so as to be available when training.
textord_single_height_mode T

# Use character height as x-height, and estimate it from character pitch and
# kerning width.
textord_use_cjk_fp_model T
93 changes: 93 additions & 0 deletions chi_tra_vert/desired_characters
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
!
"
#
$
%
&
*
'
(
)
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
\
]
_
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
{
|
}
28 changes: 28 additions & 0 deletions chi_tra_vert/forbidden_characters
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
0x0000-0x001f
0x0080-0x00ff
0x2010
0x2012
0x2013
0x2014
0x2015
0x201a
0x2022
0x2043
0x20d0-0x214f
0x2160-0x2bff
0x2e80-0x2eff
0x2f00-0x2fdf
0x3000
0x3003
0x3013
0x3004-0x3006
0x3020-0x303f
0x3094-0x309f
0x30fb-0x30fe
0x3100-0x312f
0x3200-0x33ff
0xd800-0xf8ff
0xf900-0xfaff
0xfb50-0xfe0f
0xfe70-0xfeff
0xff00-0xffef
8 changes: 8 additions & 0 deletions div/desired_characters
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
ّ
َ
ً
ُ
ٌ
ِ
ٍ
ْ
Loading

0 comments on commit 3ab6581

Please sign in to comment.