forked from tesseract-ocr/langdata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
chi_sim.config
47 lines (41 loc) · 1.92 KB
/
chi_sim.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Load the chi_sim_vert sub-language together with this one.
# NOTE(review): presumably chi_sim_vert is the vertical-text companion of
# chi_sim -- confirm that it is shipped alongside this config.
tessedit_load_sublangs chi_sim_vert
# Important configurations for CJK mode.
# New segmentation search parameters.
# Going by the parameter names, these switch on the character n-gram language
# model, declare the language as not space-delimited, and adjust how candidate
# segmentations are scored.
# NOTE(review): the rationale for the numeric values (1.3, -20) is not recorded
# here -- check the Tesseract parameter documentation before changing them.
language_model_ngram_on 1
segsearch_max_char_wh_ratio 1.3
language_model_ngram_space_delimited_language F
language_model_use_sigmoidal_certainty T
language_model_ngram_nonmatch_score -20
# Chinese characters are more complex than Latin letters. Adjust the blob
# filtering thresholds so that they are not filtered out as noise or as
# containers. Also force word segmentation to reduce the length of blob
# sequences.
# NOTE(review): tosp_force_wordbreak_on_punct is F below, so the forced word
# segmentation mentioned above presumably comes from
# textord_force_make_prop_words -- confirm.
edges_use_new_outline_complexity T
edges_children_fix T
edges_max_children_per_outline 10
edges_max_children_layers 5
edges_children_count_limit 1000
tosp_force_wordbreak_on_punct F
textord_force_make_prop_words T
textord_noise_sizelimit 0.2
textord_noise_normratio 6
textord_max_noise_size 7
# Make use of the fact that Chinese is monospaced: incorporate that into the
# character segmentation cost used by the search, and inject this cost into
# the word rating.
# NOTE(review): the comment above does not say which of the settings below
# implement the fixed-pitch cost (assume_fixed_pitch_char_segment looks like
# the relevant switch) -- confirm.
classify_integer_matcher_multiplier 4
assume_fixed_pitch_char_segment T
# NOTE(review): the blacklisted characters look like radicals, single strokes
# and stray symbols that should not be output on their own -- confirm before
# editing the list.
tessedit_char_blacklist 氵宀灬丿丶ˇ幺扌亻〆‰囗
segment_nonalphabetic_script 1
allow_blob_division F
# Since we do not have a dictionary for Chinese yet, none of the results
# will be found in the dawg. We need to relax the stopping condition and turn
# off dictionary-based penalties. Instead, we get additional constraints
# from script consistency.
# NOTE(review): only the stopper setting is present here; no setting that
# disables dictionary-based penalties is visible in this section -- verify
# whether one is expected.
stopper_nondict_certainty_base -2
# Force the use of a single x-height mode when the text is written
# horizontally. This information could come from the unicharset
# (script_has_xheight), but it is kept in the config file so that it is also
# available when training.
textord_single_height_mode T
# Use the character height as the x-height, and estimate it from the character
# pitch and the kerning width.
# NOTE(review): "fp" in the parameter name presumably stands for fixed pitch
# -- confirm.
textord_use_cjk_fp_model T