forked from tesseract-ocr/langdata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
jpn_vert.config
65 lines (57 loc) · 2.56 KB
/
jpn_vert.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Important configurations for CJK mode
# Fix https://github.com/tesseract-ocr/tesseract/issues/991
preserve_interword_spaces 1
# New Segmentation search params
language_model_ngram_on 1
segsearch_max_char_wh_ratio 1.3
language_model_ngram_space_delimited_language F
language_model_ngram_scale_factor 0.1
language_model_use_sigmoidal_certainty T
language_model_ngram_nonmatch_score -20
# Always force the word associator to run. By contrast, "enable_assoc" is
# conditioned on permuter rejection, which may not be what we want until the
# dictionary issue for CJK is finalized. (Currently disabled: the setting
# below is commented out.)
#force_word_assoc T
# CJK characters are more complex than Latin letters. Adjust the blob filtering
# thresholds so that they are not filtered out as noise or containers.
# Also force word segmentation to reduce the length of blob sequences.
edges_use_new_outline_complexity T
edges_children_fix T
edges_max_children_per_outline 10
edges_max_children_layers 5
edges_children_count_limit 1000
tosp_force_wordbreak_on_punct F
textord_force_make_prop_words T
textord_noise_sizelimit 0.1
textord_noise_normratio 6
textord_max_noise_size 7
textord_noise_rejwords F
textord_no_rejects T
textord_tabfind_vertical_text_ratio 0.1
textord_tabfind_aligned_gap_fraction 0.5
textord_tabvector_vertical_box_ratio 0.3
textord_tabvector_vertical_gap_fraction 1.5
# Make use of the fact that CJK text is monospaced: incorporate that into the
# character segmentation cost for search, and inject this cost into the word
# rating.
classify_integer_matcher_multiplier 10
#classify_class_pruner_multiplier 15
assume_fixed_pitch_char_segment T
chop_enable F
tessedit_char_blacklist °
segment_nonalphabetic_script 1
allow_blob_division F
# Since we do not have a dictionary for CJK yet, none of the results will be
# found in the DAWG. We need to relax the stopping condition and turn off
# dictionary-based penalties. Instead, we get additional constraints from
# script consistency.
stopper_nondict_certainty_base -2
segment_penalty_dict_nonword 1.0
segment_penalty_garbage 1.0
# Force use of a single x-height mode when the text is written horizontally.
# This information could come from the unicharset (script_has_xheight), but
# it is better in the config file so as to be available when training.
textord_single_height_mode T
# Use character height as x-height, and estimate it from character pitch and
# kerning width.
textord_use_cjk_fp_model T