# jpn_vert/jpn_vert.config

# Important configurations for CJK mode

# Workaround for the upstream issue
# https://github.com/tesseract-ocr/tesseract/issues/991
# NOTE(review): this boolean is written as 1 while most booleans below use
# T/F -- presumably both spellings are accepted; confirm before normalizing.
preserve_interword_spaces 1

# Segmentation search / language model parameters.
# The character n-gram model is enabled, and the language is declared as not
# being space-delimited.  The remaining values tune how the n-gram model is
# weighted (scale factor, sigmoidal certainty, score for non-matches).
# NOTE(review): the numeric values carry no rationale here -- presumably
# inherited from a shared CJK configuration; confirm before re-tuning.
language_model_ngram_on         1
segsearch_max_char_wh_ratio     1.3
language_model_ngram_space_delimited_language F
language_model_ngram_scale_factor 0.1
language_model_use_sigmoidal_certainty T
language_model_ngram_nonmatch_score -20

# force_word_assoc would make the word associator run unconditionally.
# By contrast, "enable_assoc" is conditioned on permuter rejection, which may
# not be what we want until the dictionary issue for CJK is finalized.
# The setting is currently disabled (left commented out).
#force_word_assoc   T

# CJK characters are more complex than Latin ones.  Adjust the blob filtering
# thresholds so that they are not filtered out as noise or as containers.
# Also force word segmentation to reduce the length of blob sequences.
# NOTE(review): this comment originally spoke of "Chinese symbols" although
# the file configures jpn_vert -- presumably inherited from a Chinese config.

# Outline / child-outline limits used when filtering blobs.
edges_use_new_outline_complexity    T
edges_children_fix                  T
edges_max_children_per_outline     10
edges_max_children_layers           5
edges_children_count_limit       1000

# Word segmentation and noise handling.
tosp_force_wordbreak_on_punct       F
textord_force_make_prop_words       T
textord_noise_sizelimit             0.1
textord_noise_normratio             6
textord_max_noise_size              7
textord_noise_rejwords              F
textord_no_rejects                  T

# Tab-finding thresholds related to vertical text.
# NOTE(review): presumably tuned for the vertical layout this config targets;
# confirm against the horizontal variant before changing.
textord_tabfind_vertical_text_ratio  0.1
textord_tabfind_aligned_gap_fraction 0.5
textord_tabvector_vertical_box_ratio  0.3
textord_tabvector_vertical_gap_fraction  1.5

# Make use of the fact that CJK text is monospaced: incorporate that into the
# character segmentation cost used by the search, and inject this cost into
# the word rating.
# NOTE(review): the original wording said "Chinese is monospaced"; this file
# is for jpn_vert, so the text was presumably inherited from a Chinese config.
classify_integer_matcher_multiplier 10
# Alternative tuning knob, currently disabled (left commented out).
#classify_class_pruner_multiplier    15
assume_fixed_pitch_char_segment     T
chop_enable                         F
# The degree sign is blacklisted.
tessedit_char_blacklist             °
segment_nonalphabetic_script        1
allow_blob_division                 F

# Since no dictionary is available yet, none of the results will be found in
# a DAWG.  Relax the stopping condition and turn off the dictionary-based
# penalties; additional constraints come from script consistency instead.
# NOTE(review): the original comment said "for Chinese"; confirm that the
# missing-dictionary assumption also holds for Japanese.
# NOTE(review): a penalty of 1.0 presumably means "no penalty" -- confirm.
stopper_nondict_certainty_base   -2
segment_penalty_dict_nonword      1.0
segment_penalty_garbage           1.0

# Force use of a single x-height mode when the text is written horizontally.
# This information could come from the unicharset (script_has_xheight), but
# it is better kept in the config file so that it is available when training.
# NOTE(review): this file targets vertical text (jpn_vert), yet the comment
# refers to horizontal text -- presumably inherited from the horizontal
# config; confirm the setting is still intended here.
textord_single_height_mode        T

# Enable the CJK-specific model: use the character height as the x-height,
# and estimate it from the character pitch and the kerning width.
textord_use_cjk_fp_model T