forked from tesseract-ocr/langdata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
grc.config
28 lines (22 loc) · 1.05 KB
/
grc.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Tesseract Ancient Greek training <http://ancientgreekocr.org>
# Built from the http://ancientgreekocr.org/grctraining.git repository
# at commit: f7959cbcb09e989381171198c266939e0d715488
#
# Wordlists are derived from https://github.com/PerseusDL/canonical-greekLit
# at commit: 5d069b29bd9dd40c8bb1dc1b9e2623236ebb22b9
#
# NOTE(review): the numeric values below are presumably tuned for this
# training data; confirm against the grctraining repository before changing.
#
# Enable the new segmentation search (segsearch), which produces better
# results for this language.
enable_new_segsearch 1
# Increase the language-model penalty for incorrect punctuation. This is
# important because diacritics can easily be misrecognised as punctuation.
language_model_penalty_punc 0.35
# Increase the minimum line size. This minimises cases of accents
# being incorrectly recognised as separate lines.
textord_min_linesize 2.25
# The occupancy threshold also helps to ensure that accents aren't
# incorrectly recognised as separate lines.
textord_occupancy_threshold 0.7
# The excess blob size setting helps to ensure that rows don't overlap.
textord_excess_blobsize 0.6
# Disable rare, variant, and archaic characters and Greek numeral signs.
# Individual characters can be re-enabled with tessedit_char_unblacklist.
tessedit_char_blacklist ͰͱͲͳʹ͵ͶͷͻͼͽϏϐϑϒϔϕϖϗϘϙϚϛϜϝϞϟϠϡϰϱϲϳϴϵ϶ϹϺϻϼϽϾϿ