forked from ocrmypdf/OCRmyPDF
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hocrtransform.py
executable file
·482 lines (431 loc) · 16.5 KB
/
hocrtransform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2010 Jonathan Brinley
# SPDX-FileCopyrightText: 2013-2014 Julien Pfefferkorn
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Transform .hocr and page image to text PDF."""
from __future__ import annotations
import argparse
import os
import re
import warnings
from math import atan, cos, sin
from pathlib import Path
from typing import Any, NamedTuple
from xml.etree import ElementTree
with warnings.catch_warnings():
# reportlab uses deprecated load_module
warnings.filterwarnings(
'ignore', category=DeprecationWarning, message=r".*load_module.*"
)
from reportlab.lib.colors import black, cyan, magenta, red
from reportlab.lib.units import inch
from reportlab.pdfgen.canvas import Canvas
# According to Wikipedia these languages are supported in the ISO-8859-1 character
# set, meaning reportlab can generate them and they are compatible with hocr,
# assuming Tesseract has the necessary languages installed. Note that there may
# not be language packs for them.
# Some languages appear under two three-letter codes (for example 'baq'/'eus',
# 'ice'/'isl', 'dut'/'nld'), so that either spelling is accepted.
HOCR_OK_LANGS = frozenset(
    [
        # Languages fully covered by Latin-1:
        'afr',  # Afrikaans
        'alb',  # Albanian
        'ast',  # Leonese
        'baq',  # Basque
        'bre',  # Breton
        'cos',  # Corsican
        'eng',  # English
        'eus',  # Basque
        'fao',  # Faroese
        'gla',  # Scottish Gaelic
        'glg',  # Galician
        'glv',  # Manx
        'ice',  # Icelandic
        'ind',  # Indonesian
        'isl',  # Icelandic
        'ita',  # Italian
        'ltz',  # Luxembourgish
        # NOTE(review): in ISO 639-2 'mal' denotes Malayalam (Malay is
        # 'may'/'msa') -- confirm which language is intended here.
        'mal',  # Malay Rumi
        # NOTE(review): in ISO 639-2 'mga' denotes Middle Irish (modern Irish is
        # 'gle') -- confirm which language is intended here.
        'mga',  # Irish
        'nor',  # Norwegian
        'oci',  # Occitan
        'por',  # Portuguese
        'roh',  # Romansh
        'sco',  # Scots
        'sma',  # Sami
        'spa',  # Spanish
        'sqi',  # Albanian
        'swa',  # Swahili
        'swe',  # Swedish
        'tgl',  # Tagalog
        'wln',  # Walloon
        # Languages supported by Latin-1 except for a few rare characters that OCR
        # is probably not trained to recognize anyway:
        'cat',  # Catalan
        'cym',  # Welsh
        'dan',  # Danish
        'deu',  # German
        'dut',  # Dutch
        'est',  # Estonian
        'fin',  # Finnish
        'fra',  # French
        'hun',  # Hungarian
        'kur',  # Kurdish
        'nld',  # Dutch
        'wel',  # Welsh
    ]
)

# Short alias so annotations in this module can say ``Element``.
Element = ElementTree.Element
class Rect(NamedTuple):
    """A rectangle for managing PDF coordinates."""

    # (x1, y1) and (x2, y2) are two opposite corners of the rectangle.
    # This module stores both raw hOCR pixel values (ints parsed from a
    # ``bbox`` title) and values converted to PDF points (floats) in this
    # type, which is why the fields are annotated as ``Any``.
    x1: Any
    y1: Any
    x2: Any
    y2: Any
class HocrTransformError(Exception):
    """Error while applying hOCR transform.

    Raised by ``HocrTransform.__init__`` when the hOCR file does not provide
    page dimensions.
    """
class HocrTransform:
    """
    A class for converting documents from the hOCR format.

    For details of the hOCR format, see:
    http://kba.cloud/hocr-spec/

    An instance parses one hOCR file, remembers the page size in PDF points,
    and can then render the recognized text (optionally with the page image
    drawn over it) to a single-page PDF via :meth:`to_pdf`.
    """

    # "bbox" followed by four whitespace-separated non-negative integers.
    box_pattern = re.compile(r'bbox((\s+\d+){4})')
    # "baseline <slope> <intercept>" as found in an hOCR title attribute.
    baseline_pattern = re.compile(
        r'''
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
        ([\-\+]?\d+)            # +/- int''',
        re.VERBOSE,
    )
    # Maps each single-code-point Latin ligature to its plain-letter spelling.
    # Keys are written as escapes because str.maketrans() requires every string
    # key to be exactly one character long; multi-letter keys raise ValueError.
    ligatures = str.maketrans(
        {
            '\ufb00': 'ff',
            '\ufb03': 'ffi',
            '\ufb04': 'ffl',
            '\ufb01': 'fi',
            '\ufb02': 'fl',
        }
    )

    def __init__(self, *, hocr_filename: str | Path, dpi: float):
        """
        Parse the hOCR file and determine the page size.

        Arguments:
            hocr_filename: Path of the hOCR file to parse.
            dpi: Resolution used to convert hOCR pixel coordinates to points.

        Raises:
            HocrTransformError: If the file has no ``ocr_page`` div from which
                page dimensions could be read.
        """
        self.dpi = dpi
        self.hocr = ElementTree.parse(os.fspath(hocr_filename))

        # if the hOCR file has a namespace, ElementTree requires its use to
        # find elements
        matches = re.match(r'({.*})html', self.hocr.getroot().tag)
        self.xmlns = matches.group(1) if matches else ''

        # get dimension in pt (not pixel!!!!) of the OCRed image; there
        # shouldn't be more than one page, and if there is, only the first
        # one is used
        page = self.hocr.find(self._child_xpath('div', 'ocr_page'))
        if page is None:
            raise HocrTransformError("hocr file is missing page dimensions")
        pt_coords = self.pt_from_pixel(self.element_coordinates(page))
        self.width = pt_coords.x2 - pt_coords.x1
        self.height = pt_coords.y2 - pt_coords.y1

    def __str__(self) -> str:  # pragma: no cover
        """
        Return the textual content of the HTML body
        """
        if self.hocr is None:
            return ''
        body = self.hocr.find(self._child_xpath('body'))
        # Compare against None explicitly: the truth value of an Element
        # reflects whether it has children, so a body holding only text would
        # otherwise be treated as missing.
        if body is not None:
            return self._get_element_text(body)
        return ''

    def _get_element_text(self, element: Element) -> str:
        """
        Return the textual content of the element and its children

        The element's own tail text (text following its closing tag) is
        included at the end.
        """
        parts = [element.text or '']
        parts.extend(self._get_element_text(child) for child in element)
        parts.append(element.tail or '')
        return ''.join(parts)

    @classmethod
    def element_coordinates(cls, element: Element) -> Rect:
        """
        Returns a tuple containing the coordinates of the bounding box around
        an element

        The values are the integers from the ``bbox`` entry of the element's
        ``title`` attribute, or all zeros when there is no such entry.
        """
        matches = cls.box_pattern.search(element.attrib.get('title', ''))
        if not matches:
            return Rect(0, 0, 0, 0)
        # The pattern guarantees exactly four integer fields.
        return Rect(*(int(coord) for coord in matches.group(1).split()))

    @classmethod
    def baseline(cls, element: Element) -> tuple[float, float]:
        """
        Returns a tuple containing the baseline slope and intercept.

        ``(0.0, 0.0)`` is returned when the element's ``title`` attribute has
        no usable ``baseline`` entry.
        """
        matches = cls.baseline_pattern.search(element.attrib.get('title', ''))
        if not matches:
            return (0.0, 0.0)
        try:
            slope = float(matches.group(1))
        except ValueError:
            # The slope sub-pattern also matches degenerate text such as ''
            # or '.', which is not a number; treat that as "no baseline".
            return (0.0, 0.0)
        return slope, int(matches.group(2))

    def pt_from_pixel(self, pxl) -> Rect:
        """
        Returns the quantity in PDF units (pt) given quantity in pixels
        """
        return Rect._make((c / self.dpi * inch) for c in pxl)

    def _child_xpath(self, html_tag: str, html_class: str | None = None) -> str:
        """
        Build an XPath matching any descendant ``html_tag`` (in the document's
        namespace), optionally restricted to an exact ``class`` attribute.
        """
        xpath = f".//{self.xmlns}{html_tag}"
        if html_class:
            xpath += f"[@class='{html_class}']"
        return xpath

    @classmethod
    def replace_unsupported_chars(cls, s: str) -> str:
        """
        Given an input string, returns the corresponding string that:
        * is available in the Helvetica facetype
        * does not contain any ligature (to allow easy search in the PDF file)
        """
        return s.translate(cls.ligatures)

    def topdown_position(self, element):
        """
        Return the bottom edge (y2) of the element's bounding box, in points.
        """
        pxl_line_coords = self.element_coordinates(element)
        line_box = self.pt_from_pixel(pxl_line_coords)
        # Coordinates here are still in the hocr coordinate system, so 0 on the y axis
        # is the top of the page and increasing values of y will move towards the
        # bottom of the page.
        return line_box.y2

    def to_pdf(
        self,
        *,
        out_filename: Path,
        image_filename: Path | None = None,
        show_bounding_boxes: bool = False,
        fontname: str = "Helvetica",
        invisible_text: bool = False,
        interword_spaces: bool = False,
    ) -> None:
        """
        Creates a PDF file with an image superimposed on top of the text.

        Text is positioned according to the bounding box of the lines in
        the hOCR file.
        The image need not be identical to the image used to create the hOCR
        file.
        It can have a lower resolution, different color mode, etc.

        Arguments:
            out_filename: Path of PDF to write.
            image_filename: Image to use for this file. If omitted, the OCR text
                is shown.
            show_bounding_boxes: Show bounding boxes around various text regions,
                for debugging.
            fontname: Name of font to use.
            invisible_text: If True, text is rendered invisible so that is
                selectable but never drawn. If False, text is visible and may
                be seen if the image is skipped or deleted in Acrobat.
            interword_spaces: If True, insert spaces between words rather than
                drawing each word without spaces. Generally this improves text
                extraction.
        """
        # create the PDF file
        # page size in points (1/72 in.)
        pdf = Canvas(
            os.fspath(out_filename),
            pagesize=(self.width, self.height),
            pageCompression=1,
        )

        # Canvas state for the paragraph bounding boxes: cyan stroke and fill,
        # no border line. These calls are made unconditionally so the canvas
        # state is the same whether or not boxes are drawn.
        pdf.setStrokeColor(cyan)
        pdf.setFillColor(cyan)
        pdf.setLineWidth(0)  # no line for bounding box

        # Paragraphs are only visited to draw their debug boxes, so the scan
        # is skipped entirely when boxes are not requested.
        if show_bounding_boxes:  # pragma: no cover
            for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
                if not self._get_element_text(elem).rstrip():
                    continue
                pt = self.pt_from_pixel(  # pylint: disable=invalid-name
                    self.element_coordinates(elem)
                )
                # hOCR y grows downward, PDF y grows upward: flip about the
                # page height.
                pdf.rect(
                    pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=1
                )

        line_classes = {'ocr_header', 'ocr_line', 'ocr_textfloat'}
        found_lines = False
        for line in self.hocr.iterfind(self._child_xpath('span')):
            if line.attrib.get('class') not in line_classes:
                continue
            found_lines = True
            self._do_line(
                pdf,
                line,
                "ocrx_word",
                fontname,
                invisible_text,
                interword_spaces,
                show_bounding_boxes,
            )

        if not found_lines:
            # Tesseract did not report any lines (just words)
            root = self.hocr.find(self._child_xpath('div', 'ocr_page'))
            self._do_line(
                pdf,
                root,
                "ocrx_word",
                fontname,
                invisible_text,
                interword_spaces,
                show_bounding_boxes,
            )

        # put the image on the page, scaled to fill the page
        if image_filename is not None:
            pdf.drawImage(
                os.fspath(image_filename), 0, 0, width=self.width, height=self.height
            )

        # finish up the page and save it
        pdf.showPage()
        pdf.save()

    @classmethod
    def polyval(cls, poly, x):  # pragma: no cover
        """Evaluate the first-degree polynomial ``poly[0] * x + poly[1]``."""
        return x * poly[0] + poly[1]

    def _do_line(
        self,
        pdf: Canvas,
        line: Element | None,
        elemclass: str,
        fontname: str,
        invisible_text: bool,
        interword_spaces: bool,
        show_bounding_boxes: bool,
    ):
        """
        Draw the words of one hOCR line onto the canvas as a single text object.

        Arguments:
            pdf: Canvas to draw on.
            line: Element containing the words; nothing is drawn if None.
            elemclass: ``class`` attribute value of the word spans to render.
            fontname: Name of font to use.
            invisible_text: If True, use text render mode 3 (invisible).
            interword_spaces: If True, append a space to every word.
            show_bounding_boxes: If True, also draw the baseline and word boxes.
        """
        if line is None:
            return
        pxl_line_coords = self.element_coordinates(line)
        line_box = self.pt_from_pixel(pxl_line_coords)
        line_height = line_box.y2 - line_box.y1

        slope, pxl_intercept = self.baseline(line)
        # Treat nearly-horizontal baselines as exactly horizontal.
        if abs(slope) < 0.005:
            slope = 0.0
        angle = atan(slope)
        cos_a, sin_a = cos(angle), sin(angle)

        text = pdf.beginText()
        intercept = pxl_intercept / self.dpi * inch

        # Don't allow the font to break out of the bounding box. Division by
        # cos_a accounts for extra clearance between the glyph's vertical axis
        # on a sloped baseline and the edge of the bounding box.
        fontsize = (line_height - abs(intercept)) / cos_a
        text.setFont(fontname, fontsize)
        if invisible_text:
            text.setTextRenderMode(3)  # Invisible (indicates OCR text)

        # Intercept is normally negative, so this places it above the bottom
        # of the line box
        baseline_y2 = self.height - (line_box.y2 + intercept)

        if show_bounding_boxes:  # pragma: no cover
            # draw the baseline in magenta, as a solid line
            pdf.setDash()
            pdf.setStrokeColor(magenta)
            pdf.setLineWidth(0.5)
            # negate slope because it is defined as a rise/run in pixel
            # coordinates and page coordinates have the y axis flipped
            pdf.line(
                line_box.x1,
                baseline_y2,
                line_box.x2,
                self.polyval((-slope, baseline_y2), line_box.x2 - line_box.x1),
            )
            # red, dashed, for bounding box of word/line
            pdf.setDash(6, 3)
            pdf.setStrokeColor(red)

        text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, line_box.x1, baseline_y2)
        pdf.setFillColor(black)  # text in black

        for elem in line.findall(self._child_xpath('span', elemclass)):
            elemtxt = self._get_element_text(elem).strip()
            elemtxt = self.replace_unsupported_chars(elemtxt)
            if not elemtxt:
                continue

            pxl_coords = self.element_coordinates(elem)
            box = self.pt_from_pixel(pxl_coords)
            if interword_spaces:
                # if `--interword-spaces` is true, append a space
                # to the end of each text element to allow simpler PDF viewers
                # such as PDF.js to better recognize words in search and copy
                # and paste. Do not remove space from last word in line, even
                # though it would look better, because it will interfere with
                # naive text extraction. \n does not work either.
                elemtxt += ' '
                box = Rect(
                    box.x1,
                    line_box.y1,
                    box.x2 + pdf.stringWidth(' ', fontname, line_height),
                    line_box.y2,
                )
            box_width = box.x2 - box.x1
            font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

            # draw the bbox border
            if show_bounding_boxes:  # pragma: no cover
                pdf.rect(
                    box.x1, self.height - line_box.y2, box_width, line_height, fill=0
                )

            # Adjust relative position of cursor
            # This is equivalent to:
            #   text.setTextOrigin(pt.x1, self.height - line_box.y2)
            # but the former generates a full text reposition matrix (Tm) in the
            # content stream while this issues a "offset" (Td) command.
            # .moveCursor() is relative to start of the text line, where the
            # "text line" means whatever reportlab defines it as. Do not
            # use .getCursor(), since moveCursor() rather unintuitively plans
            # its moves relative to .getStartOfLine().
            # For skewed lines, in the text transform we set up a rotated
            # coordinate system, so we don't have to account for the
            # incremental offset. Surprisingly most PDF viewers can handle this.
            cursor = text.getStartOfLine()
            dx = box.x1 - cursor[0]
            dy = baseline_y2 - cursor[1]
            text.moveCursor(dx, dy)

            # If reportlab tells us this word is 0 units wide, our best seems
            # to be to suppress this text
            if font_width > 0:
                # Stretch or squeeze the glyphs so the word exactly spans its box.
                text.setHorizScale(100 * box_width / font_width)
                text.textOut(elemtxt)
        pdf.drawText(text)
if __name__ == "__main__":
    # Command-line entry point: convert one hOCR file (and optionally the page
    # image) into a PDF using HocrTransform.
    parser = argparse.ArgumentParser(description='Convert hocr file to PDF')
    parser.add_argument(
        '-b',
        '--boundingboxes',
        action="store_true",
        default=False,
        help='Show bounding boxes borders',
    )
    parser.add_argument(
        '-r',
        '--resolution',
        # HocrTransform takes the DPI as a float, so fractional resolutions
        # are accepted here too; whole numbers such as "300" still parse.
        type=float,
        default=300,
        help='Resolution of the image that was OCRed',
    )
    parser.add_argument(
        '-i',
        '--image',
        default=None,
        help='Path to the image to be placed above the text',
    )
    parser.add_argument(
        '--interword-spaces',
        action='store_true',
        default=False,
        help='Add spaces between words',
    )
    parser.add_argument('hocrfile', help='Path to the hocr file to be parsed')
    parser.add_argument('outputfile', help='Path to the PDF file to be generated')
    args = parser.parse_args()

    hocr = HocrTransform(hocr_filename=args.hocrfile, dpi=args.resolution)
    hocr.to_pdf(
        out_filename=args.outputfile,
        image_filename=args.image,
        show_bounding_boxes=args.boundingboxes,
        interword_spaces=args.interword_spaces,
    )