forked from chromium/chromium
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdfium_page.h
472 lines (400 loc) · 17.2 KB
/
pdfium_page.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
// Copyright 2010 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef PDF_PDFIUM_PDFIUM_PAGE_H_
#define PDF_PDFIUM_PDFIUM_PAGE_H_
#include <map>
#include <set>
#include <string>
#include <vector>
#include "base/functional/callback.h"
#include "base/functional/callback_forward.h"
#include "base/gtest_prod_util.h"
#include "base/memory/raw_ptr.h"
#include "pdf/page_orientation.h"
#include "pdf/pdf_engine.h"
#include "third_party/abseil-cpp/absl/types/optional.h"
#include "third_party/pdfium/public/cpp/fpdf_scopers.h"
#include "third_party/pdfium/public/fpdf_doc.h"
#include "third_party/pdfium/public/fpdf_formfill.h"
#include "third_party/pdfium/public/fpdf_text.h"
#include "third_party/skia/include/core/SkBitmap.h"
#include "ui/gfx/geometry/point_f.h"
#include "ui/gfx/geometry/rect.h"
// Forward declarations of the gfx geometry types that this header names only
// in function signatures (parameters and return types).
namespace gfx {
class Point;
class RectF;
} // namespace gfx
namespace chrome_pdf {
// Forward declarations of chrome_pdf types that are not defined in this
// header. They are only named in declarations below: as a pointer member, as
// parameters, and as (possibly templated) return types.
class PDFiumEngine;
class Thumbnail;
struct AccessibilityHighlightInfo;
struct AccessibilityImageInfo;
struct AccessibilityLinkInfo;
struct AccessibilityTextFieldInfo;
struct AccessibilityTextRunInfo;
// Wrapper around a page from the document.
//
// Copying is disabled (the copy constructor and copy assignment operator are
// deleted); a move constructor is declared so instances can still be
// transferred.
class PDFiumPage {
public:
// NOTE(review): `i` is presumably the page index later reported by index() —
// confirm against the constructor definition.
PDFiumPage(PDFiumEngine* engine, int i);
PDFiumPage(const PDFiumPage&) = delete;
PDFiumPage& operator=(const PDFiumPage&) = delete;
PDFiumPage(PDFiumPage&& that);
~PDFiumPage();
// Unloads the PDFium data for this page from memory.
void Unload();
// Gets the FPDF_PAGE for this page, loading and parsing it if necessary.
FPDF_PAGE GetPage();
// Returns FPDF_TEXTPAGE for the page, loading and parsing it if necessary.
FPDF_TEXTPAGE GetTextPage();
// See definition of PDFEngine::GetTextRunInfo().
absl::optional<AccessibilityTextRunInfo> GetTextRunInfo(int start_char_index);
// Get a Unicode character from the page.
uint32_t GetCharUnicode(int char_index);
// Get the bounds of a character in page pixels.
gfx::RectF GetCharBounds(int char_index);
// Get the bounds of the page with the crop box applied, in page pixels.
gfx::RectF GetCroppedRect();
// Get the bounding box of the page in page pixels. The bounding box is the
// largest rectangle containing all visible content in the effective crop box.
// If the bounding box can't be calculated, returns the effective crop box.
// The resulting bounding box is relative to the effective crop box.
gfx::RectF GetBoundingBox();
// Returns whether the character at `char_index` is within `page_bounds`.
bool IsCharInPageBounds(int char_index, const gfx::RectF& page_bounds);
// For all the links on the page, get their urls, underlying text ranges and
// bounding boxes.
std::vector<AccessibilityLinkInfo> GetLinkInfo(
const std::vector<AccessibilityTextRunInfo>& text_runs);
// For all the images on the page, get their alt texts and bounding boxes. If
// the alt text is empty or unavailable, and if the user has requested that
// the OCR service tag the PDF so that it is made accessible, transfer the raw
// image pixels in the `image_data` field. Otherwise do not populate the
// `image_data` field.
std::vector<AccessibilityImageInfo> GetImageInfo(uint32_t text_run_count);
// For all the highlights on the page, get their underlying text ranges and
// bounding boxes.
std::vector<AccessibilityHighlightInfo> GetHighlightInfo(
const std::vector<AccessibilityTextRunInfo>& text_runs);
// For all the text fields on the page, get their properties like name,
// value, bounding boxes, etc.
std::vector<AccessibilityTextFieldInfo> GetTextFieldInfo(
uint32_t text_run_count);
// Kind of page area reported by the hit-testing and link-lookup methods
// below (GetCharIndex(), GetLinkTarget(), GetLinkTargetAtIndex()).
// NONSELECTABLE_AREA doubles as the failure result of the link lookups.
enum Area {
NONSELECTABLE_AREA,
TEXT_AREA, // Area contains regular, selectable text not
// within form fields.
WEBLINK_AREA, // Area is a hyperlink.
DOCLINK_AREA, // Area is a link to a different part of the same
// document.
FORM_TEXT_AREA, // Area is a form text field or form combobox text
// field.
};
// Describes where a link leads. Which fields are meaningful depends on the
// Area returned alongside the target (see the per-field comments).
struct LinkTarget {
// Special members are declared here and defined out of line.
LinkTarget();
LinkTarget(const LinkTarget& other);
~LinkTarget();
// Valid for WEBLINK_AREA only.
std::string url;
// Valid for DOCLINK_AREA only.
// NOTE(review): no in-class initializer; presumably set by the out-of-line
// default constructor — confirm.
int page;
// Valid for DOCLINK_AREA only. From the top-left of the page.
absl::optional<float> x_in_pixels;
absl::optional<float> y_in_pixels;
// Valid for DOCLINK_AREA only.
absl::optional<float> zoom;
};
// Given a `link_index`, returns the type of underlying area and the link
// target. `target` must be valid. Returns NONSELECTABLE_AREA if
// `link_index` is invalid.
Area GetLinkTargetAtIndex(int link_index, LinkTarget* target);
// Returns link type and fills target associated with a link. Returns
// NONSELECTABLE_AREA if link detection failed.
Area GetLinkTarget(FPDF_LINK link, LinkTarget* target);
// Fills the output params with the in-page coordinates and the zoom value of
// the destination.
void GetPageDestinationTarget(FPDF_DEST destination,
absl::optional<float>* dest_x,
absl::optional<float>* dest_y,
absl::optional<float>* zoom_value);
// For a named destination with "XYZ" view fit type, pre-processes the in-page
// x/y coordinate in case it's out of the range of the page dimension. Then
// transform it to a screen coordinate.
float PreProcessAndTransformInPageCoordX(float x);
float PreProcessAndTransformInPageCoordY(float y);
// Transforms an (x, y) position in page coordinates to screen coordinates.
gfx::PointF TransformPageToScreenXY(const gfx::PointF& xy);
// Transforms an in-page x coordinate to its value in screen coordinates.
float TransformPageToScreenX(float x);
// Transforms an in-page y coordinate to its value in screen coordinates.
float TransformPageToScreenY(float y);
// Given a point in the document that's in this page, returns its character
// index if it's near a character, and also the type of text.
// Target is optional. It will be filled in for WEBLINK_AREA or
// DOCLINK_AREA only.
Area GetCharIndex(const gfx::Point& point,
PageOrientation orientation,
int* char_index,
int* form_type,
LinkTarget* target);
// Converts a form type to its corresponding Area.
static Area FormTypeToArea(int form_type);
// Gets the character at the given index.
char16_t GetCharAtIndex(int index);
// Gets the number of characters in the page.
int GetCharCount();
// Returns true if the given `char_index` lies within the character range
// of the page.
bool IsCharIndexInBounds(int char_index);
// Given a rectangle in page coordinates, computes the range of continuous
// characters which lie inside that rectangle. Returns false without
// modifying the out parameters if no character lies inside the rectangle.
bool GetUnderlyingTextRangeForRect(const gfx::RectF& rect,
int* start_index,
int* char_len);
// Converts from page coordinates to screen coordinates.
gfx::Rect PageToScreen(const gfx::Point& page_point,
double zoom,
double left,
double top,
double right,
double bottom,
PageOrientation orientation) const;
// Sets the callbacks for sending the thumbnail.
void RequestThumbnail(float device_pixel_ratio,
SendThumbnailCallback send_callback);
// Generates a page thumbnail accommodating a specific `device_pixel_ratio`.
Thumbnail GenerateThumbnail(float device_pixel_ratio);
// Trivial accessors for `index_` and `rect_`.
int index() const { return index_; }
const gfx::Rect& rect() const { return rect_; }
void set_rect(const gfx::Rect& r) { rect_ = r; }
// Availability is a one-way transition: A page can become available, but it
// cannot become unavailable (unless deleted entirely).
bool available() const { return available_; }
void MarkAvailable();
// Overwrites the `calculated_links_` flag.
void set_calculated_links(bool calculated_links) {
calculated_links_ = calculated_links;
}
// Return the currently held PDFium handles without loading anything; compare
// GetPage() / GetTextPage(), which load and parse on demand.
FPDF_PAGE page() const { return page_.get(); }
FPDF_TEXTPAGE text_page() const { return text_page_.get(); }
private:
// Test fixtures and test cases that need access to private members.
friend class PDFiumPageLinkTest;
friend class PDFiumTestBase;
FRIEND_TEST_ALL_PREFIXES(PDFiumPageButtonTest, PopulateButtons);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageChoiceFieldTest, PopulateChoiceFields);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageHighlightTest, PopulateHighlights);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, CalculateImages);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, ImageAltText);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageDataTest, ImageData);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, AnnotLinkGeneration);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, GetLinkTarget);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, GetUTF8LinkTarget);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, LinkGeneration);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageOverlappingTest, CountCompleteOverlaps);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageOverlappingTest, CountPartialOverlaps);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageTextFieldTest, PopulateTextFields);
// Scoped helper bound to a single page for its whole lifetime.
// NOTE(review): presumably adjusts `preventing_unload_count_` in its
// constructor/destructor so that Unload() is suppressed while an instance is
// alive — confirm against the out-of-line definitions.
class ScopedUnloadPreventer {
public:
explicit ScopedUnloadPreventer(PDFiumPage* page);
~ScopedUnloadPreventer();
private:
// The page this object was constructed with; the pointer itself is const.
const raw_ptr<PDFiumPage> page_;
};
// Represents a link within the page.
struct Link {
Link();
Link(const Link& that);
~Link();
// Represents start index of underlying text range. Should be -1 if the link
// is not over text.
int32_t start_char_index = -1;
// Represents the number of characters that the link overlaps with.
int32_t char_count = 0;
// Rectangles covered by the link; a single link may have more than one.
std::vector<gfx::Rect> bounding_rects;
// Where the link leads.
LinkTarget target;
};
// Represents an Image inside the page.
struct Image {
Image();
Image(const Image& other);
~Image();
// NOTE(review): presumably the index of the image among the page's PDFium
// page objects — confirm. No in-class initializer.
int page_object_index;
// Alt text is available only for PDFs that are tagged for accessibility.
std::string alt_text;
gfx::Rect bounding_rect;
// Image data is only stored if the user has requested that the OCR service
// try to retrieve textual and layout information from this image. The
// bitmap will have the same size as the image in the PDF file, and will
// not be scaled.
SkBitmap image_data;
};
// Represents a highlight within the page.
struct Highlight {
Highlight();
Highlight(const Highlight& other);
~Highlight();
// Start index of underlying text range. -1 indicates invalid value.
int32_t start_char_index = -1;
// Number of characters encompassed by this highlight.
int32_t char_count = 0;
gfx::Rect bounding_rect;
// Color of the highlight in ARGB. Alpha is stored in the first 8 MSBs. RGB
// follows after it with each using 8 bits.
uint32_t color;
// Text of the popup note associated with highlight.
std::string note_text;
};
// Represents a form field within the page. Base of TextField, ChoiceField
// and Button.
struct FormField {
FormField();
FormField(const FormField& other);
~FormField();
gfx::Rect bounding_rect;
// Represents the name of form field as defined in the field dictionary.
std::string name;
// Represents the flags of form field as defined in the field dictionary.
int flags;
};
// Represents a text field within the page.
struct TextField : FormField {
TextField();
TextField(const TextField& other);
~TextField();
// NOTE(review): presumably the field's current text value — confirm.
std::string value;
};
// Represents a choice field option.
struct ChoiceFieldOption {
ChoiceFieldOption();
ChoiceFieldOption(const ChoiceFieldOption& other);
~ChoiceFieldOption();
std::string name;
// No in-class initializer.
bool is_selected;
};
// Represents a choice field within the page.
struct ChoiceField : FormField {
ChoiceField();
ChoiceField(const ChoiceField& other);
~ChoiceField();
std::vector<ChoiceFieldOption> options;
};
// Represents a button within the page.
struct Button : FormField {
Button();
Button(const Button& other);
~Button();
std::string value;
// A button can be of type radio, checkbox or push button.
int type;
// Represents if the radio button or checkbox is checked.
bool is_checked = false;
// Represents count of controls in the control group. A group of
// interactive form annotations is collectively called a form control
// group. Here an interactive form annotation should be either a radio
// button or a checkbox.
uint32_t control_count = 0;
// Represents index of the control in the control group. A group of
// interactive form annotations is collectively called a form control
// group. Here an interactive form annotation should be either a radio
// button or a checkbox. Value of `control_index` is -1 for push button.
int control_index = -1;
};
// Returns a link index if the given character index is over a link, or -1
// otherwise.
int GetLink(int char_index, LinkTarget* target);
// Calculate the locations of any links on the page.
void CalculateLinks();
// Populates weblinks on the page.
void PopulateWebLinks();
// Populates annotation links on the page.
void PopulateAnnotationLinks();
// Calculate the locations of images on the page.
void CalculateImages();
// Populate annotations like highlight and text field on the page.
void PopulateAnnotations();
// Populate `highlights_` with `annot`.
void PopulateHighlight(FPDF_ANNOTATION annot);
// Populate `text_fields_` with `annot`.
void PopulateTextField(FPDF_ANNOTATION annot);
// Populate `choice_fields_` with `annot`.
void PopulateChoiceField(FPDF_ANNOTATION annot);
// Populate `buttons_` with `annot`.
void PopulateButton(FPDF_ANNOTATION annot);
// Populate form fields like text field, choice field and button on the page.
void PopulateFormField(FPDF_ANNOTATION annot);
// Returns link type and fills target associated with a destination. Returns
// NONSELECTABLE_AREA if detection failed.
Area GetDestinationTarget(FPDF_DEST destination, LinkTarget* target);
// Returns link type and fills target associated with a URI action. Returns
// NONSELECTABLE_AREA if detection failed.
Area GetURITarget(FPDF_ACTION uri_action, LinkTarget* target) const;
// Calculates the set of character indices on which text runs need to be
// broken for page objects such as links and images.
void CalculatePageObjectTextRunBreaks();
// Key : Marked content id for the image element as specified in the
// struct tree.
// Value : Index of image in the `images_` vector.
using MarkedContentIdToImageMap = std::map<int, size_t>;
// Traverses the entire struct tree of the page recursively and extracts the
// alt text from struct tree elements corresponding to the marked content IDs
// present in `marked_content_id_image_map`.
void PopulateImageAltText(
const MarkedContentIdToImageMap& marked_content_id_image_map);
// Traverses a struct element and its sub-tree recursively and extracts the
// alt text from struct elements corresponding to the marked content IDs
// present in `marked_content_id_image_map`. Uses `visited_elements` to guard
// against malformed struct trees.
void PopulateImageAltTextForStructElement(
const MarkedContentIdToImageMap& marked_content_id_image_map,
FPDF_STRUCTELEMENT current_element,
std::set<FPDF_STRUCTELEMENT>* visited_elements);
// NOTE(review): presumably fills the FormField members of `form_field` from
// `annot` and returns false when they cannot be read — confirm against the
// definition.
bool PopulateFormFieldProperties(FPDF_ANNOTATION annot,
FormField* form_field);
// Generates and sends the thumbnail using `send_callback`.
void GenerateAndSendThumbnail(float device_pixel_ratio,
SendThumbnailCallback send_callback);
// Non-owning pointer to the engine.
// NOTE(review): presumably the `engine` constructor argument — confirm.
raw_ptr<PDFiumEngine> engine_;
// Scoped PDFium handles, exposed through page() and text_page().
ScopedFPDFPage page_;
ScopedFPDFTextPage text_page_;
// Returned by index(). No in-class initializer, so the constructors are
// responsible for setting it.
int index_;
// NOTE(review): presumably the number of live ScopedUnloadPreventer objects
// for this page — confirm.
int preventing_unload_count_ = 0;
// Accessed through rect() / set_rect().
gfx::Rect rect_;
// The `calculated_*_` flags below each sit next to the container(s) they
// appear to guard.
// NOTE(review): presumably each flag records that the matching Calculate*()
// / Populate*() step has already run — confirm.
bool calculated_links_ = false;
std::vector<Link> links_;
bool calculated_images_ = false;
std::vector<Image> images_;
bool calculated_annotations_ = false;
std::vector<Highlight> highlights_;
std::vector<TextField> text_fields_;
std::vector<ChoiceField> choice_fields_;
std::vector<Button> buttons_;
bool calculated_page_object_text_run_breaks_ = false;
// The set of character indices on which text runs need to be broken for page
// objects.
std::set<int> page_object_text_run_breaks_;
// NOTE(review): presumably a pending thumbnail request stored by
// RequestThumbnail() until the page can serve it — confirm.
base::OnceClosure thumbnail_callback_;
// Backing field for available(). No in-class initializer, so the
// constructors are responsible for setting it.
bool available_;
};
// Converts page orientations to the PDFium equivalents, as defined by
// FPDF_RenderPage(): 0 for the original orientation, then 1, 2 and 3 for
// 90, 180 and 270 degrees clockwise.
//
// A value outside the four enumerators (only obtainable through an invalid
// cast) yields 0 instead of flowing off the end of the function, which would
// be undefined behavior.
constexpr int ToPDFiumRotation(PageOrientation orientation) {
  // Could use static_cast<int>(orientation), but using an exhaustive switch
  // will trigger an error if we ever change the definition of
  // `PageOrientation`. For that reason there is deliberately no `default:`
  // label here.
  switch (orientation) {
    case PageOrientation::kOriginal:
      return 0;
    case PageOrientation::kClockwise90:
      return 1;
    case PageOrientation::kClockwise180:
      return 2;
    case PageOrientation::kClockwise270:
      return 3;
  }
  // Not reachable for valid enumerators. Keeps every control path returning
  // a value so that an out-of-range `orientation` is treated as unrotated.
  return 0;
}
// Packs four color channels into one 32-bit ARGB value: `a` is shifted into
// bits 24-31, `r` into bits 16-23, `g` into bits 8-15, and `b` occupies the
// low bits. The channels are not masked, so each argument is expected to fit
// in 8 bits; larger values spill into the neighboring channel.
constexpr uint32_t MakeARGB(unsigned int a,
                            unsigned int r,
                            unsigned int g,
                            unsigned int b) {
  const unsigned int alpha_bits = a << 24;
  const unsigned int red_bits = r << 16;
  const unsigned int green_bits = g << 8;
  return alpha_bits | red_bits | green_bits | b;
}
} // namespace chrome_pdf
#endif // PDF_PDFIUM_PDFIUM_PAGE_H_