Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add x_tolerance_ratio param to extract_text and similar functions (now properly linted!) #1041

Merged
merged 8 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
linting
  • Loading branch information
afriedman412 committed Nov 1, 2023
commit c2564653e86e00479d23ae95baec4718c60565a6
1 change: 0 additions & 1 deletion pdfplumber/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def to_json(
precision: Optional[int] = None,
indent: Optional[int] = None,
) -> Optional[str]:

data = self.to_dict(object_types)

serialized = Serializer(
Expand Down
1 change: 0 additions & 1 deletion pdfplumber/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ def __init__(
include_attrs: Optional[List[str]] = None,
exclude_attrs: Optional[List[str]] = None,
):

self.precision = precision
self.attr_filter = get_attr_filter(
include_attrs=include_attrs, exclude_attrs=exclude_attrs
Expand Down
3 changes: 0 additions & 3 deletions pdfplumber/display.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def get_page_image(
password: Optional[str],
antialias: bool = False,
) -> PIL.Image.Image:

src: Union[pathlib.Path, BufferedReader, BytesIO]

# If we are working with a file object saved to disk
Expand Down Expand Up @@ -357,7 +356,6 @@ def outline_words(
x_tolerance: T_num = utils.DEFAULT_X_TOLERANCE,
y_tolerance: T_num = utils.DEFAULT_Y_TOLERANCE,
) -> "PageImage":

words = self.page.extract_words(
x_tolerance=x_tolerance, y_tolerance=y_tolerance
)
Expand All @@ -370,7 +368,6 @@ def outline_chars(
fill: T_color = (255, 0, 0, int(255 / 4)),
stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":

self.draw_rects(
self.page.chars, stroke=stroke, fill=fill, stroke_width=stroke_width
)
Expand Down
1 change: 0 additions & 1 deletion pdfplumber/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def open(
repair: bool = False,
gs_path: Optional[Union[str, pathlib.Path]] = None,
) -> "PDF":

stream: Union[BufferedReader, BytesIO]

if repair:
Expand Down
1 change: 0 additions & 1 deletion pdfplumber/repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ def _repair(
password: Optional[str] = None,
gs_path: Optional[Union[str, pathlib.Path]] = None,
) -> BytesIO:

executable = gs_path or shutil.which("gs") or shutil.which("gswin32c")
if executable is None: # pragma: nocover
raise Exception(
Expand Down
2 changes: 0 additions & 2 deletions pdfplumber/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,6 @@ def find_smallest_cell(points: List[T_point], i: int) -> Optional[T_bbox]:
and edge_connects(bottom_right, right_pt)
and edge_connects(bottom_right, below_pt)
):

return (pt[0], pt[1], bottom_right[0], bottom_right[1])
return None

Expand Down Expand Up @@ -397,7 +396,6 @@ def rows(self) -> List[Row]:
return rows

def extract(self, **kwargs: Any) -> List[List[Optional[str]]]:

chars = self.page.chars
table_arr = []

Expand Down
1 change: 0 additions & 1 deletion pdfplumber/utils/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def cluster_objects(
tolerance: T_num,
preserve_order: bool = False,
) -> List[List[R]]:

if not callable(key_fn):
key_fn = itemgetter(key_fn)

Expand Down
16 changes: 7 additions & 9 deletions pdfplumber/utils/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def search(
return_chars: bool = True,
main_group: int = 0,
) -> List[Dict[str, Any]]:

if isinstance(pattern, Pattern):
if regex is False:
raise ValueError(
Expand Down Expand Up @@ -347,8 +346,8 @@ def merge_chars(self, ordered_chars: T_obj_list) -> T_obj:
word[key] = ordered_chars[0][key]

return word
def set_tolerances_from_ratio(self, t: T_obj, axis_range: Iterable='x'):

def set_tolerances_from_ratio(self, t: T_obj, axis_range: Iterable = "x"):
"""
If there is a `tolerance_ratio` for any axis, overrides the tolerance with ratio * size of `t`. Allows for dynamic tolerances to react to different text sizes within a single call.

Expand All @@ -359,11 +358,9 @@ def set_tolerances_from_ratio(self, t: T_obj, axis_range: Iterable='x'):
for i in axis_range:
if self.__getattribute__(f"{i}_tolerance_ratio") is not None:
self.__setattr__(
f"{i}_tolerance",
set_tolerance(
t, self.__getattribute__(f"{i}_tolerance_ratio")
)
)
f"{i}_tolerance",
set_tolerance(t, self.__getattribute__(f"{i}_tolerance_ratio")),
)

def char_begins_new_word(
self,
Expand Down Expand Up @@ -607,5 +604,6 @@ def yield_unique_chars(chars: T_obj_list) -> Generator[T_obj, None, None]:
deduped = yield_unique_chars(chars)
return sorted(deduped, key=chars.index)


def set_tolerance(t, tolerance_ratio):
return tolerance_ratio*(t['bottom'] - t['top'])
return tolerance_ratio * (t["bottom"] - t["top"])
1 change: 0 additions & 1 deletion tests/test_ca_warn_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ def test_objects(self):
assert len(p.images)

def test_parse(self):

rect_x0_clusters = utils.cluster_list(
[r["x0"] for r in self.pdf.pages[1].rects], tolerance=3
)
Expand Down
2 changes: 0 additions & 2 deletions tests/test_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,12 @@ def determine_if_checked(checkbox, checklines):
"""

for cl in checklines:

if (
checkbox["height"] > (RECT_HEIGHT - RECT_TOLERANCE)
and (checkbox["height"] < RECT_HEIGHT + RECT_TOLERANCE)
and (checkbox["width"] < RECT_WIDTH + RECT_TOLERANCE)
and (checkbox["width"] < RECT_WIDTH + RECT_TOLERANCE)
):

xmatch = False
ymatch = False

Expand Down
9 changes: 5 additions & 4 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,11 @@ def test_decode_psl_list(self):

def test_x_tolerance_ratio(self):
pdf = pdfplumber.open(os.path.join(HERE, "pdfs/issue-987-test.pdf"))
assert pdf.pages[0].extract_text() == 'Big Te xt\nSmall Text'
assert pdf.pages[0].extract_text(x_tolerance=4) == 'Big Te xt\nSmallText'
assert pdf.pages[0].extract_text(x_tolerance_ratio=0.15) == 'Big Text\nSmall Text'
assert pdf.pages[0].extract_text() == "Big Te xt\nSmall Text"
assert pdf.pages[0].extract_text(x_tolerance=4) == "Big Te xt\nSmallText"
assert (
pdf.pages[0].extract_text(x_tolerance_ratio=0.15) == "Big Text\nSmall Text"
)

def test_extract_words(self):
path = os.path.join(HERE, "pdfs/issue-192-example.pdf")
Expand Down Expand Up @@ -97,7 +99,6 @@ def test_extract_words(self):
def test_extract_words_punctuation(self):
path = os.path.join(HERE, "pdfs/test-punkt.pdf")
with pdfplumber.open(path) as pdf:

wordsA = pdf.pages[0].extract_words(split_at_punctuation=True)
wordsB = pdf.pages[0].extract_words(split_at_punctuation=False)
wordsC = pdf.pages[0].extract_words(
Expand Down