Skip to content

Commit

Permalink
use pymupdf for faster pdf parsing
Browse files (browse the repository at this point in the history)
  • Loading branch information
mmz-001 committed Jul 8, 2023
1 parent 66d13ae commit 614ee24
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 19 deletions.
11 changes: 7 additions & 4 deletions knowledge_gpt/core/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import docx2txt
from langchain.docstore.document import Document
from pypdf import PdfReader
import fitz
from hashlib import md5

from abc import abstractmethod, ABC
Expand Down Expand Up @@ -69,14 +69,17 @@ def from_bytes(cls, file: BytesIO) -> "DocxFile":
class PdfFile(File):
@classmethod
def from_bytes(cls, file: BytesIO) -> "PdfFile":
pdf = PdfReader(file)
pdf = fitz.open(stream=file.read(), filetype="pdf") # type: ignore
docs = []
for i, page in enumerate(pdf.pages):
text = page.extract_text()
for i, page in enumerate(pdf):
text = page.get_text(sort=True)
text = strip_consecutive_newlines(text)
doc = Document(page_content=text.strip())
doc.metadata["page"] = i + 1
docs.append(doc)
# file.read() above advanced the stream position to the end of the file
# object, which can affect caching, so we need to reset the file pointer to
# the beginning before the content is read again for the md5 id below
file.seek(0)
return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=docs)


Expand Down
49 changes: 35 additions & 14 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ faiss-cpu = "^1.7.3"
openai = "^0.27.8"
docx2txt = "^0.8"
pillow = "^9.4.0"
pypdf = "^3.3.0"
tenacity = "^8.2.0"
tiktoken = "^0.4.0"
pycryptodome = "^3.18.0"
pymupdf = "^1.22.5"


[tool.poetry.group.dev.dependencies]
Expand Down

0 comments on commit 614ee24

Please sign in to comment.