Skip to content

Commit

Permalink
add default source for File types
Browse files (browse the repository at this point in the history)
The chunk_file() function adds the sources when chunking.
Without a default source, using a file that has not been chunked
raises an error.
  • Loading branch information
mmz-001 committed Aug 15, 2023
1 parent 2394245 commit d644d4d
Showing 1 changed file with 3 additions and 0 deletions.
3 changes: 3 additions & 0 deletions knowledge_gpt/core/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def from_bytes(cls, file: BytesIO) -> "DocxFile":
text = docx2txt.process(file)
text = strip_consecutive_newlines(text)
doc = Document(page_content=text.strip())
doc.metadata["source"] = "p-1"
return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])


Expand All @@ -76,6 +77,7 @@ def from_bytes(cls, file: BytesIO) -> "PdfFile":
text = strip_consecutive_newlines(text)
doc = Document(page_content=text.strip())
doc.metadata["page"] = i + 1
doc.metadata["source"] = f"p-{i+1}"
docs.append(doc)
# file.read() mutates the file object, which can affect caching
# so we need to reset the file pointer to the beginning
Expand All @@ -90,6 +92,7 @@ def from_bytes(cls, file: BytesIO) -> "TxtFile":
text = strip_consecutive_newlines(text)
file.seek(0)
doc = Document(page_content=text.strip())
doc.metadata["source"] = "p-1"
return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])


Expand Down

0 comments on commit d644d4d

Please sign in to comment.