Skip to content

Commit

Permalink
Merge pull request #103 from AI4WA/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
PascalSun authored Aug 24, 2024
2 parents 5b3cf04 + e72613f commit 9550c48
Show file tree
Hide file tree
Showing 18 changed files with 221 additions and 68 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ on:
push:
branches:
- main
- develop
pull_request:
branches:
- main
- develop

jobs:
lint:
Expand Down
23 changes: 16 additions & 7 deletions Docs2KG/kg/pdf_layout_kg.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,14 +269,17 @@ def link_image_to_context(self):
"uuid": str(uuid4()),
"node_properties": {
"text_block_bbox": text_block["bbox"],
"content": text_block["text"],
"content": str(text_block["text"]),
"position": key,
"text_block_number": int(text_block["block_number"]),
},
"children": [],
}
)
nearby_info_dict[key] = {"content": text_block["text"], "uuids": []}
nearby_info_dict[key] = {
"content": str(text_block["text"]),
"uuids": [],
}
"""
We also need to loop over the nodes within this page
if the text block is highly similar to a content node, then we can link them together
Expand Down Expand Up @@ -352,14 +355,17 @@ def link_table_to_context(self):
"uuid": str(uuid4()),
"node_properties": {
"text_block_bbox": text_block["bbox"],
"content": text_block["text"],
"content": str(text_block["text"]),
"position": key,
"text_block_number": int(text_block["block_number"]),
},
"children": [],
}
)
nearby_info_dict[key] = {"content": text_block["text"], "uuids": []}
nearby_info_dict[key] = {
"content": str(text_block["text"]),
"uuids": [],
}
nearby_info_dict = self.link_image_to_tree_node(page_node, nearby_info_dict)
for item in nearby_info:
key = item["node_properties"]["position"]
Expand Down Expand Up @@ -492,7 +498,7 @@ def _create_tree_node(cls, tag: str, node: dict) -> dict:
"""
node_uuid = str(uuid4())
node_properties = {
"content": node.get("content", ""),
"content": str(node.get("content", "")),
"text": json.dumps(node) if tag == "table" else "",
"records": node.get("children", []) if tag == "table" else [],
}
Expand Down Expand Up @@ -566,13 +572,16 @@ def link_image_to_tree_node(self, page_node: dict, nearby_info_dict: dict) -> di
for child in page_node["children"]:
# get the text
content = child["node_properties"].get("content", "")
content = str(content)
nearby_info_dict = self.link_image_to_tree_node(child, nearby_info_dict)
if content.strip() == "":
continue
for key, value in nearby_info_dict.items():
if content.strip() == value["content"].strip():
# convert the value to a string so both sides of the comparison are consistent
value_content = str(value["content"])
if content == value_content:
value["uuids"].append(child["uuid"])
elif self.text_bert_match(content, value["content"]):
elif self.text_bert_match(content, value_content):
value["uuids"].append(child["uuid"])

return nearby_info_dict
Expand Down
8 changes: 5 additions & 3 deletions Docs2KG/kg/web_layout_kg.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,15 @@ def __init__(
self.url = url
# extract the domain (scheme + host) from the url, e.g. for http://example.com/sss the domain is http://example.com
self.domain = f"{urlparse(url).scheme}://{urlparse(url).netloc}"

self.output_dir = output_dir
self.input_dir = input_dir
self.quoted_url = quote(url, "")
if self.output_dir is None:
self.output_dir = DATA_OUTPUT_DIR / self.quoted_url
self.output_dir.mkdir(parents=True, exist_ok=True)
if self.input_dir is None:
self.input_dir = DATA_INPUT_DIR
self.input_dir.mkdir(parents=True, exist_ok=True)

self.download_html_file()

Expand All @@ -63,7 +65,7 @@ def download_html_file(self):
"""
response = requests.get(self.url)
if response.status_code == 200:
with open(f"{DATA_INPUT_DIR}/index.html", "wb") as f:
with open(f"{self.input_dir}/index.html", "wb") as f:
f.write(response.content)
logger.info(f"Downloaded the HTML file from {self.url}")
else:
Expand All @@ -74,7 +76,7 @@ def create_kg(self):
Create the knowledge graph from the HTML file
"""
with open(f"{DATA_INPUT_DIR}/index.html", "r") as f:
with open(f"{self.input_dir}/index.html", "r") as f:
html_content = f.read()
soup = BeautifulSoup(html_content, "html.parser")
"""
Expand Down
12 changes: 10 additions & 2 deletions Docs2KG/modules/llm/markdown2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,11 @@ def openai_layout_json(self, markdown):
"content": f"Convert the following markdown to JSON format:\n\n{markdown}",
},
]
return self.llm_openai_call(messages)
try:
return self.llm_openai_call(messages)
except Exception as e:
logger.exception(e)
return ""

def openai_content_json(self, markdown: str):
"""
Expand All @@ -418,7 +422,11 @@ def openai_content_json(self, markdown: str):
"content": f"Convert the following markdown to JSON format:\n\n{markdown}",
},
]
return self.llm_openai_call(messages)
try:
return self.llm_openai_call(messages)
except Exception as e:
logger.exception(e)
return ""

def llm_openai_call(self, messages: List[dict]) -> str:
"""
Expand Down
4 changes: 3 additions & 1 deletion Docs2KG/parser/pdf/pdf2blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ def extract_df(self, output_csv: bool = False) -> Dict[str, pd.DataFrame]:
texts_df = pd.DataFrame(texts)
images_df = pd.DataFrame(images)
if output_csv:
texts_df.to_csv(self.text_output_dir / "blocks_texts.csv", index=False)
texts_df.to_csv(
self.text_output_dir / "blocks_texts.csv", index=False, escapechar="\\"
)
images_df.to_csv(self.images_output_dir / "blocks_images.csv", index=False)
return {
"texts": texts_df,
Expand Down
4 changes: 2 additions & 2 deletions Docs2KG/parser/pdf/pdf2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def extract2text(self, output_csv: bool = False) -> dict:

df = pd.DataFrame(texts)
if output_csv:
df.to_csv(self.text_output_dir / "text.csv", index=False)
df.to_csv(self.text_output_dir / "text.csv", index=False, escapechar="\\")
return {
"text": text,
"output_file": self.text_output_dir / "text.csv",
Expand Down Expand Up @@ -73,7 +73,7 @@ def extract2markdown(self, output_csv: bool = False) -> dict:
df = pd.DataFrame(markdown_texts)

if output_csv:
df.to_csv(self.text_output_dir / "md.csv", index=False)
df.to_csv(self.text_output_dir / "md.csv", index=False, escapechar="\\")
return {
"md": md_text,
"output_file": self.text_output_dir / "md.csv",
Expand Down
1 change: 1 addition & 0 deletions examples/compose/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ services:
- NEO4J_AUTH=neo4j/testpassword
- NEO4JLABS_PLUGINS=["apoc"]
- NEO4J_apoc_import_file_enabled=true
- NEO4J_apoc_export_file_enabled=true
- NEO4J_dbms_security_procedures_unrestricted=apoc.*
ports:
- 7474:7474
Expand Down
23 changes: 21 additions & 2 deletions examples/demo/dynamic_schema.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import argparse
from pathlib import Path

from Docs2KG.kg.dynamic_schema import DynamicSchema
from Docs2KG.utils.constants import DATA_OUTPUT_DIR

Expand All @@ -7,9 +10,25 @@
- You can hook this into the neo4j after the KG is loaded
2. Human in the loop for the schema merge
"""
kg_json_file = (
DATA_OUTPUT_DIR / "Excellent_Example_Report.pdf" / "kg" / "triplets_kg.json"
args = argparse.ArgumentParser()
args.add_argument(
"--kg_json_file",
type=str,
default=None,
help="The KG JSON File Absolute Path",
)
args.add_argument(
"--kg_json", type=str, default=None, help="The KG JSON File Absolute Path"
)

args = args.parse_args()

if not args.kg_json_file:
kg_json_file = (
DATA_OUTPUT_DIR / "Excellent_Example_Report.pdf" / "kg" / "triplets_kg.json"
)
else:
kg_json_file = Path(args.kg_json_file)
dynamic_schema = DynamicSchema(kg_json_file=kg_json_file)
dynamic_schema.schema_extraction()
dynamic_schema.schema_freq_merge()
Expand Down
29 changes: 24 additions & 5 deletions examples/demo/emails.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import argparse
from pathlib import Path

from Docs2KG.kg.email_layout_kg import EmailLayoutKG
from Docs2KG.kg.semantic_kg import SemanticKG
from Docs2KG.kg.utils.json2triplets import JSON2Triplets
Expand All @@ -6,7 +9,19 @@
from Docs2KG.utils.constants import DATA_INPUT_DIR

if __name__ == "__main__":
email_filename = DATA_INPUT_DIR / "email.eml"
args = argparse.ArgumentParser()
args.add_argument(
"--email_file", type=str, default=None, help="The Email File Absolute Path"
)
args.add_argument("--neo4j_uri", type=str, default="bolt://localhost:7687")
args.add_argument("--neo4j_username", type=str, default="neo4j")
args.add_argument("--neo4j_password", type=str, default="testpassword")

args = args.parse_args()
if not args.email_file:
email_filename = DATA_INPUT_DIR / "email.eml"
else:
email_filename = Path(args.email_file)
email_decomposer = EmailDecompose(email_file=email_filename)
email_decomposer.decompose_email()

Expand All @@ -20,11 +35,15 @@

json_2_triplets = JSON2Triplets(email_decomposer.output_dir)
json_2_triplets.transform()
uri = "bolt://localhost:7687" # if it is a remote graph db, you can change it to the remote uri
username = "neo4j"
password = "testpassword"

json_file_path = email_decomposer.output_dir / "kg" / "triplets_kg.json"

neo4j_loader = Neo4jLoader(uri, username, password, json_file_path, clean=True)
neo4j_loader = Neo4jLoader(
uri=args.neo4j_uri,
username=args.neo4j_username,
password=args.neo4j_password,
json_file_path=json_file_path,
clean=True,
)
neo4j_loader.load_data()
neo4j_loader.close()
36 changes: 30 additions & 6 deletions examples/demo/excel.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import argparse
from pathlib import Path

from Docs2KG.kg.excel_layout_kg import ExcelLayoutKG
from Docs2KG.kg.semantic_kg import SemanticKG
from Docs2KG.kg.utils.json2triplets import JSON2Triplets
Expand All @@ -15,7 +18,24 @@
1. For each sheet, extract the description stuff, and tables will be kept still in csv
2. Then create the kg mainly based on the description
"""
excel_file = DATA_INPUT_DIR / "excel" / "GCP_10002.xlsx"
argparse = argparse.ArgumentParser()
argparse.add_argument(
"--excel_file", type=str, default=None, help="The Excel File Absolute Path"
)
argparse.add_argument(
"--model_name", type=str, default="gpt-3.5-turbo", help="The model name"
)
argparse.add_argument("--neo4j_uri", type=str, default="bolt://localhost:7687")
argparse.add_argument("--neo4j_username", type=str, default="neo4j")
argparse.add_argument("--neo4j_password", type=str, default="testpassword")

args = argparse.parse_args()
# if you want to run this script, you can run it with `python excel.py --excel_file <excel_file>`
if not args.excel_file:
excel_file = DATA_INPUT_DIR / "excel" / "GCP_10002.xlsx"
else:
excel_file = Path(args.excel_file)

excel2table = Excel2Table(excel_file=excel_file)
excel2table.extract_tables_from_excel()

Expand All @@ -27,7 +47,7 @@

sheet_2_metadata = Sheet2Metadata(
excel2markdown.md_csv,
llm_model_name="gpt-3.5-turbo",
llm_model_name=args.model_name,
)
sheet_2_metadata.extract_metadata()

Expand All @@ -43,11 +63,15 @@

json_2_triplets = JSON2Triplets(excel2markdown.output_dir)
json_2_triplets.transform()
uri = "bolt://localhost:7687" # if it is a remote graph db, you can change it to the remote uri
username = "neo4j"
password = "testpassword"

json_file_path = excel2markdown.output_dir / "kg" / "triplets_kg.json"

neo4j_loader = Neo4jLoader(uri, username, password, json_file_path, clean=True)
neo4j_loader = Neo4jLoader(
uri=args.neo4j_uri,
username=args.neo4j_username,
password=args.neo4j_password,
json_file_path=json_file_path,
clean=True,
)
neo4j_loader.load_data()
neo4j_loader.close()
38 changes: 29 additions & 9 deletions examples/demo/pdf_exported.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import argparse
from pathlib import Path

from Docs2KG.kg.pdf_layout_kg import PDFLayoutKG
from Docs2KG.kg.semantic_kg import SemanticKG
from Docs2KG.kg.utils.json2triplets import JSON2Triplets
Expand Down Expand Up @@ -33,12 +36,26 @@
- However, the LLM-based approach looks much better than the rule-based one, due to the noise in the PDF
3. Graph Construction
"""
args = argparse.ArgumentParser()
args.add_argument(
"--pdf_file", type=str, default=None, help="The PDF File Absolute Path"
)
args.add_argument(
"--model_name", type=str, default="gpt-3.5-turbo", help="The model name"
)
args.add_argument("--neo4j_uri", type=str, default="bolt://localhost:7687")
args.add_argument("--neo4j_username", type=str, default="neo4j")
args.add_argument("--neo4j_password", type=str, default="testpassword")

args = args.parse_args()
# if you want to run this script, you can run it with `python pdf_exported.py --pdf_file <pdf_file>`
if not args.pdf_file:
pdf_file = DATA_INPUT_DIR / "MESAJ084.pdf"
else:
pdf_file = Path(args.pdf_file)

# you can name your file here
pdf_file = DATA_INPUT_DIR / "historic information.pdf"
output_folder = DATA_OUTPUT_DIR / pdf_file.name

output_folder = DATA_OUTPUT_DIR / "historic information.pdf"
# the output will be default to `DATA_OUTPUT_DIR / "4.pdf" /` folder
scanned_or_exported = get_scanned_or_exported(pdf_file)
if scanned_or_exported == PDF_TYPE_SCANNED:
logger.info("This is a scanned pdf, we will handle it in another demo")
Expand Down Expand Up @@ -86,7 +103,7 @@

markdown2json = LLMMarkdown2Json(
input_md_file,
llm_model_name="gpt-3.5-turbo",
llm_model_name=args.model_name,
)
markdown2json.extract2json()

Expand Down Expand Up @@ -116,11 +133,14 @@
# To get it quickly loaded into Neo4j,
# run `docker compose -f examples/compose/docker-compose.yml up`
# so that a Neo4j instance is running; then you can run `neo4j_connector.py` to load the data
uri = "bolt://localhost:7687" # if it is a remote graph db, you can change it to the remote uri
username = "neo4j"
password = "testpassword"
json_file_path = output_folder / "kg" / "triplets_kg.json"

neo4j_loader = Neo4jLoader(uri, username, password, json_file_path, clean=True)
neo4j_loader = Neo4jLoader(
uri=args.neo4j_uri,
username=args.neo4j_username,
password=args.neo4j_password,
json_file_path=json_file_path,
clean=True,
)
neo4j_loader.load_data()
neo4j_loader.close()
Loading

0 comments on commit 9550c48

Please sign in to comment.