Merge pull request #103 from AI4WA/develop

Develop
AI4WA · Aug 24, 2024 · 9550c48 · 9550c48
2 parents 5b3cf04 + e72613f
commit 9550c48
Show file tree

Hide file tree

Showing 18 changed files with 221 additions and 68 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -4,9 +4,11 @@ on:
   push:
     branches:
       - main
+      - develop
   pull_request:
     branches:
       - main
+      - develop
 
 jobs:
   lint:

diff --git a/Docs2KG/kg/pdf_layout_kg.py b/Docs2KG/kg/pdf_layout_kg.py
@@ -269,14 +269,17 @@ def link_image_to_context(self):
                             "uuid": str(uuid4()),
                             "node_properties": {
                                 "text_block_bbox": text_block["bbox"],
-                                "content": text_block["text"],
+                                "content": str(text_block["text"]),
                                 "position": key,
                                 "text_block_number": int(text_block["block_number"]),
                             },
                             "children": [],
                         }
                     )
-                    nearby_info_dict[key] = {"content": text_block["text"], "uuids": []}
+                    nearby_info_dict[key] = {
+                        "content": str(text_block["text"]),
+                        "uuids": [],
+                    }
             """
             We also need to loop the nodes within this page
             if the text block is highly similar to a content node, then we can link them together
@@ -352,14 +355,17 @@ def link_table_to_context(self):
                             "uuid": str(uuid4()),
                             "node_properties": {
                                 "text_block_bbox": text_block["bbox"],
-                                "content": text_block["text"],
+                                "content": str(text_block["text"]),
                                 "position": key,
                                 "text_block_number": int(text_block["block_number"]),
                             },
                             "children": [],
                         }
                     )
-                    nearby_info_dict[key] = {"content": text_block["text"], "uuids": []}
+                    nearby_info_dict[key] = {
+                        "content": str(text_block["text"]),
+                        "uuids": [],
+                    }
             nearby_info_dict = self.link_image_to_tree_node(page_node, nearby_info_dict)
             for item in nearby_info:
                 key = item["node_properties"]["position"]
@@ -492,7 +498,7 @@ def _create_tree_node(cls, tag: str, node: dict) -> dict:
         """
         node_uuid = str(uuid4())
         node_properties = {
-            "content": node.get("content", ""),
+            "content": str(node.get("content", "")),
             "text": json.dumps(node) if tag == "table" else "",
             "records": node.get("children", []) if tag == "table" else [],
         }
@@ -566,13 +572,16 @@ def link_image_to_tree_node(self, page_node: dict, nearby_info_dict: dict) -> di
         for child in page_node["children"]:
             # get the text
             content = child["node_properties"].get("content", "")
+            content = str(content)
             nearby_info_dict = self.link_image_to_tree_node(child, nearby_info_dict)
             if content.strip() == "":
                 continue
             for key, value in nearby_info_dict.items():
-                if content.strip() == value["content"].strip():
+                # coerce the stored value to a string so both sides are compared consistently
+                value_content = str(value["content"])
+                if content == value_content:
                     value["uuids"].append(child["uuid"])
-                elif self.text_bert_match(content, value["content"]):
+                elif self.text_bert_match(content, value_content):
                     value["uuids"].append(child["uuid"])
 
         return nearby_info_dict

diff --git a/Docs2KG/kg/web_layout_kg.py b/Docs2KG/kg/web_layout_kg.py
@@ -36,13 +36,15 @@ def __init__(
         self.url = url
         # extract the domain from the url, if it is http://example.com/sss, then the domain is http://example.com
         self.domain = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
-
         self.output_dir = output_dir
         self.input_dir = input_dir
         self.quoted_url = quote(url, "")
         if self.output_dir is None:
             self.output_dir = DATA_OUTPUT_DIR / self.quoted_url
             self.output_dir.mkdir(parents=True, exist_ok=True)
+        if self.input_dir is None:
+            self.input_dir = DATA_INPUT_DIR
+            self.input_dir.mkdir(parents=True, exist_ok=True)
 
         self.download_html_file()
 
@@ -63,7 +65,7 @@ def download_html_file(self):
         """
         response = requests.get(self.url)
         if response.status_code == 200:
-            with open(f"{DATA_INPUT_DIR}/index.html", "wb") as f:
+            with open(f"{self.input_dir}/index.html", "wb") as f:
                 f.write(response.content)
             logger.info(f"Downloaded the HTML file from {self.url}")
         else:
@@ -74,7 +76,7 @@ def create_kg(self):
         Create the knowledge graph from the HTML file
 
         """
-        with open(f"{DATA_INPUT_DIR}/index.html", "r") as f:
+        with open(f"{self.input_dir}/index.html", "r") as f:
             html_content = f.read()
         soup = BeautifulSoup(html_content, "html.parser")
         """

diff --git a/Docs2KG/modules/llm/markdown2json.py b/Docs2KG/modules/llm/markdown2json.py
@@ -391,7 +391,11 @@ def openai_layout_json(self, markdown):
                 "content": f"Convert the following markdown to JSON format:\n\n{markdown}",
             },
         ]
-        return self.llm_openai_call(messages)
+        try:
+            return self.llm_openai_call(messages)
+        except Exception as e:
+            logger.exception(e)
+            return ""
 
     def openai_content_json(self, markdown: str):
         """
@@ -418,7 +422,11 @@ def openai_content_json(self, markdown: str):
                 "content": f"Convert the following markdown to JSON format:\n\n{markdown}",
             },
         ]
-        return self.llm_openai_call(messages)
+        try:
+            return self.llm_openai_call(messages)
+        except Exception as e:
+            logger.exception(e)
+            return ""
 
     def llm_openai_call(self, messages: List[dict]) -> str:
         """

diff --git a/Docs2KG/parser/pdf/pdf2blocks.py b/Docs2KG/parser/pdf/pdf2blocks.py
@@ -70,7 +70,9 @@ def extract_df(self, output_csv: bool = False) -> Dict[str, pd.DataFrame]:
         texts_df = pd.DataFrame(texts)
         images_df = pd.DataFrame(images)
         if output_csv:
-            texts_df.to_csv(self.text_output_dir / "blocks_texts.csv", index=False)
+            texts_df.to_csv(
+                self.text_output_dir / "blocks_texts.csv", index=False, escapechar="\\"
+            )
             images_df.to_csv(self.images_output_dir / "blocks_images.csv", index=False)
         return {
             "texts": texts_df,

diff --git a/Docs2KG/parser/pdf/pdf2text.py b/Docs2KG/parser/pdf/pdf2text.py
@@ -40,7 +40,7 @@ def extract2text(self, output_csv: bool = False) -> dict:
 
         df = pd.DataFrame(texts)
         if output_csv:
-            df.to_csv(self.text_output_dir / "text.csv", index=False)
+            df.to_csv(self.text_output_dir / "text.csv", index=False, escapechar="\\")
             return {
                 "text": text,
                 "output_file": self.text_output_dir / "text.csv",
@@ -73,7 +73,7 @@ def extract2markdown(self, output_csv: bool = False) -> dict:
         df = pd.DataFrame(markdown_texts)
 
         if output_csv:
-            df.to_csv(self.text_output_dir / "md.csv", index=False)
+            df.to_csv(self.text_output_dir / "md.csv", index=False, escapechar="\\")
             return {
                 "md": md_text,
                 "output_file": self.text_output_dir / "md.csv",

diff --git a/examples/compose/docker-compose.yml b/examples/compose/docker-compose.yml
@@ -6,6 +6,7 @@ services:
       - NEO4J_AUTH=neo4j/testpassword
       - NEO4JLABS_PLUGINS=["apoc"]
       - NEO4J_apoc_import_file_enabled=true
+      - NEO4J_apoc_export_file_enabled=true
       - NEO4J_dbms_security_procedures_unrestricted=apoc.*
     ports:
       - 7474:7474

diff --git a/examples/demo/dynamic_schema.py b/examples/demo/dynamic_schema.py
@@ -1,3 +1,6 @@
+import argparse
+from pathlib import Path
+
 from Docs2KG.kg.dynamic_schema import DynamicSchema
 from Docs2KG.utils.constants import DATA_OUTPUT_DIR
 
@@ -7,9 +10,25 @@
         - You can hook this into the neo4j after the KG is loaded
     2. Human in the loop for the schema merge
     """
-    kg_json_file = (
-        DATA_OUTPUT_DIR / "Excellent_Example_Report.pdf" / "kg" / "triplets_kg.json"
+    args = argparse.ArgumentParser()
+    args.add_argument(
+        "--kg_json_file",
+        type=str,
+        default=None,
+        help="The KG JSON File Absolute Path",
+    )
+    args.add_argument(
+        "--kg_json", type=str, default=None, help="The KG JSON File Absolute Path"
     )
+
+    args = args.parse_args()
+
+    if not args.kg_json_file:
+        kg_json_file = (
+            DATA_OUTPUT_DIR / "Excellent_Example_Report.pdf" / "kg" / "triplets_kg.json"
+        )
+    else:
+        kg_json_file = Path(args.kg_json_file)
     dynamic_schema = DynamicSchema(kg_json_file=kg_json_file)
     dynamic_schema.schema_extraction()
     dynamic_schema.schema_freq_merge()

diff --git a/examples/demo/emails.py b/examples/demo/emails.py
@@ -1,3 +1,6 @@
+import argparse
+from pathlib import Path
+
 from Docs2KG.kg.email_layout_kg import EmailLayoutKG
 from Docs2KG.kg.semantic_kg import SemanticKG
 from Docs2KG.kg.utils.json2triplets import JSON2Triplets
@@ -6,7 +9,19 @@
 from Docs2KG.utils.constants import DATA_INPUT_DIR
 
 if __name__ == "__main__":
-    email_filename = DATA_INPUT_DIR / "email.eml"
+    args = argparse.ArgumentParser()
+    args.add_argument(
+        "--email_file", type=str, default=None, help="The Email File Absolute Path"
+    )
+    args.add_argument("--neo4j_uri", type=str, default="bolt://localhost:7687")
+    args.add_argument("--neo4j_username", type=str, default="neo4j")
+    args.add_argument("--neo4j_password", type=str, default="testpassword")
+
+    args = args.parse_args()
+    if not args.email_file:
+        email_filename = DATA_INPUT_DIR / "email.eml"
+    else:
+        email_filename = Path(args.email_file)
     email_decomposer = EmailDecompose(email_file=email_filename)
     email_decomposer.decompose_email()
 
@@ -20,11 +35,15 @@
 
     json_2_triplets = JSON2Triplets(email_decomposer.output_dir)
     json_2_triplets.transform()
-    uri = "bolt://localhost:7687"  # if it is a remote graph db, you can change it to the remote uri
-    username = "neo4j"
-    password = "testpassword"
+
     json_file_path = email_decomposer.output_dir / "kg" / "triplets_kg.json"
 
-    neo4j_loader = Neo4jLoader(uri, username, password, json_file_path, clean=True)
+    neo4j_loader = Neo4jLoader(
+        uri=args.neo4j_uri,
+        username=args.neo4j_username,
+        password=args.neo4j_password,
+        json_file_path=json_file_path,
+        clean=True,
+    )
     neo4j_loader.load_data()
     neo4j_loader.close()
diff --git a/examples/demo/excel.py b/examples/demo/excel.py
@@ -1,3 +1,6 @@
+import argparse
+from pathlib import Path
+
 from Docs2KG.kg.excel_layout_kg import ExcelLayoutKG
 from Docs2KG.kg.semantic_kg import SemanticKG
 from Docs2KG.kg.utils.json2triplets import JSON2Triplets
@@ -15,7 +18,24 @@
     1. For each sheet, extract the description stuff, and tables will be kept still in csv
     2. Then create the kg mainly based on the description
     """
-    excel_file = DATA_INPUT_DIR / "excel" / "GCP_10002.xlsx"
+    argparse = argparse.ArgumentParser()
+    argparse.add_argument(
+        "--excel_file", type=str, default=None, help="The Excel File Absolute Path"
+    )
+    argparse.add_argument(
+        "--model_name", type=str, default="gpt-3.5-turbo", help="The model name"
+    )
+    argparse.add_argument("--neo4j_uri", type=str, default="bolt://localhost:7687")
+    argparse.add_argument("--neo4j_username", type=str, default="neo4j")
+    argparse.add_argument("--neo4j_password", type=str, default="testpassword")
+
+    args = argparse.parse_args()
+    # if you want to run this script, you can run it with `python excel.py --excel_file <excel_file>`
+    if not args.excel_file:
+        excel_file = DATA_INPUT_DIR / "excel" / "GCP_10002.xlsx"
+    else:
+        excel_file = Path(args.excel_file)
+
     excel2table = Excel2Table(excel_file=excel_file)
     excel2table.extract_tables_from_excel()
 
@@ -27,7 +47,7 @@
 
     sheet_2_metadata = Sheet2Metadata(
         excel2markdown.md_csv,
-        llm_model_name="gpt-3.5-turbo",
+        llm_model_name=args.model_name,
     )
     sheet_2_metadata.extract_metadata()
 
@@ -43,11 +63,15 @@
 
     json_2_triplets = JSON2Triplets(excel2markdown.output_dir)
     json_2_triplets.transform()
-    uri = "bolt://localhost:7687"  # if it is a remote graph db, you can change it to the remote uri
-    username = "neo4j"
-    password = "testpassword"
+
     json_file_path = excel2markdown.output_dir / "kg" / "triplets_kg.json"
 
-    neo4j_loader = Neo4jLoader(uri, username, password, json_file_path, clean=True)
+    neo4j_loader = Neo4jLoader(
+        uri=args.neo4j_uri,
+        username=args.neo4j_username,
+        password=args.neo4j_password,
+        json_file_path=json_file_path,
+        clean=True,
+    )
     neo4j_loader.load_data()
     neo4j_loader.close()
diff --git a/examples/demo/pdf_exported.py b/examples/demo/pdf_exported.py
@@ -1,3 +1,6 @@
+import argparse
+from pathlib import Path
+
 from Docs2KG.kg.pdf_layout_kg import PDFLayoutKG
 from Docs2KG.kg.semantic_kg import SemanticKG
 from Docs2KG.kg.utils.json2triplets import JSON2Triplets
@@ -33,12 +36,26 @@
         - However, the LLM-based approach looks much better than the rule-based one, due to the noise in the PDF
     3. Graph Construction
     """
+    args = argparse.ArgumentParser()
+    args.add_argument(
+        "--pdf_file", type=str, default=None, help="The PDF File Absolute Path"
+    )
+    args.add_argument(
+        "--model_name", type=str, default="gpt-3.5-turbo", help="The model name"
+    )
+    args.add_argument("--neo4j_uri", type=str, default="bolt://localhost:7687")
+    args.add_argument("--neo4j_username", type=str, default="neo4j")
+    args.add_argument("--neo4j_password", type=str, default="testpassword")
+
+    args = args.parse_args()
+    # if you want to run this script, you can run it with `python pdf_exported.py --pdf_file <pdf_file>`
+    if not args.pdf_file:
+        pdf_file = DATA_INPUT_DIR / "MESAJ084.pdf"
+    else:
+        pdf_file = Path(args.pdf_file)
 
-    # you can name your file here
-    pdf_file = DATA_INPUT_DIR / "historic information.pdf"
+    output_folder = DATA_OUTPUT_DIR / pdf_file.name
 
-    output_folder = DATA_OUTPUT_DIR / "historic information.pdf"
-    # the output will be default to `DATA_OUTPUT_DIR / "4.pdf" /` folder
     scanned_or_exported = get_scanned_or_exported(pdf_file)
     if scanned_or_exported == PDF_TYPE_SCANNED:
         logger.info("This is a scanned pdf, we will handle it in another demo")
@@ -86,7 +103,7 @@
 
         markdown2json = LLMMarkdown2Json(
             input_md_file,
-            llm_model_name="gpt-3.5-turbo",
+            llm_model_name=args.model_name,
         )
         markdown2json.extract2json()
 
@@ -116,11 +133,14 @@
         # to get it quickly loaded into Neo4j
         # You can do is run the `docker compose -f examples/compose/docker-compose.yml up`
         # So we will have a Neo4j instance running, then you can run the `neo4j_connector.py` to load the data
-        uri = "bolt://localhost:7687"  # if it is a remote graph db, you can change it to the remote uri
-        username = "neo4j"
-        password = "testpassword"
         json_file_path = output_folder / "kg" / "triplets_kg.json"
 
-        neo4j_loader = Neo4jLoader(uri, username, password, json_file_path, clean=True)
+        neo4j_loader = Neo4jLoader(
+            uri=args.neo4j_uri,
+            username=args.neo4j_username,
+            password=args.neo4j_password,
+            json_file_path=json_file_path,
+            clean=True,
+        )
         neo4j_loader.load_data()
         neo4j_loader.close()