diff --git a/CHANGELOG.md b/CHANGELOG.md index a54c8306..4747a3ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [0.10.0-beta.5](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.4...v0.10.0-beta.5) (2024-05-09) + + +### Bug Fixes + +* fixed bugs for csv and xml ([324e977](https://github.com/VinciGit00/Scrapegraph-ai/commit/324e977b853ecaa55bac4bf86e7cd927f7f43d0d)) + ## [0.10.0-beta.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.3...v0.10.0-beta.4) (2024-05-09) diff --git a/pyproject.toml b/pyproject.toml index 90a86467..e8132120 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "scrapegraphai" -version = "0.10.0b4" +version = "0.10.0b5" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index 24c19234..178a9c47 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -30,7 +30,7 @@ def _create_graph(self): Creates the graph of nodes representing the workflow for web scraping. 
""" fetch_node = FetchNode( - input="csv_dir", + input="csv", output=["doc"], ) parse_node = ParseNode( @@ -78,4 +78,4 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") \ No newline at end of file + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index 945dc165..b487f6ae 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -56,7 +56,7 @@ def _create_graph(self) -> BaseGraph: """ fetch_node = FetchNode( - input="xml_dir", + input="xml", output=["doc"] ) parse_node = ParseNode( @@ -108,4 +108,4 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") \ No newline at end of file + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index c900b0a2..dfaf8bb6 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -1,7 +1,7 @@ """ FetchNode Module """ - +import pandas as pd from typing import List, Optional from langchain_community.document_loaders import AsyncChromiumLoader from langchain_core.documents import Document @@ -22,7 +22,7 @@ class FetchNode(BaseNode): Attributes: headless (bool): A flag indicating whether the browser should run in headless mode. verbose (bool): A flag indicating whether to print verbose output during execution. - + Args: input (str): Boolean expression defining the input keys needed from the state. output (List[str]): List of output keys to be updated in the state. 
@@ -30,11 +30,13 @@ class FetchNode(BaseNode): node_name (str): The unique identifier name for the node, defaulting to "Fetch". """ - def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"): + def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"): super().__init__(node_name, "node", input, output, 1) - self.headless = True if node_config is None else node_config.get("headless", True) - self.verbose = False if node_config is None else node_config.get("verbose", False) + self.headless = True if node_config is None else node_config.get( + "headless", True) + self.verbose = False if node_config is None else node_config.get( + "verbose", False) def execute(self, state): """ @@ -72,6 +74,16 @@ def execute(self, state): loader = PyPDFLoader(source) compressed_document = loader.load() + elif self.input == "csv": + compressed_document = [Document(page_content=str(pd.read_csv(source)), metadata={ + "source": "csv" + })] + elif self.input == "xml": + with open(source, 'r', encoding='utf-8') as f: + data = f.read() + compressed_document = [Document(page_content=data, metadata={ + "source": "xml" + })] elif self.input == "pdf_dir": pass @@ -82,7 +94,7 @@ def execute(self, state): else: if self.node_config is not None and self.node_config.get("endpoint") is not None: - + loader = AsyncChromiumLoader( [source], proxies={"http": self.node_config["endpoint"]},