From ae2b9471591eca366a4f748b05fffeb879195446 Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Wed, 6 Mar 2024 17:12:28 +0100 Subject: [PATCH] Implement lazy_load() for WikipediaLoader --- .../document_loaders/wikipedia.py | 9 ++++----- .../langchain_community/utilities/wikipedia.py | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/wikipedia.py b/libs/community/langchain_community/document_loaders/wikipedia.py index ea7bd5887a352..2b67c51b2592a 100644 --- a/libs/community/langchain_community/document_loaders/wikipedia.py +++ b/libs/community/langchain_community/document_loaders/wikipedia.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import Iterator, Optional from langchain_core.documents import Document @@ -42,12 +42,12 @@ def __init__( self.load_all_available_meta = load_all_available_meta self.doc_content_chars_max = doc_content_chars_max - def load(self) -> List[Document]: + def lazy_load(self) -> Iterator[Document]: """ Loads the query result from Wikipedia into a list of Documents. Returns: - List[Document]: A list of Document objects representing the loaded + An iterator of Document objects representing the loaded Wikipedia pages. 
""" client = WikipediaAPIWrapper( @@ -56,5 +56,4 @@ def load(self) -> List[Document]: load_all_available_meta=self.load_all_available_meta, doc_content_chars_max=self.doc_content_chars_max, ) - docs = client.load(self.query) - return docs + yield from client.load(self.query) diff --git a/libs/community/langchain_community/utilities/wikipedia.py b/libs/community/langchain_community/utilities/wikipedia.py index 37dc064ffb4bd..3755368efcee3 100644 --- a/libs/community/langchain_community/utilities/wikipedia.py +++ b/libs/community/langchain_community/utilities/wikipedia.py @@ -1,6 +1,6 @@ """Util that calls Wikipedia.""" import logging -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Iterator, List, Optional from langchain_core.documents import Document from langchain_core.pydantic_v1 import BaseModel, root_validator @@ -104,13 +104,21 @@ def load(self, query: str) -> List[Document]: Returns: a list of documents. + """ + return list(self.lazy_load(query)) + + def lazy_load(self, query: str) -> Iterator[Document]: + """ + Run Wikipedia search and get the article text plus the meta information. + See + + Returns: an iterator of documents. + """ page_titles = self.wiki_client.search( query[:WIKIPEDIA_MAX_QUERY_LENGTH], results=self.top_k_results ) - docs = [] for page_title in page_titles[: self.top_k_results]: if wiki_page := self._fetch_page(page_title): if doc := self._page_to_document(page_title, wiki_page): - docs.append(doc) - return docs + yield doc