Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Async HTML loader and HTML2Text transformer #8036

Merged
merged 4 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e229e34c",
"metadata": {},
"source": [
"# AsyncHtmlLoader\n",
"\n",
"AsyncHtmlLoader loads raw HTML from a list of urls concurrently."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4c8e4dab",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import AsyncHtmlLoader"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e76b5ddc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching pages: 100%|############| 2/2 [00:00<00:00, 9.96it/s]\n"
]
}
],
"source": [
"urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n",
"loader = AsyncHtmlLoader(urls)\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5dca1c0c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"' news. Stream exclusive games on ESPN+ and play fantasy sports.\" />\\n<meta property=\"og:image\" content=\"https://a1.espncdn.com/combiner/i?img=%2Fi%2Fespn%2Fespn_logos%2Fespn_red.png\"/>\\n<meta property=\"og:image:width\" content=\"1200\" />\\n<meta property=\"og:image:height\" content=\"630\" />\\n<meta property=\"og:type\" content=\"website\" />\\n<meta name=\"twitter:site\" content=\"espn\" />\\n<meta name=\"twitter:url\" content=\"https://www.espn.com\" />\\n<meta name=\"twitter:title\" content=\"ESPN - Serving Sports Fans. Anytime. Anywhere.\"/>\\n<meta name=\"twitter:description\" content=\"Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.\" />\\n<meta name=\"twitter:card\" content=\"summary\">\\n<meta name=\"twitter:app:name:iphone\" content=\"ESPN\"/>\\n<meta name=\"twitter:app:id:iphone\" content=\"317469184\"/>\\n<meta name=\"twitter:app:name:googleplay\" content=\"ESPN\"/>\\n<meta name=\"twitter:app:id:googleplay\" content=\"com.espn.score_center\"/>\\n<meta name=\"title\" content=\"ESPN - '"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0].page_content[1000:2000]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4d024f0f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'al\" href=\"https://lilianweng.github.io/posts/2023-06-23-agent/\" />\\n<link crossorigin=\"anonymous\" href=\"/assets/css/stylesheet.min.67a6fb6e33089cb29e856bcc95d7aa39f70049a42b123105531265a0d9f1258b.css\" integrity=\"sha256-Z6b7bjMInLKehWvMldeqOfcASaQrEjEFUxJloNnxJYs=\" rel=\"preload stylesheet\" as=\"style\">\\n<script defer crossorigin=\"anonymous\" src=\"/assets/js/highlight.min.7680afc38aa6b15ddf158a4f3780b7b1f7dde7e91d26f073e6229bb7a0793c92.js\" integrity=\"sha256-doCvw4qmsV3fFYpPN4C3sffd5&#43;kdJvBz5iKbt6B5PJI=\"\\n onload=\"hljs.initHighlightingOnLoad();\"></script>\\n<link rel=\"icon\" href=\"https://lilianweng.github.io/favicon_peach.ico\">\\n<link rel=\"icon\" type=\"image/png\" sizes=\"16x16\" href=\"https://lilianweng.github.io/favicon-16x16.png\">\\n<link rel=\"icon\" type=\"image/png\" sizes=\"32x32\" href=\"https://lilianweng.github.io/favicon-32x32.png\">\\n<link rel=\"apple-touch-icon\" href=\"https://lilianweng.github.io/apple-touch-icon.png\">\\n<link rel=\"mask-icon\" href=\"https://lilianweng.github.io/safari-pinned-tab.'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[1].page_content[1000:2000]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "fe6e5c82",
"metadata": {},
"source": [
"# html2text\n",
"\n",
"[html2text](https://github.com/Alir3z4/html2text/) is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. \n",
"\n",
"The ASCII also happens to be valid Markdown (a text-to-HTML format)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce77e0cb",
"metadata": {},
"outputs": [],
"source": [
"! pip install html2text"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8ca0974b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching pages: 100%|############| 2/2 [00:00<00:00, 10.75it/s]\n"
]
}
],
"source": [
"from langchain.document_loaders import AsyncHtmlLoader\n",
"\n",
"urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n",
"loader = AsyncHtmlLoader(urls)\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ddf2be97",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_transformers import Html2TextTransformer"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a95a928c",
"metadata": {},
"outputs": [],
"source": [
"urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n",
"html2text = Html2TextTransformer()\n",
"docs_transformed = html2text.transform_documents(docs)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "18ef9fe9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\" * ESPNFC\\n\\n * X Games\\n\\n * SEC Network\\n\\n## ESPN Apps\\n\\n * ESPN\\n\\n * ESPN Fantasy\\n\\n## Follow ESPN\\n\\n * Facebook\\n\\n * Twitter\\n\\n * Instagram\\n\\n * Snapchat\\n\\n * YouTube\\n\\n * The ESPN Daily Podcast\\n\\n2023 FIFA Women's World Cup\\n\\n## Follow live: Canada takes on Nigeria in group stage of Women's World Cup\\n\\n2m\\n\\nEPA/Morgan Hancock\\n\\n## TOP HEADLINES\\n\\n * Snyder fined $60M over findings in investigation\\n * NFL owners approve $6.05B sale of Commanders\\n * Jags assistant comes out as gay in NFL milestone\\n * O's alone atop East after topping slumping Rays\\n * ACC's Phillips: Never condoned hazing at NU\\n\\n * Vikings WR Addison cited for driving 140 mph\\n * 'Taking his time': Patient QB Rodgers wows Jets\\n * Reyna got U.S. assurances after Berhalter rehire\\n * NFL Future Power Rankings\\n\\n## USWNT AT THE WORLD CUP\\n\\n### USA VS. VIETNAM: 9 P.M. ET FRIDAY\\n\\n## How do you defend against Alex Morgan? Former opponents sound off\\n\\nThe U.S. forward is unstoppable at this level, scoring 121 goals and adding 49\""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs_transformed[0].page_content[1000:2000]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6045d660",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"t's brain,\\ncomplemented by several key components:\\n\\n * **Planning**\\n * Subgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\\n * Reflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final results.\\n * **Memory**\\n * Short-term memory: I would consider all the in-context learning (See Prompt Engineering) as utilizing short-term memory of the model to learn.\\n * Long-term memory: This provides the agent with the capability to retain and recall (infinite) information over extended periods, often by leveraging an external vector store and fast retrieval.\\n * **Tool use**\\n * The agent learns to call external APIs for extra information that is missing from the model weights (often hard to change after pre-training), including current information, code execution c\""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs_transformed[1].page_content[1000:2000]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 2 additions & 0 deletions langchain/document_loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from langchain.document_loaders.airtable import AirtableLoader
from langchain.document_loaders.apify_dataset import ApifyDatasetLoader
from langchain.document_loaders.arxiv import ArxivLoader
from langchain.document_loaders.async_html import AsyncHtmlLoader
from langchain.document_loaders.azlyrics import AZLyricsLoader
from langchain.document_loaders.azure_blob_storage_container import (
AzureBlobStorageContainerLoader,
Expand Down Expand Up @@ -161,6 +162,7 @@

__all__ = [
"AcreomLoader",
"AsyncHtmlLoader",
"AZLyricsLoader",
"AirbyteJSONLoader",
"AirtableLoader",
Expand Down
138 changes: 138 additions & 0 deletions langchain/document_loaders/async_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""Web base loader class."""
import asyncio
import logging
import warnings
from typing import Any, Dict, Iterator, List, Optional, Union

import aiohttp
import requests

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)

# Browser-like request headers used when a loader is built without an explicit
# header template. "User-Agent" is intentionally blank: AsyncHtmlLoader fills
# it in from `fake_useragent` when that package is importable.
default_header_template = {
    "User-Agent": "",
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}


class AsyncHtmlLoader(BaseLoader):
    """Load the raw HTML of one or more URLs concurrently.

    Each URL is fetched with ``aiohttp`` and turned into one ``Document`` whose
    ``page_content`` is the response body text and whose ``metadata["source"]``
    is the URL it came from.
    """

    web_paths: List[str]

    requests_per_second: int = 2
    """Max number of concurrent requests to make."""

    requests_kwargs: Dict[str, Any] = {}
    """kwargs for requests"""

    raise_for_status: bool = False
    """Raise an exception if http status code denotes an error."""

    def __init__(
        self,
        web_path: Union[str, List[str]],
        header_template: Optional[dict] = None,
        verify_ssl: Optional[bool] = True,
        proxies: Optional[dict] = None,
    ):
        """Initialize with webpage path.

        Args:
            web_path: A single URL or a list of URLs to fetch.
            header_template: Request headers to send. Defaults to a copy of
                ``default_header_template``. If it carries no "User-Agent",
                a random one is generated when ``fake_useragent`` is installed.
            verify_ssl: Whether TLS certificates are verified.
            proxies: Proxy mapping stored on the underlying
                ``requests.Session``.
        """

        # TODO: Deprecate web_path in favor of web_paths, and remove this
        # left like this because there are a number of loaders that expect single
        # urls
        if isinstance(web_path, str):
            self.web_paths = [web_path]
        elif isinstance(web_path, list):
            self.web_paths = web_path
        else:
            # Accept any other iterable of URLs (tuple, generator, ...) rather
            # than leaving `web_paths` unset and failing later in `load()`.
            self.web_paths = list(web_path)

        # Work on a private copy: writing the generated User-Agent into the
        # caller's dict or the module-level default would leak it into every
        # loader created afterwards.
        headers = dict(header_template or default_header_template)
        if not headers.get("User-Agent"):
            try:
                from fake_useragent import UserAgent

                headers["User-Agent"] = UserAgent().random
            except ImportError:
                logger.info(
                    "fake_useragent not found, using default user agent. "
                    "To get a realistic header for requests, "
                    "`pip install fake_useragent`."
                )

        # The requests session is only used as a container for the headers,
        # TLS-verification flag and proxies; fetching itself goes via aiohttp.
        self.session = requests.Session()
        self.session.headers = headers
        self.session.verify = verify_ssl

        if proxies:
            self.session.proxies.update(proxies)

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        """Fetch one URL and return the response body as text.

        Connection errors are retried up to ``retries`` times, sleeping
        ``cooldown * backoff**attempt`` seconds between attempts.

        Raises:
            aiohttp.ClientConnectionError: If the last attempt still fails.
            aiohttp.ClientResponseError: If ``raise_for_status`` is set and the
                server answers with an error status.
            ValueError: If ``retries`` is not positive, so no attempt is made.
        """
        async with aiohttp.ClientSession() as session:
            for attempt in range(retries):
                try:
                    async with session.get(
                        url,
                        headers=self.session.headers,
                        ssl=None if self.session.verify else False,
                    ) as response:
                        if self.raise_for_status:
                            response.raise_for_status()
                        return await response.text()
                except aiohttp.ClientConnectionError as e:
                    if attempt == retries - 1:
                        raise
                    logger.warning(
                        f"Error fetching {url} with attempt "
                        f"{attempt + 1}/{retries}: {e}. Retrying..."
                    )
                    await asyncio.sleep(cooldown * backoff**attempt)
        raise ValueError("retry count exceeded")

    async def _fetch_with_rate_limit(
        self, url: str, semaphore: asyncio.Semaphore
    ) -> str:
        """Fetch ``url`` once a slot in ``semaphore`` is available."""
        async with semaphore:
            return await self._fetch(url)

    async def fetch_all(self, urls: List[str]) -> Any:
        """Fetch all urls concurrently with rate limiting.

        At most ``requests_per_second`` fetches are in flight at once. Results
        are returned in the same order as ``urls``.
        """
        semaphore = asyncio.Semaphore(self.requests_per_second)
        tasks = [
            asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
            for url in urls
        ]
        # Only the import is guarded, so an ImportError raised while fetching
        # is not mistaken for "tqdm is missing".
        try:
            from tqdm.asyncio import tqdm_asyncio
        except ImportError:
            warnings.warn("For better logging of progress, `pip install tqdm`")
            return await asyncio.gather(*tasks)
        return await tqdm_asyncio.gather(
            *tasks, desc="Fetching pages", ascii=True, mininterval=1
        )

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path."""
        yield from self.load()

    def load(self) -> List[Document]:
        """Load text from the url(s) in web_path.

        Returns:
            One ``Document`` per URL, in the order of ``web_paths``.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop is running in this thread: drive one directly.
            results = asyncio.run(self.fetch_all(self.web_paths))
        else:
            # `asyncio.run` refuses to start inside a running loop (e.g. in a
            # notebook kernel or an async application), so run the fetch on a
            # fresh loop in a worker thread and block until it completes.
            from concurrent.futures import ThreadPoolExecutor

            with ThreadPoolExecutor(max_workers=1) as executor:
                results = executor.submit(
                    asyncio.run, self.fetch_all(self.web_paths)
                ).result()

        return [
            Document(page_content=text, metadata={"source": url})
            for url, text in zip(self.web_paths, results)
        ]
Loading