Push async version last changes for merge to main branch
unclecode committed Sep 24, 2024
1 parent d628bc4 commit 4d48bd3
Showing 61 changed files with 6,224 additions and 896 deletions.
5 changes: 3 additions & 2 deletions CONTRIBUTORS.md
@@ -6,20 +6,21 @@ We would like to thank the following people for their contributions to Crawl4AI:

- [Unclecode](https://github.com/unclecode) - Project Creator and Main Developer
- [Nasrin](https://github.com/ntohidi) - Project Manager and Developer
- [Aravind Karnam](https://github.com/aravindkarnam) - Developer

## Community Contributors

- [Aravind Karnam](https://github.com/aravindkarnam) - Developed textual description extraction feature
- [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors
- [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies
- [jonymusky](https://github.com/jonymusky) - JavaScript execution documentation and `wait_for` support
- [datehoer](https://github.com/datehoer) - Added browser proxy support

## Other Contributors

- [Gokhan](https://github.com/gkhngyk)
- [Shiv Kumar](https://github.com/shivkumar0757)
- [QIN2DIM](https://github.com/QIN2DIM)


## Acknowledgements

We also want to thank all the users who have reported bugs, suggested features, or helped in any other way to make Crawl4AI better.
72 changes: 67 additions & 5 deletions README.md
@@ -1,4 +1,4 @@
# Crawl4AI Async Version 🕷️🤖
# Crawl4AI 0.3.0 Async Version 🕷️🤖

[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -43,18 +43,78 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible

## Installation 🛠️

Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.

### Using pip 🐍

Choose the installation option that best fits your needs:

#### Basic Installation

For basic web crawling and scraping tasks:

```bash
pip install crawl4ai
```
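
Once installed, a quick way to verify the setup is a minimal asynchronous crawl. The sketch below uses the `AsyncWebCrawler` API introduced in this release; treat the exact fields on the result object as illustrative rather than canonical:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # Minimal crawl: fetch a page and print the generated markdown
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)

asyncio.run(main())
```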

#### Installation with PyTorch

For advanced text clustering (includes CosineSimilarity cluster strategy):

```bash
pip install crawl4ai[torch]
```

#### Installation with Transformers

For text summarization and Hugging Face models:

```bash
pip install crawl4ai[transformer]
```

#### Installation with Synchronous Version

If you need the synchronous version using Selenium:

```bash
virtualenv venv
source venv/bin/activate
pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git"
pip install crawl4ai[sync]
```
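
With the synchronous extra installed, usage roughly mirrors the async API. A minimal sketch, assuming the Selenium-based `WebCrawler` still exposes `warmup()` and `run()` as in earlier releases:

```python
from crawl4ai import WebCrawler

# Synchronous, Selenium-backed crawl
crawler = WebCrawler(verbose=True)
crawler.warmup()                      # initialize the browser driver
result = crawler.run(url="https://example.com")
print(result.markdown)
```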

#### Installation with Cosine Similarity

For using the cosine similarity strategy:

```bash
pip install crawl4ai[cosine]
```
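
As a rough illustration of what the cosine extra enables, the sketch below clusters page content with `CosineStrategy`. The import path and parameter names (`semantic_filter`, `word_count_threshold`) are assumptions based on the existing extraction-strategy module and may differ:

```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import CosineStrategy  # assumed import path

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/blog",
            extraction_strategy=CosineStrategy(
                semantic_filter="machine learning",  # keep clusters related to this topic
                word_count_threshold=10,             # drop very short text blocks
            ),
        )
        print(result.extracted_content)

asyncio.run(main())
```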

#### Full Installation

For all features:

```bash
pip install crawl4ai[all]
```

After installation, run the following command to install Playwright dependencies:

```bash
playwright install
```

If you've installed the "torch", "transformer", or "all" options, it's recommended to run:

```bash
crawl4ai-download-models
```

### Using Docker 🐳

```bash
# For Mac users (M1/M2)
docker build --platform linux/amd64 -t crawl4ai .
# For other users
docker build -t crawl4ai .
docker run -d -p 8000:80 crawl4ai
```
@@ -66,6 +126,8 @@ docker pull unclecode/crawl4ai:latest
docker run -d -p 8000:80 unclecode/crawl4ai:latest
```

For more detailed installation instructions and options, please refer to our [Installation Guide](https://crawl4ai.com/mkdocs/installation).

## Quick Start 🚀

```python
2 changes: 1 addition & 1 deletion crawl4ai/__init__.py
@@ -2,7 +2,7 @@
from .async_webcrawler import AsyncWebCrawler
from .models import CrawlResult

__version__ = "0.2.77"
__version__ = "0.3.0"

__all__ = [
"WebCrawler",
107 changes: 105 additions & 2 deletions crawl4ai/async_crawler_strategy.py
@@ -52,6 +52,7 @@ def __init__(self, use_cached_html=False, js_code=None, **kwargs):
self.use_cached_html = use_cached_html
self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
self.proxy = kwargs.get("proxy")
self.headless = kwargs.get("headless", True)
self.headers = {}
self.sessions = {}
self.session_ttl = 1800
@@ -80,7 +81,7 @@ async def start(self):
self.playwright = await async_playwright().start()
if self.browser is None:
browser_args = {
"headless": True,
"headless": self.headless,
# "headless": False,
"args": [
"--disable-gpu",
@@ -145,6 +146,31 @@ def _cleanup_expired_sessions(self):
if current_time - last_used > self.session_ttl]
for sid in expired_sessions:
asyncio.create_task(self.kill_session(sid))


async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
wrapper_js = f"""
async () => {{
const userFunction = {user_wait_function};
const startTime = Date.now();
while (true) {{
if (await userFunction()) {{
return true;
}}
if (Date.now() - startTime > {timeout}) {{
throw new Error('Timeout waiting for condition');
}}
await new Promise(resolve => setTimeout(resolve, 100));
}}
}}
"""

try:
await page.evaluate(wrapper_js)
except TimeoutError:
raise TimeoutError(f"Timeout after {timeout}ms waiting for condition")
except Exception as e:
raise RuntimeError(f"Error in wait condition: {str(e)}")

async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
response_headers = {}
@@ -196,14 +222,17 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
# Get status code and headers
status_code = response.status
response_headers = response.headers
else:
status_code = 200
response_headers = {}

await page.wait_for_selector('body')
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
if js_code:
if isinstance(js_code, str):
await page.evaluate(js_code)
r = await page.evaluate(js_code)
elif isinstance(js_code, list):
for js in js_code:
await page.evaluate(js)
@@ -212,6 +241,37 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
await page.wait_for_load_state('networkidle')
# Fire the on_execution_started hook
await self.execute_hook('on_execution_started', page)

# New code to handle the wait_for parameter
# Example usage:
# await crawler.crawl(
# url,
# js_code="// some JavaScript code",
# wait_for="""() => {
# return document.querySelector('#my-element') !== null;
# }"""
# )
# Example of using a CSS selector:
# await crawler.crawl(
# url,
# wait_for="#my-element"
# )
wait_for = kwargs.get("wait_for")
if wait_for:
try:
await self.csp_compliant_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
except Exception as e:
raise RuntimeError(f"Custom wait condition failed: {str(e)}")
# try:
# await page.wait_for_function(wait_for)
# # if callable(wait_for):
# # await page.wait_for_function(wait_for)
# # elif isinstance(wait_for, str):
# # await page.wait_for_selector(wait_for)
# # else:
# # raise ValueError("wait_for must be either a callable or a CSS selector string")
# except Error as e:
# raise Error(f"Custom wait condition failed: {str(e)}")

html = await page.content()
page = await self.execute_hook('before_return_html', page, html)
@@ -246,6 +306,49 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
# except Exception as e:
# raise Exception(f"Failed to crawl {url}: {str(e)}")

async def execute_js(self, session_id: str, js_code: str, wait_for_js: str = None, wait_for_css: str = None) -> AsyncCrawlResponse:
"""
Execute JavaScript code in a specific session and optionally wait for a condition.
:param session_id: The ID of the session to execute the JS code in.
:param js_code: The JavaScript code to execute.
:param wait_for_js: JavaScript condition to wait for after execution.
:param wait_for_css: CSS selector to wait for after execution.
:return: AsyncCrawlResponse containing the page's HTML and other information.
:raises ValueError: If the session does not exist.
"""
if not session_id:
raise ValueError("Session ID must be provided")

if session_id not in self.sessions:
raise ValueError(f"No active session found for session ID: {session_id}")

context, page, last_used = self.sessions[session_id]

try:
await page.evaluate(js_code)

if wait_for_js:
await page.wait_for_function(wait_for_js)

if wait_for_css:
await page.wait_for_selector(wait_for_css)

# Get the updated HTML content
html = await page.content()

# Get response headers and status code (assuming these are available)
response_headers = await page.evaluate("() => JSON.stringify(performance.getEntriesByType('resource')[0].responseHeaders)")
status_code = await page.evaluate("() => performance.getEntriesByType('resource')[0].responseStatus")

# Update the last used time for this session
self.sessions[session_id] = (context, page, time.time())

return AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
except Error as e:
raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}")


async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
semaphore = asyncio.Semaphore(semaphore_count)
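To make the additions above concrete, here is a hedged sketch of how the new `wait_for` parameter might be driven from user code. It assumes `AsyncWebCrawler.arun` forwards extra keyword arguments (including `session_id`, `js_code`, and `wait_for`) to the crawler strategy, as the inline example comments suggest; note that `csp_compliant_wait` as written expects a JavaScript predicate function, so a plain CSS selector string would need separate handling:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        # Click a "load more" button, then wait until enough articles are present.
        result = await crawler.arun(
            url="https://example.com/feed",
            session_id="feed-session",  # keep the page open for follow-up calls
            js_code="document.querySelector('button#load-more')?.click();",
            wait_for="""() => document.querySelectorAll('article').length > 10""",
        )
        print(result.status_code, len(result.markdown or ""))

asyncio.run(main())
```

For follow-up interactions within the same session, the new `execute_js(session_id, js_code, wait_for_js=..., wait_for_css=...)` method can be called on the strategy instance directly.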
9 changes: 5 additions & 4 deletions crawl4ai/async_webcrawler.py
@@ -80,6 +80,7 @@ async def arun(

word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)

async_response : AsyncCrawlResponse = None
cached = None
screenshot_data = None
extracted_content = None
@@ -125,8 +126,8 @@
async_response=async_response,
**kwargs,
)
crawl_result.status_code = async_response.status_code
crawl_result.responser_headers = async_response.response_headers
crawl_result.status_code = async_response.status_code if async_response else 200
crawl_result.responser_headers = async_response.response_headers if async_response else {}
crawl_result.success = bool(html)
crawl_result.session_id = kwargs.get("session_id", None)
return crawl_result
@@ -224,11 +225,11 @@ async def aprocess_html(
if isinstance(extraction_strategy, JsonCssExtractionStrategy):
extraction_strategy.verbose = verbose
extracted_content = extraction_strategy.run(url, [html])
extracted_content = json.dumps(extracted_content, indent=4, default=str)
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
else:
sections = chunking_strategy.chunk(markdown)
extracted_content = extraction_strategy.run(url, sections)
extracted_content = json.dumps(extracted_content, indent=4, default=str)
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)

if verbose:
print(
11 changes: 10 additions & 1 deletion crawl4ai/content_scrapping_strategy.py
@@ -50,7 +50,16 @@ def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold
if css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:
raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
return {
'markdown': '',
'cleaned_html': '',
'success': True,
'media': {'images': [], 'videos': [], 'audios': []},
'links': {'internal': [], 'external': []},
'metadata': {},
'message': f"No elements found for CSS selector: {css_selector}"
}
# raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
body = soup.new_tag('div')
for el in selected_elements:
body.append(el)
12 changes: 12 additions & 0 deletions crawl4ai/crawler_strategy.py
@@ -258,6 +258,18 @@ def crawl(self, url: str, **kwargs) -> str:
lambda driver: driver.execute_script("return document.readyState") == "complete"
)

# Optionally, wait for some condition after executing the JS code (contributed by https://github.com/jonymusky)
wait_for = kwargs.get('wait_for', False)
if wait_for:
if callable(wait_for):
print("[LOG] 🔄 Waiting for condition...")
WebDriverWait(self.driver, 20).until(wait_for)
else:
print("[LOG] 🔄 Waiting for condition...")
WebDriverWait(self.driver, 20).until(
EC.presence_of_element_located((By.CSS_SELECTOR, wait_for))
)

if not can_not_be_done_headless:
html = sanitize_input_encode(self.driver.page_source)
self.driver = self.execute_hook('before_return_html', self.driver, html)
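For the synchronous crawler, the same `wait_for` idea applies, taking either a callable (for example a Selenium expected condition) or a CSS selector string. A hedged sketch, assuming `WebCrawler.run` passes extra keyword arguments through to the strategy's `crawl`:

```python
from selenium.webdriver.support import expected_conditions as EC
from crawl4ai import WebCrawler

crawler = WebCrawler()
crawler.warmup()

# Wait for a CSS selector to appear before returning HTML
result = crawler.run(url="https://example.com", wait_for="#content-loaded")

# Or wait on any callable condition (Selenium expected conditions work)
result = crawler.run(url="https://example.com", wait_for=EC.title_contains("Example"))
```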
2 changes: 0 additions & 2 deletions crawl4ai/model_loader.py
@@ -80,7 +80,6 @@ def load_bge_small_en_v1_5():
model, device = set_model_device(model)
return tokenizer, model


@lru_cache()
def load_text_classifier():
from transformers import AutoTokenizer, AutoModelForSequenceClassification
@@ -147,7 +146,6 @@ def load_nltk_punkt():
nltk.download('punkt')
return nltk.data.find('tokenizers/punkt')


@lru_cache()
def load_spacy_model():
import spacy
2 changes: 1 addition & 1 deletion crawl4ai/web_crawler.py
@@ -201,7 +201,7 @@ def process_html(

sections = chunking_strategy.chunk(markdown)
extracted_content = extraction_strategy.run(url, sections)
extracted_content = json.dumps(extracted_content, indent=4, default=str)
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)

if verbose:
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds.")
Binary file modified docs/.DS_Store
Binary file not shown.
(Diffs for the remaining changed files are not shown.)
