chore:

- Add demo page to the new mkdocs - Set website home page to mkdocs
renz-dev · Jun 22, 2024 · d6182be · d6182be
1 parent 2217904
commit d6182be
Show file tree

Hide file tree

Showing 8 changed files with 332 additions and 18 deletions.
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
@@ -631,4 +631,9 @@ def wrap_text(draw, text, font, max_width):
         while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width:
             line += (words.pop(0) + ' ')
         lines.append(line)
-    return '\n'.join(lines)
+    return '\n'.join(lines)
+
+
+def format_html(html_string):
+    soup = BeautifulSoup(html_string, 'html.parser')
+    return soup.prettify()
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
@@ -140,24 +140,28 @@ def run(
 
             # Check cache first
             cached = None
+            screenshot_data = None
             extracted_content = None
             if not bypass_cache and not self.always_by_pass_cache:
                 cached = get_cached_url(url)
 
             if cached:
                 html = cached[1]
-                extracted_content = cached[2]
+                extracted_content = cached[4]
                 if screenshot:
-                    screenshot = cached[9]
+                    screenshot_data = cached[9]
+                    if not screenshot_data:
+                        cached = None
 
-            else:
+            if not cached or not html:
                 if user_agent:
                     self.crawler_strategy.update_user_agent(user_agent)
                 html = self.crawler_strategy.crawl(url)
                 if screenshot:
-                    screenshot = self.crawler_strategy.take_screenshot()
+                    screenshot_data = self.crawler_strategy.take_screenshot()
+
 
-            return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)
+            return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
 
     def process_html(
             self,
@@ -197,7 +201,7 @@ def process_html(
 
                 sections = chunking_strategy.chunk(markdown)
                 extracted_content = extraction_strategy.run(url, sections)
-                extracted_content = json.dumps(extracted_content)
+                extracted_content = json.dumps(extracted_content, indent=4, default=str)
 
                 if verbose:
                     print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
@@ -217,11 +221,11 @@ def process_html(
                     json.dumps(metadata),
                     screenshot=screenshot,
                 )                
-
+            
             return CrawlResult(
                 url=url,
                 html=html,
-                cleaned_html=cleaned_html,
+                cleaned_html=format_html(cleaned_html),
                 markdown=markdown,
                 media=media,
                 links=links,

diff --git a/docs/md/assets/styles.css b/docs/md/assets/styles.css
@@ -15,7 +15,6 @@
     --mono-font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
         Courier New, monospace, serif;
 
-
     --background-color: #151515; /* Dark background */
     --font-color: #eaeaea; /* Light font color for contrast */
     --invert-font-color: #151515; /* Dark color for inverted elements */
@@ -73,11 +72,78 @@ pre, code {
     border-bottom: 1px dashed var(--secondary-color);
 } */
 
-.terminal-mkdocs-main-content{
+.terminal-mkdocs-main-content {
     line-height: var(--global-line-height);
 }
 
-strong, .highlight {
+strong,
+.highlight {
     /* background: url(//s2.svgbox.net/pen-brushes.svg?ic=brush-1&color=50ffff); */
     background-color: #50ffff33;
+}
+
+.terminal-card > header {
+    color: var(--font-color);
+    text-align: center;
+    background-color: var(--progress-bar-background);
+    padding: 0.3em 0.5em;
+}
+.btn.btn-sm {
+    color: var(--font-color);
+    padding: 0.2em 0.5em;
+    font-size: 0.8em;
+}
+
+.loading-message {
+    display: none;
+    margin-top: 20px;
+}
+
+.response-section {
+    display: none;
+    padding-top: 20px;
+}
+
+.tabs {
+    display: flex;
+    flex-direction: column;
+}
+.tab-list {
+    display: flex;
+    padding: 0;
+    margin: 0;
+    list-style-type: none;
+    border-bottom: 1px solid var(--font-color);
+}
+.tab-item {
+    cursor: pointer;
+    padding: 10px;
+    border: 1px solid var(--font-color);
+    margin-right: -1px;
+    border-bottom: none;
+}
+.tab-item:hover,
+.tab-item:focus,
+.tab-item:active {
+    background-color: var(--progress-bar-background);
+}
+.tab-content {
+    display: none;
+    border: 1px solid var(--font-color);
+    border-top: none;
+}
+.tab-content:first-of-type {
+    display: block;
+}
+
+.tab-content header {
+    padding: 0.5em;
+    display: flex; 
+    justify-content: end; 
+    align-items: center;
+    background-color: var(--progress-bar-background);
+}
+.tab-content pre {
+    margin: 0;
+    max-height: 300px; overflow: auto; border:none;
 }
diff --git a/docs/md/demo.md b/docs/md/demo.md
@@ -0,0 +1,198 @@
+# Interactive Demo for Crawler
+<div id="demo">
+    <form id="crawlForm" class="terminal-form">
+        <fieldset>
+            <legend>Enter URL and Options</legend>
+            <div class="form-group">
+                <label for="url">Enter URL:</label>
+                <input type="text" id="url" name="url" required>
+            </div>
+            <div class="form-group">
+                <label for="screenshot">Get Screenshot:</label>
+                <input type="checkbox" id="screenshot" name="screenshot">
+            </div>
+            <div class="form-group">
+                <button class="btn btn-default" type="submit">Submit</button>
+            </div>
+        </fieldset>
+    </form>
+
+    <div id="loading" class="loading-message">
+        <div class="terminal-alert terminal-alert-primary">Loading... Please wait.</div>
+    </div>
+
+    <section id="response" class="response-section">
+        <h2>Response</h2>
+        <div class="tabs">
+            <ul class="tab-list">
+                <li class="tab-item" onclick="showTab('markdown')">Markdown</li>
+                <li class="tab-item" onclick="showTab('cleanedHtml')">Cleaned HTML</li>
+                <li class="tab-item" onclick="showTab('media')">Media</li>
+                <li class="tab-item" onclick="showTab('extractedContent')">Extracted Content</li>
+                <li class="tab-item" onclick="showTab('screenshot')">Screenshot</li>
+                <li class="tab-item" onclick="showTab('pythonCode')">Python Code</li>
+            </ul>
+            <div class="tab-content" id="tab-markdown">
+                <header>
+                    <div>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('markdownContent')">Copy</button>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('markdownContent', 'markdown.md')">Download</button>
+                    </div>
+                </header>
+                <pre><code id="markdownContent" class="language-markdown hljs"></code></pre>
+            </div>
+
+            <div class="tab-content" id="tab-cleanedHtml" style="display: none;">
+                <header >
+                    <div>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('cleanedHtmlContent')">Copy</button>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('cleanedHtmlContent', 'cleaned.html')">Download</button>
+                    </div>
+                </header>
+                <pre><code id="cleanedHtmlContent" class="language-html hljs"></code></pre>
+            </div>
+
+            <div class="tab-content" id="tab-media" style="display: none;">
+                <header >
+                    <div>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('mediaContent')">Copy</button>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('mediaContent', 'media.json')">Download</button>
+                    </div>
+                </header>
+                <pre><code id="mediaContent" class="language-json hljs"></code></pre>
+            </div>
+
+            <div class="tab-content" id="tab-extractedContent" style="display: none;">
+                <header >
+                    <div>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('extractedContentContent')">Copy</button>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('extractedContentContent', 'extracted_content.json')">Download</button>
+                    </div>
+                </header>
+                <pre><code id="extractedContentContent" class="language-json hljs"></code></pre>
+            </div>
+
+            <div class="tab-content" id="tab-screenshot" style="display: none;">
+                <header >
+                    <div>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadImage('screenshotContent', 'screenshot.png')">Download</button>
+                    </div>
+                </header>
+                <pre><img id="screenshotContent" /></pre>
+            </div>
+
+            <div class="tab-content" id="tab-pythonCode" style="display: none;">
+                <header >
+                    <div>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('pythonCode')">Copy</button>
+                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('pythonCode', 'example.py')">Download</button>
+                    </div>
+                </header>
+                <pre><code id="pythonCode" class="language-python hljs"></code></pre>
+            </div>
+        </div>
+    </section>
+
+    <script>
+        function showTab(tabId) {
+            const tabs = document.querySelectorAll('.tab-content');
+            tabs.forEach(tab => tab.style.display = 'none');
+            document.getElementById(`tab-${tabId}`).style.display = 'block';
+        }
+
+        function redo(codeBlock, codeText){
+            codeBlock.classList.remove('hljs');
+            codeBlock.removeAttribute('data-highlighted');
+
+            // Set new code and re-highlight
+            codeBlock.textContent = codeText;
+            hljs.highlightBlock(codeBlock);
+        }
+
+        function copyToClipboard(elementId) {
+            const content = document.getElementById(elementId).textContent;
+            navigator.clipboard.writeText(content).then(() => {
+                alert('Copied to clipboard');
+            });
+        }
+
+        function downloadContent(elementId, filename) {
+            const content = document.getElementById(elementId).textContent;
+            const blob = new Blob([content], { type: 'text/plain' });
+            const url = window.URL.createObjectURL(blob);
+            const a = document.createElement('a');
+            a.style.display = 'none';
+            a.href = url;
+            a.download = filename;
+            document.body.appendChild(a);
+            a.click();
+            window.URL.revokeObjectURL(url);
+            document.body.removeChild(a);
+        }
+
+        function downloadImage(elementId, filename) {
+            const content = document.getElementById(elementId).src;
+            const a = document.createElement('a');
+            a.style.display = 'none';
+            a.href = content;
+            a.download = filename;
+            document.body.appendChild(a);
+            a.click();
+            document.body.removeChild(a);
+        }
+
+        document.getElementById('crawlForm').addEventListener('submit', function(event) {
+            event.preventDefault();
+            document.getElementById('loading').style.display = 'block';
+            document.getElementById('response').style.display = 'none';
+
+            const url = document.getElementById('url').value;
+            const screenshot = document.getElementById('screenshot').checked;
+            const data = {
+                urls: [url],
+                bypass_cache: false,
+                word_count_threshold: 5,
+                screenshot: screenshot
+            };
+
+            fetch('/crawl', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify(data)
+            })
+            .then(response => response.json())
+            .then(data => {
+                data = data.results[0]; // Only one URL is requested
+                document.getElementById('loading').style.display = 'none';
+                document.getElementById('response').style.display = 'block';
+                redo(document.getElementById('markdownContent'), data.markdown);
+                redo(document.getElementById('cleanedHtmlContent'), data.cleaned_html);
+                redo(document.getElementById('mediaContent'), JSON.stringify(data.media, null, 2));
+                redo(document.getElementById('extractedContentContent'), data.extracted_content);
+                if (screenshot) {
+                    document.getElementById('screenshotContent').src = `data:image/png;base64,${data.screenshot}`;
+                }
+                const pythonCode = `
+from crawl4ai.web_crawler import WebCrawler
+
+crawler = WebCrawler()
+crawler.warmup()
+
+result = crawler.run(
+    url='${url}',
+    screenshot=${screenshot}
+)
+print(result)
+                `;
+                redo(document.getElementById('pythonCode'), pythonCode);
+            })
+            .catch(error => {
+                document.getElementById('loading').style.display = 'none';
+                document.getElementById('response').style.display = 'block';
+                document.getElementById('markdownContent').textContent = 'Error: ' + error;
+            });
+        });
+    </script>
+</div>
diff --git a/docs/md/index.md b/docs/md/index.md
@@ -2,6 +2,11 @@
 
 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
 
+
+## Try the [Demo](demo.md)
+
+Try it now: crawl different pages to see how it works. You can enter a URL, inspect the structure of the output, and view sample Python code showing how to run the same crawl. The [old demo](/old) is still available if you want to see more details.
+
 ## Introduction
 
 Crawl4AI has one clear task: to make crawling and data extraction from web pages easy and efficient, especially for large language models (LLMs) and AI applications. Whether you are using it as a REST API or a Python library, Crawl4AI offers a robust and flexible solution.