chore: Refactor get_content_of_website_optimized function in utils.py
unclecode committed Jun 26, 2024
1 parent 96d1eb0 commit 7ba2142
Showing 2 changed files with 28 additions and 21 deletions.
43 changes: 24 additions & 19 deletions crawl4ai/utils.py
```diff
@@ -438,18 +438,17 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
     links = {'internal': [], 'external': []}
     media = {'images': [], 'videos': [], 'audios': []}
 
-    def process_element(element: element.PageElement) -> None:
+    def process_element(element: element.PageElement) -> bool:
         if isinstance(element, NavigableString):
             if isinstance(element, Comment):
                 element.extract()
-            return
+            return False
 
-        # if not isinstance(element, element.Tag):
-        #     return
-
         if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
             element.decompose()
-            return
+            return False
+
+        keep_element = False
 
         if element.name == 'a' and element.get('href'):
             href = element['href']
@@ -459,26 +458,23 @@ def process_element(element: element.PageElement) -> None:
                 links['external'].append(link_data)
             else:
                 links['internal'].append(link_data)
+            keep_element = True
 
         elif element.name == 'img':
             media['images'].append({
                 'src': element.get('src'),
                 'alt': element.get('alt'),
                 'type': 'image'
             })
-            alt_text = element.get('alt')
-            if alt_text:
-                element.replace_with(soup.new_string(alt_text))
-            else:
-                element.decompose()
-            return
+            return True  # Always keep image elements
 
         elif element.name in ['video', 'audio']:
             media[f"{element.name}s"].append({
                 'src': element.get('src'),
                 'alt': element.get('alt'),
                 'type': element.name
             })
+            return True  # Always keep video and audio elements
 
         if element.name != 'pre':
             if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
@@ -489,17 +485,26 @@ def process_element(element: element.PageElement) -> None:
             elif element.name != 'img':
                 element.attrs = {}
 
-        word_count = len(element.get_text(strip=True).split())
-        if word_count < word_count_threshold:
-            element.decompose()
-            return
-
         # Process children
         for child in list(element.children):
-            process_element(child)
+            if isinstance(child, NavigableString) and not isinstance(child, Comment):
+                if len(child.strip()) > 0:
+                    keep_element = True
+            else:
+                if process_element(child):
+                    keep_element = True
 
-        if not element.contents and not element.get_text(strip=True):
+        # Check word count
+        if not keep_element:
+            word_count = len(element.get_text(strip=True).split())
+            keep_element = word_count >= word_count_threshold
+
+        if not keep_element:
             element.decompose()
 
+        return keep_element
+
     process_element(body)
 
     def flatten_nested_elements(node):
```
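The crux of the refactor: `process_element` now returns a bool, so a parent is pruned only when nothing beneath it survives, rather than being decomposed up front whenever its own word count fell below the threshold. Below is a minimal, runnable sketch of that keep/prune contract, assuming BeautifulSoup 4; it deliberately omits the commit's link/media bookkeeping and attribute stripping, and the threshold value is illustrative, not the library default.

```python
# A minimal sketch of the keep/prune contract introduced in this commit.
# Simplified: link/media bookkeeping and attribute stripping are omitted.
from bs4 import BeautifulSoup, Comment, NavigableString

WORD_COUNT_THRESHOLD = 3  # illustrative value, not the library default

def process_element(element) -> bool:
    """Return True if this element (or anything beneath it) was kept."""
    if isinstance(element, NavigableString):
        if isinstance(element, Comment):
            element.extract()  # comments are always dropped
        return False           # bare strings are judged by their parent

    if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
        element.decompose()
        return False

    # Media elements are always kept (the commit returns True for them early).
    keep_element = element.name in ['img', 'video', 'audio']

    # A surviving child keeps its parent alive even when the parent's own
    # text is below the threshold -- the core behavioral change.
    for child in list(element.children):
        if isinstance(child, NavigableString) and not isinstance(child, Comment):
            if len(child.strip()) > 0:
                keep_element = True
        elif process_element(child):
            keep_element = True

    # Word count is now only a fallback check, applied after the children.
    if not keep_element:
        word_count = len(element.get_text(strip=True).split())
        keep_element = word_count >= WORD_COUNT_THRESHOLD

    if not keep_element:
        element.decompose()
    return keep_element

soup = BeautifulSoup(
    "<body><div><span></span></div>"
    "<!-- noise --><script>var x = 1;</script>"
    "<p>short but kept</p></body>",
    "html.parser",
)
process_element(soup.body)
print(soup)  # -> <body><p>short but kept</p></body>
```

The empty `<div>` is decomposed bottom-up, the comment and `<script>` are stripped, and the `<p>` survives despite falling below the threshold because it carries non-empty direct text.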
6 changes: 4 additions & 2 deletions crawl4ai/web_crawler.py
```diff
@@ -136,8 +136,10 @@ def run(
         if not isinstance(chunking_strategy, ChunkingStrategy):
             raise ValueError("Unsupported chunking strategy")
 
-        if word_count_threshold < MIN_WORD_THRESHOLD:
-            word_count_threshold = MIN_WORD_THRESHOLD
+        # if word_count_threshold < MIN_WORD_THRESHOLD:
+        #     word_count_threshold = MIN_WORD_THRESHOLD
+
+        word_count_threshold = max(word_count_threshold, 0)
 
         # Check cache first
         cached = None
```
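The practical effect: `word_count_threshold` is no longer silently raised to `MIN_WORD_THRESHOLD`; any non-negative value is honored as-is, and only negatives are floored to zero. A small before/after sketch of the clamping logic follows; note that `MIN_WORD_THRESHOLD = 5` is an assumed value for illustration, not taken from the repository.

```python
# Before/after sketch of the threshold handling in run().
MIN_WORD_THRESHOLD = 5  # assumed value, for illustration only

def old_threshold(word_count_threshold: int) -> int:
    # Old behavior: small thresholds were silently bumped to the minimum.
    if word_count_threshold < MIN_WORD_THRESHOLD:
        word_count_threshold = MIN_WORD_THRESHOLD
    return word_count_threshold

def new_threshold(word_count_threshold: int) -> int:
    # New behavior: any non-negative threshold is honored as-is;
    # only negative values are floored to zero.
    return max(word_count_threshold, 0)

print(old_threshold(1), new_threshold(1))    # 5 1
print(old_threshold(-3), new_threshold(-3))  # 5 0
```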
