Skip to content

Commit

Permalink
refactor: Update image description minimum word threshold in get_cont…
Browse files Browse the repository at this point in the history
…ent_of_website_optimized
  • Loading branch information
unclecode committed Aug 2, 2024
1 parent 8ae6c43 commit 9ee9887
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 1 deletion.
1 change: 1 addition & 0 deletions crawl4ai/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

# Threshold for the minimum number of words in an HTML tag for it to be considered
MIN_WORD_THRESHOLD = 1
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1

# Threshold for the Image extraction - Range is 1 to 6
# Images are scored using a point-based system so they can be filtered by usefulness. Points are assigned
Expand Down
4 changes: 3 additions & 1 deletion crawl4ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,8 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:

soup = BeautifulSoup(html, 'html.parser')
body = soup.body

image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)

if css_selector:
selected_elements = body.select(css_selector)
Expand Down Expand Up @@ -530,7 +532,7 @@ def find_closest_parent_with_useful_text(tag):
if current_tag:
text_content = current_tag.get_text(separator=' ',strip=True)
# Check if the text content has at least image_description_min_word_threshold words
if len(text_content.split()) >= word_count_threshold:
if len(text_content.split()) >= image_description_min_word_threshold:
return text_content
return None

Expand Down

0 comments on commit 9ee9887

Please sign in to comment.