Skip to content

Commit

Permalink
moved score threshold to config.py & replaced the separator for tag.g…
Browse files Browse the repository at this point in the history
…et_text in find_closest_parent_with_useful_text fn from period(.) to space( ) to keep the text more neutral.
  • Loading branch information
Aravind Karnam authored and Aravind Karnam committed Jul 21, 2024
1 parent e5ecf29 commit cf6c835
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
10 changes: 10 additions & 0 deletions crawl4ai/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,13 @@

# Threshold for the minimum number of words in an HTML tag for it to be considered
MIN_WORD_THRESHOLD = 1

# Threshold for image extraction - range is 1 to 6
# Images are scored using a point-based system so they can be filtered by usefulness. Points are
# assigned to each image based on the following aspects.
# If either the height or the width exceeds 150px
# If the image size is greater than 10Kb
# If the alt property is set
# If the image format is jpg, png or webp
# If the image is in the first half of the total images extracted from the page
IMAGE_SCORE_THRESHOLD = 2
4 changes: 2 additions & 2 deletions crawl4ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ def find_closest_parent_with_useful_text(tag):
current_tag = current_tag.parent
# Get the text content of the parent tag
if current_tag:
text_content = current_tag.get_text(separator='. ',strip=True)
text_content = current_tag.get_text(separator=' ',strip=True)
# Check if the text content has at least word_count_threshold
if len(text_content.split()) >= word_count_threshold:
return text_content
Expand All @@ -535,7 +535,7 @@ def find_closest_parent_with_useful_text(tag):
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
return None
score = score_image_for_usefulness(img, url, index, total_images)
if score <= 2:
if score <= IMAGE_SCORE_THRESHOLD:
return None
return {
'src': img.get('src', ''),
Expand Down

0 comments on commit cf6c835

Please sign in to comment.