Commit fc5eab8
Merge pull request #770 from riccardoporreca/feature/768-optimize-internal-hash-checks

Optimize checking internal link hashes in target files
gjtorikian authored Sep 25, 2022
2 parents 026549f + e156faa commit fc5eab8
Showing 3 changed files with 56 additions and 14 deletions.
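
What the change does: previously, every internal link with a fragment ended up in find_fragments, which called create_nokogiri on the target file, so a target linked from many places could be read and parsed over and over. The new code resolves what it can from the URL alone (hash_exists_for_url?), collects the rest into file_paths_hashes_to_check keyed by target file, and then parses each target file exactly once, checking all of its pending hashes against that single parse. A rough standalone sketch of the batching idea — the links input, file names, and CSS lookup below are illustrative, not html-proofer's internals:

    require "nokogiri"

    # Illustrative input (not html-proofer's data structures):
    # internal hrefs mapped to the files that reference them.
    links = {
      "docs/a.html#intro" => ["index.html"],
      "docs/a.html#usage" => ["index.html", "faq.html"],
      "docs/b.html#setup" => ["index.html"],
    }

    # Group every fragment by its target file, mirroring file_paths_hashes_to_check.
    frags_by_file = Hash.new { |h, k| h[k] = [] }
    links.each_key do |link|
      path, frag = link.split("#", 2)
      frags_by_file[path] << frag
    end

    # Parse each target file once, then resolve all of its fragments against
    # that single parse -- instead of re-reading the file once per link.
    frags_by_file.each do |path, frags|
      doc = Nokogiri::HTML(File.read(path))
      frags.each do |frag|
        found = !doc.css("[id='#{frag}'], [name='#{frag}']").empty?
        puts "#{path}##{frag}: #{found ? "found" : "missing"}"
      end
    end

With this shape, the parsing cost scales with the number of distinct target files rather than with the number of links pointing into them.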
2 changes: 1 addition & 1 deletion Rakefile

@@ -24,7 +24,7 @@ task :proof_readme do
   require "redcarpet"
 
   renderer = Redcarpet::Render::HTML.new(\
-    with_toc_data: true
+    with_toc_data: true,
   )
   redcarpet = Redcarpet::Markdown.new(renderer)
   html = redcarpet.render(File.read("README.md"))
2 changes: 1 addition & 1 deletion lib/html_proofer.rb

@@ -4,7 +4,7 @@
 lib_dir = File.join(File.dirname(__dir__), "lib")
 gem_loader = Zeitwerk::Loader.for_gem
 gem_loader.inflector.inflect(
-  "html_proofer" => "HTMLProofer"
+  "html_proofer" => "HTMLProofer",
 )
 gem_loader.ignore(File.join(lib_dir, "html-proofer.rb"))
 gem_loader.setup
66 changes: 54 additions & 12 deletions lib/html_proofer/url_validator/internal.rb

@@ -22,22 +22,39 @@ def validate
       end
 
       def run_internal_link_checker(links)
+        # collect urls and metadata for hashes to be checked in the same target file
+        file_paths_hashes_to_check = {}
         to_add = []
-        links.each_pair do |link, matched_files|
+        links.each_with_index do |(link, matched_files), i|
+          matched_count_to_log = pluralize(matched_files.count, "reference", "references")
+          @logger.log(:debug, "(#{i + 1} / #{links.count}) Internal link #{link}: Checking #{matched_count_to_log}")
           matched_files.each do |metadata|
             url = HTMLProofer::Attribute::Url.new(@runner, link, base_url: metadata[:base_url])
 
             @runner.current_source = metadata[:source]
             @runner.current_filename = metadata[:filename]
 
-            unless file_exists?(url)
+            target_file_path = url.absolute_path
+            unless file_exists?(target_file_path)
               @failed_checks << Failure.new(@runner.current_filename, "Links > Internal",
                 "internally linking to #{url}, which does not exist", line: metadata[:line], status: nil, content: nil)
               to_add << [url, metadata, false]
               next
             end
 
-            unless hash_exists?(url)
+            hash_exists = hash_exists_for_url?(url)
+            if hash_exists.nil?
+              # the hash needs to be checked in the target file, we collect the url and metadata
+              unless file_paths_hashes_to_check.key?(target_file_path)
+                file_paths_hashes_to_check[target_file_path] = {}
+              end
+              unless file_paths_hashes_to_check[target_file_path].key?(url.hash)
+                file_paths_hashes_to_check[target_file_path][url.hash] = []
+              end
+              file_paths_hashes_to_check[target_file_path][url.hash] << [url, metadata]
+              next
+            end
+            unless hash_exists
               @failed_checks << Failure.new(@runner.current_filename, "Links > Internal",
                 "internally linking to #{url}; the file exists, but the hash '#{url.hash}' does not", line: metadata[:line], status: nil, content: nil)
               to_add << [url, metadata, false]
@@ -48,6 +65,24 @@ def run_internal_link_checker(links)
           end
         end
 
+        # check hashes by target file
+        @logger.log(:info, "Checking internal link hashes in #{pluralize(file_paths_hashes_to_check.count, "file", "files")}")
+        file_paths_hashes_to_check.each_with_index do |(file_path, hashes_to_check), i|
+          hash_count_to_log = pluralize(hashes_to_check.count, "hash", "hashes")
+          @logger.log(:debug, "(#{i + 1} / #{file_paths_hashes_to_check.count}) Checking #{hash_count_to_log} in #{file_path}")
+          html = create_nokogiri(file_path)
+          hashes_to_check.each_pair do |href_hash, url_metadata|
+            exists = hash_exists_in_html?(href_hash, html)
+            url_metadata.each do |(url, metadata)|
+              unless exists
+                @failed_checks << Failure.new(metadata[:filename], "Links > Internal",
+                  "internally linking to #{url}; the file exists, but the hash '#{href_hash}' does not", line: metadata[:line], status: nil, content: nil)
+              end
+              to_add << [url, metadata, exists]
+            end
+          end
+        end
+
         # adding directly to the cache above results in an endless loop
         to_add.each do |(url, metadata, exists)|
           @cache.add_internal(url.to_s, metadata, exists)
@@ -56,15 +91,15 @@ def run_internal_link_checker(links)
         @failed_checks
       end
 
-      private def file_exists?(url)
-        absolute_path = url.absolute_path
-        return @runner.checked_paths[url.absolute_path] if @runner.checked_paths.key?(absolute_path)
+      private def file_exists?(absolute_path)
+        return @runner.checked_paths[absolute_path] if @runner.checked_paths.key?(absolute_path)
 
-        @runner.checked_paths[url.absolute_path] = File.exist?(absolute_path)
+        @runner.checked_paths[absolute_path] = File.exist?(absolute_path)
       end
 
-      # verify the target hash
-      private def hash_exists?(url)
+      # verify the hash just based on the URL, w/o looking at the target file
+      # => returns nil if the hash could not be verified
+      private def hash_exists_for_url?(url)
         href_hash = url.hash
         return true if blank?(href_hash)
         return true unless @runner.options[:check_internal_hash]
@@ -76,10 +111,18 @@ def run_internal_link_checker(links)
         decoded_href_hash = Addressable::URI.unescape(href_hash)
         fragment_ids = [href_hash, decoded_href_hash]
         # https://www.w3.org/TR/html5/single-page.html#scroll-to-fragid
-        fragment_ids.include?("top") || !find_fragments(fragment_ids, url).empty?
+        return true if fragment_ids.include?("top")
+
+        nil
       end
 
+      private def hash_exists_in_html?(href_hash, html)
+        decoded_href_hash = Addressable::URI.unescape(href_hash)
+        fragment_ids = [href_hash, decoded_href_hash]
+        !find_fragments(fragment_ids, html).empty?
+      end
+
-      private def find_fragments(fragment_ids, url)
+      private def find_fragments(fragment_ids, html)
         xpaths = fragment_ids.uniq.flat_map do |frag_id|
           escaped_frag_id = "'#{frag_id.split("'").join("', \"'\", '")}', ''"
           [
@@ -89,7 +132,6 @@ def run_internal_link_checker(links)
         end
         xpaths << XpathFunctions.new
 
-        html = create_nokogiri(url.absolute_path)
         html.xpath(*xpaths)
       end
     end
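One detail worth calling out in find_fragments (unchanged here apart from its signature): XPath 1.0 string literals have no quote-escaping, so escaped_frag_id splits the fragment id on single quotes and rebuilds it as an argument list for XPath's concat(), with a trailing '' so concat always receives at least two arguments. A quick illustration — the //*[@id = ...] template is an assumed stand-in, since the real XPath templates sit in the collapsed part of the last hunk:

    # The fragment id may itself contain single quotes, which XPath 1.0
    # string literals cannot escape; rebuild it as concat() arguments.
    frag_id = "it's-a-heading"
    escaped_frag_id = "'#{frag_id.split("'").join("', \"'\", '")}', ''"

    puts "//*[@id = concat(#{escaped_frag_id})]"
    # => //*[@id = concat('it', "'", 's-a-heading', '')]

The companion change is splitting the old hash_exists? into hash_exists_for_url?, which returns true, false, or nil for "cannot tell without reading the target", and hash_exists_in_html?, which answers the deferred nil cases against the one Nokogiri document parsed per target file in the batch loop.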
