Skip to content

Commit

Permalink
Allow the crawler to skip links with query parameters.
Browse files Browse the repository at this point in the history
This is done by calling 'skip_query_string' on the yielded anemone object.
  • Loading branch information
tilsammans committed Aug 20, 2010
1 parent d670cb2 commit 62ddd8e
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 1 deletion.
18 changes: 17 additions & 1 deletion lib/anemone/core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,14 @@ def skip_links_like(*patterns)
@skip_link_patterns.concat [patterns].flatten.compact
self
end

#
# Tell the crawler to skip every link whose URL has a query string (the ?param=value part).
#
def skip_query_string
  # Turn on the flag that skip_query_string? checks; tap hands back the
  # receiver, so the method still returns self.
  tap { @skip_query_string = true }
end

#
# Add a block to be executed on every Page as they are encountered
Expand Down Expand Up @@ -249,7 +257,15 @@ def visit_link?(link, from_page = nil)
too_deep = false
end

!@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
!@pages.has_page?(link) && !skip_link?(link) && !skip_query_string?(link) && allowed && !too_deep
end

#
# Returns +true+ if *link* should not be visited because
# it has a query string and +skip_query_string+ is set.
#
def skip_query_string?(link)
  # Nothing is skipped unless skip_query_string has switched the flag on.
  return false unless @skip_query_string

  # Answer with a strict boolean (as the method's documentation promises)
  # rather than leaking the raw query String or nil to callers.
  !link.query.nil?
end

#
Expand Down
13 changes: 13 additions & 0 deletions spec/core_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,19 @@ module Anemone
core.should have(2).pages
core.pages.keys.should_not include(pages[2].url)
end

it "should be able to skip links with query strings" do
  # Page '0' links to one URL that carries a query string and one that does not.
  pages = []
  pages << FakePage.new('0', :links => ['1?foo=1', '2'])
  pages << FakePage.new('1?foo=1')
  pages << FakePage.new('2')

  core = Anemone.crawl(pages[0].url, @opts) do |a|
    a.skip_query_string
  end

  # Two pages remain, and the one left out must be the query-string URL;
  # the count alone would also pass if the wrong page had been skipped.
  core.should have(2).pages
  core.pages.keys.should_not include(pages[1].url)
end

it "should be able to skip links based on a RegEx" do
pages = []
Expand Down

0 comments on commit 62ddd8e

Please sign in to comment.