Skip to content

Commit

Permalink
Check error of robots.txt GET request
Browse files Browse the repository at this point in the history
  • Loading branch information
OscarScholten committed Apr 10, 2018
1 parent 3439a67 commit bdf3833
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
6 changes: 4 additions & 2 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -567,15 +567,17 @@ func (c *Collector) isDomainAllowed(domain string) bool {
func (c *Collector) checkRobots(u *url.URL) error {
// var robot *robotstxt.RobotsData
// var ok bool
var err error

c.lock.RLock()
robot, ok := c.robotsMap[u.Host]
c.lock.RUnlock()

if !ok {
// no robots file cached
resp, _ := c.backend.Client.Get(u.Scheme + "://" + u.Host + "/robots.txt")
resp, err := c.backend.Client.Get(u.Scheme + "://" + u.Host + "/robots.txt")
if err != nil {
return err
}
robot, err = robotstxt.FromResponse(resp)
if err != nil {
return err
Expand Down
13 changes: 13 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,19 @@ func TestIgnoreRobotsWhenDisallowed(t *testing.T) {

}

// TestConnectionErrorOnRobotsTxtResultsInError checks that Visit returns a
// non-nil error when the target server cannot be reached and the collector
// has IgnoreRobotsTxt set to false.
func TestConnectionErrorOnRobotsTxtResultsInError(t *testing.T) {
	// Bring up a test server only to obtain its URL, then close it right away
	// so that connecting to that URL is forced to fail.
	server := newTestServer()
	server.Close()

	collector := NewCollector()
	collector.IgnoreRobotsTxt = false

	// The visit must not succeed silently against the closed server.
	if visitErr := collector.Visit(server.URL); visitErr == nil {
		t.Fatal("Error expected")
	}
}

func TestEnvSettings(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down

0 comments on commit bdf3833

Please sign in to comment.