Skip to content

Commit

Permalink
[mod] simplify the cookie layer in storage interface
Browse files Browse the repository at this point in the history
  • Loading branch information
asciimoo committed Mar 13, 2018
1 parent 33955be commit f7499d2
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 8 deletions.
50 changes: 48 additions & 2 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,11 @@ type xmlCallbackContainer struct {
Function XMLCallback
}

// cookieJarSerializer adapts a storage.Storage to the http.CookieJar
// interface by exchanging cookies with the store in serialized string form.
type cookieJarSerializer struct {
// store is the backend that serialized cookies are read from and written to.
store storage.Storage
// lock is held by SetCookies around its read-merge-write sequence.
lock *sync.RWMutex
}

var collectorCounter uint32

var (
Expand Down Expand Up @@ -326,7 +331,8 @@ func (c *Collector) Init() {
c.store.Init()
c.MaxBodySize = 10 * 1024 * 1024
c.backend = &httpBackend{}
c.backend.Init(c.store.GetCookieJar())
jar, _ := cookiejar.New(nil)
c.backend.Init(jar)
c.backend.Client.CheckRedirect = c.checkRedirectFunc()
c.wg = &sync.WaitGroup{}
c.lock = &sync.RWMutex{}
Expand Down Expand Up @@ -732,7 +738,7 @@ func (c *Collector) SetStorage(s storage.Storage) error {
return err
}
c.store = s
c.backend.Client.Jar = s.GetCookieJar()
c.backend.Client.Jar = createJar(s)
return nil
}

Expand Down Expand Up @@ -1082,3 +1088,43 @@ func isYesString(s string) bool {
}
return false
}

// createJar wraps the given storage backend in a cookieJarSerializer so it
// can be used wherever an http.CookieJar is expected.
func createJar(s storage.Storage) http.CookieJar {
	jar := new(cookieJarSerializer)
	jar.store = s
	jar.lock = new(sync.RWMutex)
	return jar
}

// SetCookies merges cookies into whatever the store already holds for u and
// writes the combined list back in serialized form. When an incoming cookie
// and a stored cookie share a name, the incoming one is kept.
func (j *cookieJarSerializer) SetCookies(u *url.URL, cookies []*http.Cookie) {
	j.lock.Lock()
	defer j.lock.Unlock()

	stored := storage.UnstringifyCookies(j.store.Cookies(u))

	// Seed the result with the incoming cookies so they take precedence,
	// then append only those stored cookies whose name is not yet present.
	merged := append(make([]*http.Cookie, 0, len(cookies)+len(stored)), cookies...)
	for _, old := range stored {
		if storage.ContainsCookie(merged, old.Name) {
			continue
		}
		merged = append(merged, old)
	}
	j.store.SetCookies(u, storage.StringifyCookies(merged))
}

// Cookies returns the cookies stored for u that are still eligible to be
// sent. The serialized cookies are read from the store, decoded, and then
// filtered; the store itself is not modified.
//
// A cookie is dropped when:
//   - its MaxAge is negative, which is how net/http represents a Max-Age of
//     zero or less ("delete this cookie now");
//   - it carries an Expires attribute that lies in the past;
//   - it is marked Secure and u is not an https URL.
//
// A positive MaxAge cannot be enforced here because the serialized form does
// not record when the cookie was received.
func (j *cookieJarSerializer) Cookies(u *url.URL) []*http.Cookie {
	stored := storage.UnstringifyCookies(j.store.Cookies(u))
	now := time.Now()
	valid := make([]*http.Cookie, 0, len(stored))
	for _, c := range stored {
		switch {
		case c.MaxAge < 0:
			// Server asked for immediate deletion via Max-Age.
			continue
		case c.RawExpires != "" && c.Expires.Before(now):
			// Expired by its Expires attribute.
			continue
		case c.Secure && u.Scheme != "https":
			// Secure cookies must not be sent over a non-https scheme.
			continue
		}
		valid = append(valid, c)
	}
	return valid
}
50 changes: 44 additions & 6 deletions storage/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ package storage
import (
"net/http"
"net/http/cookiejar"
"net/url"
"strings"
"sync"
)

Expand All @@ -33,9 +35,10 @@ type Storage interface {
// IsVisited returns true if the request was visited before IsVisited
// is called
IsVisited(requestID uint64) (bool, error)
// GetCookieJar returns with cookie jar that implements the
// http.CookieJar interface
GetCookieJar() http.CookieJar
// Cookies retrieves stored cookies for a given host
Cookies(u *url.URL) string
// SetCookies stores cookies for a given host
SetCookies(u *url.URL, cookies string)
}

// InMemoryStorage is the default storage backend of colly.
Expand Down Expand Up @@ -79,12 +82,47 @@ func (s *InMemoryStorage) IsVisited(requestID uint64) (bool, error) {
return visited, nil
}

// GetCookieJar implements Storage.GetCookieJar()
func (s *InMemoryStorage) GetCookieJar() http.CookieJar {
return s.jar
// Cookies implements Storage.Cookies()
func (s *InMemoryStorage) Cookies(u *url.URL) string {
return StringifyCookies(s.jar.Cookies(u))
}

// SetCookies implements Storage.SetCookies() by decoding the serialized
// cookies and handing them to the in-memory jar.
func (s *InMemoryStorage) SetCookies(u *url.URL, cookies string) {
	parsed := UnstringifyCookies(cookies)
	s.jar.SetCookies(u, parsed)
}

// Close implements Storage.Close(). It does nothing and always returns nil.
func (s *InMemoryStorage) Close() error {
return nil
}

// StringifyCookies serializes a list of http.Cookies into a single string:
// each cookie is rendered with Cookie.String and the results are joined with
// newline characters.
func StringifyCookies(cookies []*http.Cookie) string {
	lines := make([]string, 0, len(cookies))
	for idx := range cookies {
		lines = append(lines, cookies[idx].String())
	}
	return strings.Join(lines, "\n")
}

// UnstringifyCookies deserializes a newline-separated cookie string into
// http.Cookies. Each line is treated as the value of a Set-Cookie header and
// parsed by net/http's response cookie parser.
func UnstringifyCookies(s string) []*http.Cookie {
	resp := http.Response{
		Header: http.Header{"Set-Cookie": strings.Split(s, "\n")},
	}
	return resp.Cookies()
}

// ContainsCookie reports whether any cookie in cookies has exactly the given
// name.
func ContainsCookie(cookies []*http.Cookie, name string) bool {
	for i := 0; i < len(cookies); i++ {
		if cookies[i].Name == name {
			return true
		}
	}
	return false
}

0 comments on commit f7499d2

Please sign in to comment.