Skip to content

Commit

Permalink
chore: format code
Browse files Browse the repository at this point in the history
  • Loading branch information
geebytes committed Jan 21, 2023
1 parent 7627747 commit 275fbb4
Show file tree
Hide file tree
Showing 37 changed files with 308 additions and 215 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Tegenaria crawl framework

[![Go Report Card](https://goreportcard.com/badge/github.com/wetrycode/tegenaria)](https://goreportcard.com/report/github.com/wetrycode/tegenaria)
[![codecov](https://codecov.io/gh/wetrycode/tegenaria/branch/master/graph/badge.svg?token=XMW3K1JYPB)](https://codecov.io/gh/wetrycode/tegenaria)
[![go workflow](https://github.com/wetrycode/tegenaria/actions/workflows/go.yml/badge.svg)](https://github.com/wetrycode/tegenaria/actions/workflows/go.yml/badge.svg)
[![CodeQL](https://github.com/wetrycode/tegenaria/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/wetrycode/tegenaria/actions/workflows/codeql-analysis.yml)
Expand Down
2 changes: 1 addition & 1 deletion cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,6 @@ func ExecuteCmd(engine *CrawlEngine) {
}
rootCmd.AddCommand(crawlCmd)

crawlCmd.Flags().BoolVarP(&engine.isMaster, "master", "m", false, "Whether to set the current node as the master node,defualt false")
crawlCmd.Flags().BoolVarP(&engine.isMaster, "master", "m", false, "Whether to set the current node as the master node,default false")
rootCmd.Execute()
}
2 changes: 1 addition & 1 deletion cmd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ func TestCmdStart(t *testing.T) {
buf := new(bytes.Buffer)
rootCmd.SetOut(buf)
rootCmd.SetErr(buf)
rootCmd.SetArgs([]string{"crawl","testCmdSpider"})
rootCmd.SetArgs([]string{"crawl", "testCmdSpider"})
convey.So(engine.Execute, convey.ShouldNotPanic)
})
}
3 changes: 2 additions & 1 deletion context.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,12 @@ func WithContextId(ctxId string) ContextOption {
c.CtxId = ctxId
}
}
func WithItemChannelSize(size int) ContextOption{
func WithItemChannelSize(size int) ContextOption {
return func(c *Context) {
c.Items = make(chan *ItemMeta, size)
}
}

// NewContext 从内存池中构建context对象
func NewContext(request *Request, Spider SpiderInterface, opts ...ContextOption) *Context {
ctx := contextPool.Get().(*Context)
Expand Down
3 changes: 1 addition & 2 deletions distributed.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ package tegenaria

import (
"bytes"
"context"
goContext "context"
"encoding/gob"
"fmt"
Expand Down Expand Up @@ -585,7 +584,7 @@ func (w *DistributedWorker) CheckMasterLive() (bool, error) {
result := []*redis.StringCmd{}

for _, member := range members {
result = append(result, pipe.Get(context.TODO(), member))
result = append(result, pipe.Get(goContext.TODO(), member))
}
count, err := w.executeCheck(pipe, result, count)
return count != 0, err
Expand Down
6 changes: 3 additions & 3 deletions distributed_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,12 @@ func TestAddNodeError(t *testing.T) {
convey.So(err.Error(), convey.ShouldContainSubstring, "sadd add node error")
patch.Reset()

patch = gomonkey.ApplyFunc((*DistributedWorker).addMaster,func (_ *DistributedWorker)error {
patch = gomonkey.ApplyFunc((*DistributedWorker).addMaster, func(_ *DistributedWorker) error {
return errors.New("add master error")

})
err = worker.AddNode()
convey.So(err.Error(), convey.ShouldContainSubstring,"add master error")
convey.So(err.Error(), convey.ShouldContainSubstring, "add master error")
patch.Reset()

})
Expand Down
9 changes: 3 additions & 6 deletions downloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ type SpiderDownloader struct {
ProxyFunc func(req *http.Request) (*url.URL, error)
}


// DownloaderOption 下载器可选参数函数
type DownloaderOption func(d *SpiderDownloader)

Expand Down Expand Up @@ -273,13 +272,11 @@ func (d *SpiderDownloader) Download(ctx *Context) (*Response, error) {
}
// 构建网络请求上下文
var asCtxKey ctxKey = "key"
var timeoutCtx context.Context = nil
var valCtx context.Context = nil
valCtx := context.WithValue(ctx, asCtxKey, ctxValue)
if ctx.Request.Timeout > 0 {
timeoutCtx, _ = context.WithTimeout(ctx, ctx.Request.Timeout)
timeoutCtx, cancel := context.WithTimeout(ctx, ctx.Request.Timeout)
defer cancel()
valCtx = context.WithValue(timeoutCtx, asCtxKey, ctxValue)
} else {
valCtx = context.WithValue(ctx, asCtxKey, ctxValue)
}
req, err := http.NewRequestWithContext(valCtx, string(ctx.Request.Method), u.String(), ctx.Request.BodyReader)
if err != nil {
Expand Down
8 changes: 4 additions & 4 deletions downloader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ func TestRequestProxyWithTimeOut(t *testing.T) {
ProxyUrl: proxyServer.URL,
}
defer proxyServer.Close()
resp, err := newRequestDownloadCase("/testTimeout", GET, RequestWithRequestProxy(proxy),RequestWithTimeout(10 * time.Second))
resp, err := newRequestDownloadCase("/testTimeout", GET, RequestWithRequestProxy(proxy), RequestWithTimeout(10*time.Second))
convey.So(err, convey.ShouldBeNil)
convey.So(resp.Status, convey.ShouldAlmostEqual, 200)
convey.So(resp.String(), convey.ShouldContainSubstring, "This is proxy Server.")
Expand All @@ -236,7 +236,7 @@ func TestRequestProxyWithTimeOut(t *testing.T) {
ProxyUrl: proxyServer.URL,
}
defer proxyServer.Close()
resp, err := newRequestDownloadCase("/testTimeout", GET, RequestWithRequestProxy(proxy),RequestWithTimeout(1 * time.Second))
resp, err := newRequestDownloadCase("/testTimeout", GET, RequestWithRequestProxy(proxy), RequestWithTimeout(1*time.Second))
convey.So(err, convey.ShouldNotBeNil)
convey.So(resp, convey.ShouldBeNil)
})
Expand All @@ -252,15 +252,15 @@ func TestRequestHeaders(t *testing.T) {
resp, err := newRequestDownloadCase("/testHeader", GET, RequestWithRequestHeader(headers))
convey.So(err, convey.ShouldBeNil)
convey.So(resp.Status, convey.ShouldAlmostEqual, 200)
content:=resp.String()
content := resp.String()
convey.So(content, convey.ShouldContainSubstring, "value")

})
}

func TestTimeout(t *testing.T) {
convey.Convey("test request timeout", t, func() {
resp, err := newRequestDownloadCase("/testTimeout", GET, RequestWithTimeout(1 * time.Second))
resp, err := newRequestDownloadCase("/testTimeout", GET, RequestWithTimeout(1*time.Second))
convey.So(err, convey.ShouldNotBeNil)
convey.So(resp, convey.ShouldBeNil)

Expand Down
7 changes: 5 additions & 2 deletions dupefilters.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,12 @@ type RFPDupeFilterInterface interface {
// DoDupeFilter request去重
DoDupeFilter(ctx *Context) (bool, error)
}

// RFPDupeFilter 去重组件
type RFPDupeFilter struct {
bloomFilter *bloom.BloomFilter
}

// NewRFPDupeFilter 新建去重组件
// bloomP容错率
// bloomN数据规模
Expand Down Expand Up @@ -89,15 +91,16 @@ func (f *RFPDupeFilter) encodeHeader(request *Request) string {
}
return buf.String()
}

// Fingerprint 计算指纹
func (f *RFPDupeFilter) Fingerprint(ctx *Context) ([]byte, error) {
request:=ctx.Request
request := ctx.Request
if request.Url == "" {
return nil, fmt.Errorf("request is nil,maybe it had been free")
}
// get sha128
sha := murmur3.New128()
method:=string(request.Method)
method := string(request.Method)
_, err := io.WriteString(sha, method)
if err != nil {
return nil, err
Expand Down
36 changes: 18 additions & 18 deletions dupefilters_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (

func TestDoDupeFilter(t *testing.T) {

convey.Convey("test dupefilter",t,func(){
convey.Convey("test dupefilter", t, func() {
server := newTestServer()
headers := map[string]string{
"Params1": "params1",
Expand All @@ -27,18 +27,18 @@ func TestDoDupeFilter(t *testing.T) {
request3 := NewRequest(server.URL+"/testHeader2", GET, testParser, RequestWithRequestHeader(headers))
ctx3 := NewContext(request3, spider1)

duplicates := NewRFPDupeFilter(0.001,1024*1024)
duplicates := NewRFPDupeFilter(0.001, 1024*1024)
r1, _ := duplicates.DoDupeFilter(ctx1)
convey.So(r1,convey.ShouldBeFalse)
convey.So(r1, convey.ShouldBeFalse)
r2, _ := duplicates.DoDupeFilter(ctx2)
convey.So(r2, convey.ShouldBeTrue)
r3, _ := duplicates.DoDupeFilter(ctx3)
convey.So(r3,convey.ShouldBeFalse)
convey.So(r3, convey.ShouldBeFalse)
})
}

func TestDoBodyDupeFilter(t *testing.T) {
convey.Convey("test body dupefilter",t,func(){
convey.Convey("test body dupefilter", t, func() {
server := newTestServer()
// downloader := NewDownloader()
headers := map[string]string{
Expand All @@ -53,35 +53,35 @@ func TestDoBodyDupeFilter(t *testing.T) {
spider1 := &TestSpider{
NewBaseSpider("testspider", []string{"https://www.baidu.com"}),
}
request1 := NewRequest(server.URL+"/testHeader", GET, testParser, RequestWithRequestHeader(headers),RequestWithRequestBody(body))
request1 := NewRequest(server.URL+"/testHeader", GET, testParser, RequestWithRequestHeader(headers), RequestWithRequestBody(body))
ctx1 := NewContext(request1, spider1)

request2 := NewRequest(server.URL+"/testHeader", GET, testParser, RequestWithRequestHeader(headers),RequestWithRequestBody(body))
request2 := NewRequest(server.URL+"/testHeader", GET, testParser, RequestWithRequestHeader(headers), RequestWithRequestBody(body))
ctx2 := NewContext(request2, spider1)

request3 := NewRequest(server.URL+"/testHeader2", GET, testParser, RequestWithRequestHeader(headers))
ctx3 := NewContext(request3, spider1)

request4 := NewRequest(server.URL+"/testHeader", GET, testParser, RequestWithRequestHeader(headers),RequestWithRequestBody(body))
request4 := NewRequest(server.URL+"/testHeader", GET, testParser, RequestWithRequestHeader(headers), RequestWithRequestBody(body))
ctx4 := NewContext(request4, spider1)

duplicates := NewRFPDupeFilter(0.001,1024*1024)
duplicates := NewRFPDupeFilter(0.001, 1024*1024)

r1, err := duplicates.DoDupeFilter(ctx1)
convey.So(err,convey.ShouldBeNil)
convey.So(r1,convey.ShouldBeFalse)
convey.So(err, convey.ShouldBeNil)
convey.So(r1, convey.ShouldBeFalse)

r2, err := duplicates.DoDupeFilter(ctx2)
convey.So(err,convey.ShouldBeNil)
convey.So(r2,convey.ShouldBeTrue)
convey.So(err, convey.ShouldBeNil)
convey.So(r2, convey.ShouldBeTrue)

r3, err := duplicates.DoDupeFilter(ctx3)
convey.So(err,convey.ShouldBeNil)
convey.So(r3,convey.ShouldBeFalse)
convey.So(err, convey.ShouldBeNil)
convey.So(r3, convey.ShouldBeFalse)

r4, err := duplicates.DoDupeFilter(ctx4)
convey.So(err,convey.ShouldBeNil)
convey.So(r4,convey.ShouldBeTrue)
convey.So(err, convey.ShouldBeNil)
convey.So(r4, convey.ShouldBeTrue)
})

}
8 changes: 3 additions & 5 deletions engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,7 @@ func (e *CrawlEngine) writeCache(ctx *Context) error {
var err error = nil
// 是否进入去重流程
if e.filterDuplicateReq {
var ret bool = false
ret, err = e.rfpDupeFilter.DoDupeFilter(ctx)
ret, err := e.rfpDupeFilter.DoDupeFilter(ctx)
if err != nil {
isDuplicated = true
engineLog.WithField("request_id", ctx.CtxId).Errorf("request unique error %s", err.Error())
Expand Down Expand Up @@ -364,9 +363,8 @@ func (e *CrawlEngine) doDownload(ctx *Context) error {
}
// 增加请求发送量
e.statistic.IncrRequestSent()
var rsp *Response = nil
engineLog.WithField("request_id", ctx.CtxId).Infof("%s request ready to download", ctx.CtxId)
rsp, err = e.downloader.Download(ctx)
rsp, err := e.downloader.Download(ctx)
if err != nil {
return err
}
Expand Down Expand Up @@ -488,7 +486,7 @@ func NewEngine(opts ...EngineOption) *CrawlEngine {
checkMasterLive: func() (bool, error) { return true, nil },
limiter: NewDefaultLimiter(32),
downloader: NewDownloader(),
hooker: NewDefualtHooks(),
hooker: NewDefaultHooks(),
}
for _, o := range opts {
o(Engine)
Expand Down
12 changes: 6 additions & 6 deletions engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,14 +275,14 @@ func TestEngineStartPanic(t *testing.T) {
ctxManager.Clear()
}
engine := newTestEngine("testStartPanicSpider")
patch:=gomonkey.ApplyFunc((*Statistic).OutputStats,func (_ *Statistic)map[string]uint64 {
patch := gomonkey.ApplyFunc((*Statistic).OutputStats, func(_ *Statistic) map[string]uint64 {
panic("output panic")

})
defer patch.Reset()
f := func(){engine.start("testStartPanicSpider")}
f := func() { engine.start("testStartPanicSpider") }
convey.So(f, convey.ShouldPanic)
convey.So(engine.mutex.TryLock(),convey.ShouldBeTrue)
convey.So(engine.mutex.TryLock(), convey.ShouldBeTrue)
engine.Close()
})

Expand Down Expand Up @@ -412,7 +412,7 @@ func TestParseError(t *testing.T) {
ctx := NewContext(request, spider)
err = engine.doDownload(ctx)
convey.So(err, convey.ShouldBeNil)

err = engine.doParse(ctx)
convey.So(err, convey.ShouldNotBeNil)
convey.So(err.Error(), convey.ShouldContainSubstring, "parse response error")
Expand All @@ -421,7 +421,7 @@ func TestParseError(t *testing.T) {
}
func wokerError(ctx *Context, url string, errMsg string, t *testing.T, patch *gomonkey.Patches, engine *CrawlEngine) {
convey.Convey(fmt.Sprintf("test %s", errMsg), t, func() {
ctxPatch:=gomonkey.ApplyFunc((*Context).Close,func(_ *Context){})
ctxPatch := gomonkey.ApplyFunc((*Context).Close, func(_ *Context) {})
defer func() {
patch.Reset()
ctxPatch.Reset()
Expand Down
Loading

0 comments on commit 275fbb4

Please sign in to comment.