
Merge pull request IAmStoxe#17 from IAmStoxe/develop
Develop
IAmStoxe committed Jul 14, 2020
2 parents 2d3bea3 + ba4df20 commit bed0097
Showing 2 changed files with 65 additions and 7 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -68,6 +68,8 @@ Usage of urlgrab:
The amount of seconds before a request should timeout. (default 10)
-url string
The URL where we should start crawling.
-urls string
A file path that contains a list of urls to supply as starting urls. Requires the --root-domain flag.
-user-agent string
A user agent such as (Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0).
-verbose
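For reference, a hypothetical invocation pairing the new flag with its required companion (file name illustrative):

	urlgrab -urls targets.txt -root-domain example.com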
70 changes: 63 additions & 7 deletions main.go
@@ -1,6 +1,7 @@
package main

import (
"bufio"
"context"
"crypto/tls"
"encoding/json"
@@ -72,6 +73,7 @@ func main() {
suppliedProxy string
threadCount int
timeout int
urlsPath string
useRandomAgent bool
userAgent string
verbose bool
@@ -97,12 +99,17 @@
flag.StringVar(&rootDomain, "root-domain", "", "The root domain we should match links against.\nIf not specified it will default to the host of --url.\nExample: --root-domain google.com")
flag.StringVar(&startUrl, "url", "", "The URL where we should start crawling.")
flag.StringVar(&suppliedProxy, "proxy", "", "The SOCKS5 proxy to utilize (format: socks5://127.0.0.1:8080 OR http://127.0.0.1:8080).\nSupply multiple proxies by separating them with a comma.")
flag.StringVar(&urlsPath, "urls", "", "A file path that contains a list of urls to supply as starting urls.\nRequires --root-domain flag.")
flag.StringVar(&userAgent, "user-agent", "", "A user agent such as (Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0).")

flag.Parse()

setupLogging(verbose)

if urlsPath != "" && rootDomain == "" {
// If loading a bulk file you must provide the rootDomain flag
log.Fatal("If using bulk loading you must manually supply the root-domain flag!")
}
// Ensure that a protocol is specified
if !strings.HasPrefix(strings.ToUpper(startUrl), strings.ToUpper("HTTP")) {
startUrl = "https://" + startUrl
@@ -131,6 +138,10 @@ func main() {
// rootDomain wasn't supplied so use the root domain as the filter
// i.e. if abc.xyz.com is supplied, xyz.com will be the root domain
splitHost := strings.Split(parsedUrl.Host, ".")
if len(splitHost) < 2 {
// Not enough labels to derive a domain and TLD from the host
log.Fatalf("Failed to split host %s into domain parts", parsedUrl.Host)
}
rootDomainNameTld := splitHost[len(splitHost)-1]
rootDomainNameWithoutTld := splitHost[len(splitHost)-2]
rootDomainNameWithTld := fmt.Sprintf("%s.%s", rootDomainNameWithoutTld, rootDomainNameTld)
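A minimal, self-contained sketch of the root-domain heuristic above (an illustration for this commit page, not code from the diff) makes the behavior and its known limitation concrete:

	package main

	import (
		"fmt"
		"strings"
	)

	// rootDomain mirrors the logic above: keep only the last two dot-separated labels.
	func rootDomain(host string) string {
		labels := strings.Split(host, ".")
		if len(labels) < 2 {
			return host
		}
		return strings.Join(labels[len(labels)-2:], ".")
	}

	func main() {
		fmt.Println(rootDomain("abc.xyz.com")) // xyz.com
		fmt.Println(rootDomain("foo.co.uk"))   // co.uk -- multi-part public suffixes defeat the heuristic
	}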
@@ -374,7 +385,7 @@ func main() {

// Before making a request print "Visiting ..."
pageCollector.OnRequest(func(r *colly.Request) {

// If it's a javascript file, ensure we pass it to the proper collector
if strings.HasSuffix(r.URL.Path, ".js") {
err2 := jsCollector.Visit(r.URL.String())
if err2 != nil {
@@ -383,9 +394,12 @@

-// Send to jsCollector
-jsCollector.Visit(r.URL.String())
-r.Abort()

+// Cancel the request to ensure we don't process it on this collector
+r.Abort()
return
} else {
// Is it an image or similar? Don't request it.
var re = regexp.MustCompile(`(?m).*?\.*(jpg|png|gif|webp|tiff|psd|raw|bmp|heif|ico|css|pdf)(\?.*?|)$`)
matchString := re.MatchString(r.URL.Path)
if matchString {
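The body of this branch is collapsed in the next hunk. To see which paths the pattern skips, the regex can be exercised standalone (an illustrative sketch, not code from the diff); note that r.URL.Path never carries a query string, so the (\?.*?|) alternative only matters if the pattern is reused on full URLs:

	package main

	import (
		"fmt"
		"regexp"
	)

	func main() {
		re := regexp.MustCompile(`(?m).*?\.*(jpg|png|gif|webp|tiff|psd|raw|bmp|heif|ico|css|pdf)(\?.*?|)$`)
		for _, p := range []string{"/logo.png", "/style.css?v=2", "/app.js", "/page.html"} {
			fmt.Printf("%-16s skip=%v\n", p, re.MatchString(p)) // skips images, css, pdf; leaves .js and .html alone
		}
	}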
@@ -488,8 +502,33 @@ func main() {
}
}

-// Start scraping on our start URL
-pageCollector.Visit(startUrl)

// If a file path to load urls was supplied, load them and visit each;
// otherwise just visit the given start url.
if urlsPath != "" {
lines, err := readLines(urlsPath)
if err != nil {
log.Fatal(err)
}

loadedUrls, totalUrls := 0, 0

for _, line := range lines {
totalUrls++
u, err := url.Parse(line)
if err != nil {
log.Errorf("Failed to parse %s as a url", line)
continue
}
loadedUrls++
pageCollector.Visit(u.String())
}
log.Debugf("Loaded %v valid urls out of a total %v from the supplied file.")
} else if startUrl != "" {
pageCollector.Visit(startUrl)
} else {
// Neither startUrl nor urlsPath was supplied.
log.Fatal("You must supply either a starting url or a file path!")
}
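As an aside, the Visit call in the loop above discards colly's error return, while the jsCollector call earlier in this diff checks it; a variant that logs failures (a sketch, not code from the diff) would be:

	if err := pageCollector.Visit(u.String()); err != nil {
		log.Debugf("Failed to visit %s: %s", u.String(), err)
	}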

// Start both queues
pageQueue.Run(pageCollector)
@@ -594,8 +633,8 @@ func writeToJsonFile(outputPath string, data interface{}) {

_, err = f.WriteString(string(jsonData))
if err != nil {
-panic(err)
f.Close()
+panic(err)
return
}
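The reordering in this hunk closes the file handle before panicking (a plain panic would have skipped the Close). A defer-based variant gives the same guarantee without manual ordering, since deferred calls also run while a panic unwinds (a sketch, not code from the diff):

	package main

	import "os"

	// writeString closes f via defer, which executes even during a panic.
	func writeString(outputPath, s string) {
		f, err := os.Create(outputPath)
		if err != nil {
			panic(err)
		}
		defer f.Close()

		if _, err := f.WriteString(s); err != nil {
			panic(err)
		}
	}

	func main() { writeString("out.txt", "hello\n") }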

@@ -615,8 +654,8 @@ func writeLines(outputPath string, data []string) {
for i := 0; i < len(data); i++ {
_, err := f.WriteString(fmt.Sprintf("%s\n", data[i]))
if err != nil {
-panic(err)
f.Close()
+panic(err)
return
}
}
@@ -628,6 +667,23 @@ func writeLines(outputPath string, data []string) {
}
}

// readLines reads a whole file into memory
// and returns a slice of its lines.
func readLines(path string) ([]string, error) {
file, err := os.Open(path)
if err != nil {
return nil, err
}
defer file.Close()

var lines []string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
lines = append(lines, scanner.Text())
}
return lines, scanner.Err()
}
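One caveat with the helper above: bufio.Scanner fails with bufio.ErrTooLong on lines longer than 64KB (bufio.MaxScanTokenSize). If a urls file might contain longer lines, the buffer can be enlarged before scanning, as in this variant (a sketch, not code from the diff):

	// readLongLines is readLines with the scanner's per-line cap raised to 1MB.
	func readLongLines(path string) ([]string, error) {
		file, err := os.Open(path)
		if err != nil {
			return nil, err
		}
		defer file.Close()

		scanner := bufio.NewScanner(file)
		scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

		var lines []string
		for scanner.Scan() {
			lines = append(lines, scanner.Text())
		}
		return lines, scanner.Err()
	}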

func arrayContains(arr []string, str string) bool {
for _, a := range arr {
if a == str {
@@ -666,8 +722,8 @@ func getRenderedSource(url string) string {

// ensure the second tab is created
if err := chromedp.Run(newCtx); err != nil {
-log.Fatal(err)
newCtxCancel()
+log.Fatal(err)
}

// navigate to a page, and get its entire HTML
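The code after this comment is collapsed in the diff; a typical chromedp navigate-and-capture sequence (a sketch of the usual API, not necessarily this commit's exact code) looks like:

	var html string
	if err := chromedp.Run(newCtx,
		chromedp.Navigate(url),
		chromedp.OuterHTML("html", &html),
	); err != nil {
		log.Fatal(err)
	}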
