
Merge pull request IAmStoxe#17 from IAmStoxe/develop
Develop
IAmStoxe committed Jul 14, 2020
2 parents 2d3bea3 + ba4df20 commit bed0097
Showing 2 changed files with 65 additions and 7 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -68,6 +68,8 @@ Usage of urlgrab:
The amount of seconds before a request should timeout. (default 10)
-url string
The URL where we should start crawling.
-urls string
A file path that contains a list of urls to supply as starting urls. Requires the --root-domain flag.
-user-agent string
A user agent such as (Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0).
-verbose
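For reference, a hypothetical invocation pairing the new flag with its required companion (file name illustrative):

	urlgrab -urls targets.txt -root-domain example.com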
70 changes: 63 additions & 7 deletions main.go
@@ -1,6 +1,7 @@
package main

import (
"bufio"
"context"
"crypto/tls"
"encoding/json"
@@ -72,6 +73,7 @@ func main() {
suppliedProxy string
threadCount int
timeout int
urlsPath string
useRandomAgent bool
userAgent string
verbose bool
@@ -97,12 +99,17 @@
flag.StringVar(&rootDomain, "root-domain", "", "The root domain we should match links against.\nIf not specified it will default to the host of --url.\nExample: --root-domain google.com")
flag.StringVar(&startUrl, "url", "", "The URL where we should start crawling.")
flag.StringVar(&suppliedProxy, "proxy", "", "The SOCKS5 proxy to utilize (format: socks5://127.0.0.1:8080 OR http://127.0.0.1:8080).\nSupply multiple proxies by separating them with a comma.")
flag.StringVar(&urlsPath, "urls", "", "A file path that contains a list of urls to supply as starting urls.\nRequires --root-domain flag.")
flag.StringVar(&userAgent, "user-agent", "", "A user agent such as (Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0).")

flag.Parse()

setupLogging(verbose)

if urlsPath != "" && rootDomain == "" {
// If loading a bulk file you must provide the rootDomain flag
log.Fatal("If using bulk loading you must manually supply the root-domain flag!")
}
// Ensure that a protocol is specified
if !strings.HasPrefix(strings.ToUpper(startUrl), strings.ToUpper("HTTP")) {
startUrl = "https://" + startUrl
@@ -131,6 +138,10 @@ func main() {
// rootDomain wasn't supplied so use the root domain as the filter
// i.e. if abc.xyz.com is supplied, xyz.com will be the root domain
splitHost := strings.Split(parsedUrl.Host, ".")
if len(splitHost) < 2 {
// Not enough labels to derive a domain and TLD from the host
log.Fatalf("Failed to split host %s into domain parts", parsedUrl.Host)
}
rootDomainNameTld := splitHost[len(splitHost)-1]
rootDomainNameWithoutTld := splitHost[len(splitHost)-2]
rootDomainNameWithTld := fmt.Sprintf("%s.%s", rootDomainNameWithoutTld, rootDomainNameTld)
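A minimal, self-contained sketch of the root-domain heuristic above (an illustration for this commit page, not code from the diff) makes the behavior and its known limitation concrete:

	package main

	import (
		"fmt"
		"strings"
	)

	// rootDomain mirrors the logic above: keep only the last two dot-separated labels.
	func rootDomain(host string) string {
		labels := strings.Split(host, ".")
		if len(labels) < 2 {
			return host
		}
		return strings.Join(labels[len(labels)-2:], ".")
	}

	func main() {
		fmt.Println(rootDomain("abc.xyz.com")) // xyz.com
		fmt.Println(rootDomain("foo.co.uk"))   // co.uk -- multi-part public suffixes defeat the heuristic
	}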
@@ -374,7 +385,7 @@ func main() {

// Before making a request print "Visiting ..."
pageCollector.OnRequest(func(r *colly.Request) {

// If it's a javascript file, ensure we pass it to the proper collector
if strings.HasSuffix(r.URL.Path, ".js") {
err2 := jsCollector.Visit(r.URL.String())
if err2 != nil {
@@ -383,9 +394,12 @@

-// Send to jsCollector
-jsCollector.Visit(r.URL.String())
-r.Abort()

+// Cancel the request to ensure we don't process it on this collector
+r.Abort()
return
} else {
// Is it an image or similar? Don't request it.
var re = regexp.MustCompile(`(?m).*?\.*(jpg|png|gif|webp|tiff|psd|raw|bmp|heif|ico|css|pdf)(\?.*?|)$`)
matchString := re.MatchString(r.URL.Path)
if matchString {
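The body of this branch is collapsed in the next hunk. To see which paths the pattern skips, the regex can be exercised standalone (an illustrative sketch, not code from the diff); note that r.URL.Path never carries a query string, so the (\?.*?|) alternative only matters if the pattern is reused on full URLs:

	package main

	import (
		"fmt"
		"regexp"
	)

	func main() {
		re := regexp.MustCompile(`(?m).*?\.*(jpg|png|gif|webp|tiff|psd|raw|bmp|heif|ico|css|pdf)(\?.*?|)$`)
		for _, p := range []string{"/logo.png", "/style.css?v=2", "/app.js", "/page.html"} {
			fmt.Printf("%-16s skip=%v\n", p, re.MatchString(p)) // skips images, css, pdf; leaves .js and .html alone
		}
	}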
@@ -488,8 +502,33 @@ func main() {
}
}

-// Start scraping on our start URL
-pageCollector.Visit(startUrl)

// If a file path to load urls was supplied, load them and visit each;
// otherwise just visit the given start url.
if urlsPath != "" {
lines, err := readLines(urlsPath)
if err != nil {
log.Fatal(err)
}

loadedUrls, totalUrls := 0, 0

for _, line := range lines {
totalUrls++
u, err := url.Parse(line)
if err != nil {
log.Errorf("Failed to parse %s as a url", line)
continue
}
loadedUrls++
pageCollector.Visit(u.String())
}
log.Debugf("Loaded %v valid urls out of a total %v from the supplied file.")
} else if startUrl != "" {
pageCollector.Visit(startUrl)
} else {
// Neither startUrl nor urlsPath was supplied.
log.Fatal("You must supply either a starting url or a file path!")
}
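As an aside, the Visit call in the loop above discards colly's error return, while the jsCollector call earlier in this diff checks it; a variant that logs failures (a sketch, not code from the diff) would be:

	if err := pageCollector.Visit(u.String()); err != nil {
		log.Debugf("Failed to visit %s: %s", u.String(), err)
	}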

// Start both queues
pageQueue.Run(pageCollector)
@@ -594,8 +633,8 @@ func writeToJsonFile(outputPath string, data interface{}) {

_, err = f.WriteString(string(jsonData))
if err != nil {
-panic(err)
f.Close()
+panic(err)
return
}
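The reordering in this hunk closes the file handle before panicking (a plain panic would have skipped the Close). A defer-based variant gives the same guarantee without manual ordering, since deferred calls also run while a panic unwinds (a sketch, not code from the diff):

	package main

	import "os"

	// writeString closes f via defer, which executes even during a panic.
	func writeString(outputPath, s string) {
		f, err := os.Create(outputPath)
		if err != nil {
			panic(err)
		}
		defer f.Close()

		if _, err := f.WriteString(s); err != nil {
			panic(err)
		}
	}

	func main() { writeString("out.txt", "hello\n") }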

@@ -615,8 +654,8 @@ func writeLines(outputPath string, data []string) {
for i := 0; i < len(data); i++ {
_, err := f.WriteString(fmt.Sprintf("%s\n", data[i]))
if err != nil {
-panic(err)
f.Close()
+panic(err)
return
}
}
@@ -628,6 +667,23 @@ func writeLines(outputPath string, data []string) {
}
}

// readLines reads a whole file into memory
// and returns a slice of its lines.
func readLines(path string) ([]string, error) {
file, err := os.Open(path)
if err != nil {
return nil, err
}
defer file.Close()

var lines []string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
lines = append(lines, scanner.Text())
}
return lines, scanner.Err()
}
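One caveat with the helper above: bufio.Scanner fails with bufio.ErrTooLong on lines longer than 64KB (bufio.MaxScanTokenSize). If a urls file might contain longer lines, the buffer can be enlarged before scanning, as in this variant (a sketch, not code from the diff):

	// readLongLines is readLines with the scanner's per-line cap raised to 1MB.
	func readLongLines(path string) ([]string, error) {
		file, err := os.Open(path)
		if err != nil {
			return nil, err
		}
		defer file.Close()

		scanner := bufio.NewScanner(file)
		scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

		var lines []string
		for scanner.Scan() {
			lines = append(lines, scanner.Text())
		}
		return lines, scanner.Err()
	}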

func arrayContains(arr []string, str string) bool {
for _, a := range arr {
if a == str {
@@ -666,8 +722,8 @@ func getRenderedSource(url string) string {

// ensure the second tab is created
if err := chromedp.Run(newCtx); err != nil {
-log.Fatal(err)
newCtxCancel()
+log.Fatal(err)
}

// navigate to a page, and get its entire HTML
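The code after this comment is collapsed in the diff; a typical chromedp navigate-and-capture sequence (a sketch of the usual API, not necessarily this commit's exact code) looks like:

	var html string
	if err := chromedp.Run(newCtx,
		chromedp.Navigate(url),
		chromedp.OuterHTML("html", &html),
	); err != nil {
		log.Fatal(err)
	}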
