Add in README file
VikParuchuri committed Aug 9, 2022
1 parent cb9c6bc commit c5db8eb
Showing 6 changed files with 74 additions and 60 deletions.
45 changes: 45 additions & 0 deletions search/README.md
@@ -0,0 +1,45 @@
# Project Overview

In this project, we'll build a search engine that uses filtering to re-rank results. The engine will fetch results from the Google Custom Search API, store them, then re-rank them based on filters we define. We'll end up with a basic search page and results list; a rough sketch of the full flow appears after the project steps below.

We'll use PyCharm, a popular Python IDE, to write and run our code.

**Project Steps**

* Set up a programmable search engine with the Google [Custom Search API](https://developers.google.com/custom-search/v1/introduction)
* Create an [API key](https://console.cloud.google.com/apis/credentials) for the engine
* Create a module to search using the API
* Create a Flask application to search and render results
* Create filters to re-rank results before displaying them
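
To see the shape of the idea before writing any real code, here's a tiny, self-contained sketch of how the re-ranking works: results arrive from the API with a `rank` column, each filter adds a penalty, and we sort by the adjusted rank. The DataFrame, the `word_count` column, and the penalty rule below are made up for illustration; the real filters are built in `filter.py` later in the project.

```python
# Toy illustration of the re-ranking idea (not the project code itself).
import pandas as pd

RESULT_COUNT = 20  # mirrors the constant defined in settings.py

# Pretend these rows came back from the search API, already ranked 1..3.
results = pd.DataFrame({
    "link": ["https://a.example", "https://b.example", "https://c.example"],
    "rank": [1, 2, 3],
    "word_count": [1200, 80, 900],  # made-up stand-in signal for thin content
})

# A filter adds a large penalty to results we want pushed down the page.
penalty = (results["word_count"] < 200) * RESULT_COUNT
results["rank"] = results["rank"] + penalty

# Lower rank = shown first, so the thin page now sorts to the bottom.
print(results.sort_values("rank"))
```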


## Code

You can find the code for this project [here](https://github.com/dataquestio/project-walkthroughs/tree/master/search).

File overview:

* `app.py` - the web interface
* `filter.py` - the code to filter results
* `search.py` - code to get the search results
* `settings.py` - settings needed by the other files
* `storage.py` - code to save the results to a database

# Local Setup

## Installation

To follow this project, please install the following locally:

* Python 3.9+
* Required Python packages (`pip install -r requirements.txt`)

### Other setup

You will need to create a programmable search engine and get an API key by following [these directions](https://developers.google.com/custom-search/v1/introduction).
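
Once you have the API key and the search engine ID (the `cx` value), they go into `settings.py` as `SEARCH_KEY` and `SEARCH_ID`, which the other modules import. A minimal sketch of those lines, with placeholder values you'd swap for your own credentials:

```python
# settings.py (excerpt) -- placeholder values, replace with your own credentials.
SEARCH_KEY = "your-api-key"          # API key created in the Google Cloud console
SEARCH_ID = "your-search-engine-id"  # the cx id of your programmable search engine
```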

### Other files

You'll need to download a list of ad and tracker URLs from [here](https://raw.githubusercontent.com/notracking/hosts-blocklists/master/dnscrypt-proxy/dnscrypt-proxy.blacklist.txt). We'll use this to filter out domains that serve ads and trackers. Please save it as `blacklist.txt`.
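
If you'd rather fetch the blocklist from Python instead of the browser, one way to do it with `requests` (already in `requirements.txt`) is sketched below; the exact approach doesn't matter as long as the file ends up saved as `blacklist.txt`:

```python
# Download the ad/tracker blocklist and save it as blacklist.txt.
import requests

URL = (
    "https://raw.githubusercontent.com/notracking/hosts-blocklists/"
    "master/dnscrypt-proxy/dnscrypt-proxy.blacklist.txt"
)

response = requests.get(URL)
response.raise_for_status()  # fail loudly if the download didn't work

with open("blacklist.txt", "w", encoding="utf-8") as f:
    f.write(response.text)
```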

I also recommend copying the [storage.py](https://github.com/dataquestio/project-walkthroughs/blob/master/search/storage.py) file into your directory before we start the project.
35 changes: 17 additions & 18 deletions search/filter.py
@@ -1,15 +1,20 @@
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 from settings import *
 import re
 
-def count_trackers(row):
+with open("blacklist.txt") as f:
+    domains = set(f.read().split("\n"))
+
+def tracker_urls(row):
     soup = BeautifulSoup(row["html"])
-    domain = urlparse(row["link"]).hostname
     scripts = soup.find_all("script", {"src": True})
     srcs = [s.get("src") for s in scripts]
-    bad_srcs = [s for s in srcs if ".." not in s and domain not in s and "cdn" not in s]
-    return len(bad_srcs)
+
+    links = soup.find_all("a", {"href": True})
+    href = [l.get("href") for l in links]
+
+    all_domains = [urlparse(s).hostname for s in srcs + href]
+    return len([a for a in all_domains if a in domains])
 
 def get_page_content(row):
     soup = BeautifulSoup(row["html"])
@@ -20,29 +25,23 @@ class Filter():
     def __init__(self, results):
         self.filtered = results.copy()
 
-    def js_filter(self):
-        tracker_count = self.filtered.apply(count_trackers, axis=1)
+    def tracker_filter(self):
+        tracker_count = self.filtered.apply(tracker_urls, axis=1)
         tracker_count[tracker_count > tracker_count.median()] = RESULT_COUNT
-        self.filtered["rank"] += tracker_count
+        self.filtered["rank"] += tracker_count * 2
 
     def content_filter(self):
         page_content = self.filtered.apply(get_page_content, axis=1)
         word_count = page_content.apply(lambda x: len(x.split(" ")))
-        median = word_count.median()
-        word_count /= median
-
+        word_count /= word_count.median()
         word_count[word_count <= .5] = RESULT_COUNT
         word_count[word_count != RESULT_COUNT] = 0
         self.filtered["rank"] += word_count
 
     def year_filter(self):
         titles = self.filtered["title"]
         year_in_title = titles.apply(lambda x: len(re.findall(r"20\d{2}", x)))
         year_in_title[year_in_title > 0] = RESULT_COUNT
         self.filtered["rank"] += year_in_title
 
     def filter(self):
-        self.js_filter()
+        self.tracker_filter()
         self.content_filter()
         self.year_filter()
         self.filtered = self.filtered.sort_values("rank", ascending=True)
         self.filtered["rank"] = self.filtered["rank"].round()
         return self.filtered
3 changes: 2 additions & 1 deletion search/requirements.txt
@@ -1,4 +1,5 @@
 flask
 pandas
 requests
-beautifulsoup4
+beautifulsoup4
+adblockparser
47 changes: 7 additions & 40 deletions search/search.py
@@ -4,29 +4,18 @@
 import pandas as pd
 from storage import DBStorage
 from datetime import datetime
-import time
-
-"""
-def search_scrape(query):
-    res = list(gsearch(query, num=RESULT_COUNT))
-    res_df = pd.DataFrame({"link": res, "rank": list(range(1, len(res) + 1))})
-    return res_df
-"""
-
-def format_query(query):
-    return query.replace(" ", "+")
-
-def search_url(query, start=1):
-    query = format_query(query)
-    return SEARCH_URL.format(key=SEARCH_KEY, cx=SEARCH_ID, query=query, start=start)
+from urllib.parse import quote_plus
 
 def search_api(query, pages=int(RESULT_COUNT/10)):
-    query = format_query(query)
-
     results = []
     for i in range(0, pages):
         start = i*10+1
-        url = search_url(query, start=start)
+        url = SEARCH_URL.format(
+            key=SEARCH_KEY,
+            cx=SEARCH_ID,
+            query=quote_plus(query),
+            start=start
+        )
         response = requests.get(url)
         data = response.json()
         results += data["items"]
@@ -35,26 +24,6 @@ def search_api(query, pages=int(RESULT_COUNT/10)):
     res_df = res_df[["link", "rank", "snippet", "title"]]
     return res_df
 
-"""
-def scrape_page(links):
-    html = []
-    with sync_playwright() as p:
-        browser = p.chromium.launch()
-        page = browser.new_page()
-        for link in links:
-            print(link)
-            try:
-                page.goto(link, wait_until="load")
-                html.append(page.content())
-            except PlaywrightTimeoutError:
-                html.append("")
-            except PlaywrightError:
-                time.sleep(.5)
-                html.append(page.content())
-        browser.close()
-    return html
-"""
 
 def scrape_page(links):
     html = []
     for link in links:
@@ -72,8 +41,6 @@ def __init__(self):
         self.storage = DBStorage()
         self.columns = ["query", "rank", "link", "title", "snippet", "html", "created"]
 
-
-
     def search(self, query):
         stored_results = self.storage.query_results(query)
         if stored_results.shape[0] > 0:
3 changes: 2 additions & 1 deletion search/settings.py
@@ -1,6 +1,7 @@
 SEARCH_KEY = ""
 SEARCH_ID = ""
-SEARCH_URL = "https://www.googleapis.com/customsearch/v1?key={key}&cx={cx}&q={query}&start={start}&gl=us&num=10"
+COUNTRY = "us"
+SEARCH_URL = "https://www.googleapis.com/customsearch/v1?key={key}&cx={cx}&q={query}&start={start}&num=10&gl=" + COUNTRY
 RESULT_COUNT = 20
 
 import os
1 change: 1 addition & 0 deletions search/storage.py
@@ -18,6 +18,7 @@ def setup_tables(self):
             snippet TEXT,
             html TEXT,
             created DATETIME,
+            relevance INTEGER,
             UNIQUE(query, link)
         );
         """
