Add in README file
VikParuchuri committed Aug 9, 2022
1 parent cb9c6bc commit c5db8eb
Showing 6 changed files with 74 additions and 60 deletions.
45 changes: 45 additions & 0 deletions search/README.md
@@ -0,0 +1,45 @@
# Project Overview

In this project, we'll build a search engine that uses filtering to re-rank results. The engine will fetch results from the Google Custom Search API, store them, then re-rank them based on filters we define. We'll end up with a basic search page and results list; a rough sketch of the full flow appears after the project steps below.

We'll use PyCharm, a popular Python IDE, to write and run our code.

**Project Steps**

* Set up a programmable search engine with the Google [Custom Search API](https://developers.google.com/custom-search/v1/introduction)
* Create an [API key](https://console.cloud.google.com/apis/credentials) for the engine
* Create a module to search using the API
* Create a Flask application to search and render results
* Create filters to re-rank results before displaying them
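
To see the shape of the idea before writing any real code, here's a tiny, self-contained sketch of how the re-ranking works: results arrive from the API with a `rank` column, each filter adds a penalty, and we sort by the adjusted rank. The DataFrame, the `word_count` column, and the penalty rule below are made up for illustration; the real filters are built in `filter.py` later in the project.

```python
# Toy illustration of the re-ranking idea (not the project code itself).
import pandas as pd

RESULT_COUNT = 20  # mirrors the constant defined in settings.py

# Pretend these rows came back from the search API, already ranked 1..3.
results = pd.DataFrame({
    "link": ["https://a.example", "https://b.example", "https://c.example"],
    "rank": [1, 2, 3],
    "word_count": [1200, 80, 900],  # made-up stand-in signal for thin content
})

# A filter adds a large penalty to results we want pushed down the page.
penalty = (results["word_count"] < 200) * RESULT_COUNT
results["rank"] = results["rank"] + penalty

# Lower rank = shown first, so the thin page now sorts to the bottom.
print(results.sort_values("rank"))
```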


## Code

You can find the code for this project [here](https://github.com/dataquestio/project-walkthroughs/tree/master/search).

File overview:

* `app.py` - the web interface
* `filter.py` - the code to filter results
* `search.py` - code to get the search results
* `settings.py` - settings needed by the other files
* `storage.py` - code to save the results to a database

# Local Setup

## Installation

To follow this project, please install the following locally:

* Python 3.9+
* Required Python packages (`pip install -r requirements.txt`)

### Other setup

You will need to create a programmable search engine and get an API key by following [these directions](https://developers.google.com/custom-search/v1/introduction).
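
Once you have the API key and the search engine ID (the `cx` value), they go into `settings.py` as `SEARCH_KEY` and `SEARCH_ID`, which the other modules import. A minimal sketch of those lines, with placeholder values you'd swap for your own credentials:

```python
# settings.py (excerpt) -- placeholder values, replace with your own credentials.
SEARCH_KEY = "your-api-key"          # API key created in the Google Cloud console
SEARCH_ID = "your-search-engine-id"  # the cx id of your programmable search engine
```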

### Other files

You'll need to download a list of ad and tracker URLs from [here](https://raw.githubusercontent.com/notracking/hosts-blocklists/master/dnscrypt-proxy/dnscrypt-proxy.blacklist.txt). We'll use this to filter out domains that serve ads and trackers. Please save it as `blacklist.txt`.
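
If you'd rather fetch the blocklist from Python instead of the browser, one way to do it with `requests` (already in `requirements.txt`) is sketched below; the exact approach doesn't matter as long as the file ends up saved as `blacklist.txt`:

```python
# Download the ad/tracker blocklist and save it as blacklist.txt.
import requests

URL = (
    "https://raw.githubusercontent.com/notracking/hosts-blocklists/"
    "master/dnscrypt-proxy/dnscrypt-proxy.blacklist.txt"
)

response = requests.get(URL)
response.raise_for_status()  # fail loudly if the download didn't work

with open("blacklist.txt", "w", encoding="utf-8") as f:
    f.write(response.text)
```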

I also recommend copying the [storage.py](https://github.com/dataquestio/project-walkthroughs/blob/master/search/storage.py) file into your directory before we start the project.
35 changes: 17 additions & 18 deletions search/filter.py
@@ -1,15 +1,20 @@
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 from settings import *
 import re
 
-def count_trackers(row):
+with open("blacklist.txt") as f:
+    domains = set(f.read().split("\n"))
+
+def tracker_urls(row):
     soup = BeautifulSoup(row["html"])
-    domain = urlparse(row["link"]).hostname
     scripts = soup.find_all("script", {"src": True})
     srcs = [s.get("src") for s in scripts]
-    bad_srcs = [s for s in srcs if ".." not in s and domain not in s and "cdn" not in s]
-    return len(bad_srcs)
+
+    links = soup.find_all("a", {"href": True})
+    href = [l.get("href") for l in links]
+
+    all_domains = [urlparse(s).hostname for s in srcs + href]
+    return len([a for a in all_domains if a in domains])
 
 def get_page_content(row):
     soup = BeautifulSoup(row["html"])
@@ -20,29 +25,23 @@ class Filter():
     def __init__(self, results):
         self.filtered = results.copy()
 
-    def js_filter(self):
-        tracker_count = self.filtered.apply(count_trackers, axis=1)
+    def tracker_filter(self):
+        tracker_count = self.filtered.apply(tracker_urls, axis=1)
         tracker_count[tracker_count > tracker_count.median()] = RESULT_COUNT
-        self.filtered["rank"] += tracker_count
+        self.filtered["rank"] += tracker_count * 2
 
     def content_filter(self):
         page_content = self.filtered.apply(get_page_content, axis=1)
         word_count = page_content.apply(lambda x: len(x.split(" ")))
-        median = word_count.median()
-        word_count /= median
-
+        word_count /= word_count.median()
         word_count[word_count <= .5] = RESULT_COUNT
         word_count[word_count != RESULT_COUNT] = 0
         self.filtered["rank"] += word_count
 
     def year_filter(self):
         titles = self.filtered["title"]
         year_in_title = titles.apply(lambda x: len(re.findall(r"20\d{2}", x)))
         year_in_title[year_in_title > 0] = RESULT_COUNT
         self.filtered["rank"] += year_in_title
 
     def filter(self):
-        self.js_filter()
+        self.tracker_filter()
         self.content_filter()
         self.year_filter()
         self.filtered = self.filtered.sort_values("rank", ascending=True)
         self.filtered["rank"] = self.filtered["rank"].round()
         return self.filtered
3 changes: 2 additions & 1 deletion search/requirements.txt
@@ -1,4 +1,5 @@
 flask
 pandas
 requests
-beautifulsoup4
+beautifulsoup4
+adblockparser
47 changes: 7 additions & 40 deletions search/search.py
@@ -4,29 +4,18 @@
 import pandas as pd
 from storage import DBStorage
 from datetime import datetime
-import time
-
-"""
-def search_scrape(query):
-    res = list(gsearch(query, num=RESULT_COUNT))
-    res_df = pd.DataFrame({"link": res, "rank": list(range(1, len(res) + 1))})
-    return res_df
-"""
-
-def format_query(query):
-    return query.replace(" ", "+")
-
-def search_url(query, start=1):
-    query = format_query(query)
-    return SEARCH_URL.format(key=SEARCH_KEY, cx=SEARCH_ID, query=query, start=start)
+from urllib.parse import quote_plus
 
 def search_api(query, pages=int(RESULT_COUNT/10)):
-    query = format_query(query)
-
     results = []
     for i in range(0, pages):
         start = i*10+1
-        url = search_url(query, start=start)
+        url = SEARCH_URL.format(
+            key=SEARCH_KEY,
+            cx=SEARCH_ID,
+            query=quote_plus(query),
+            start=start
+        )
         response = requests.get(url)
         data = response.json()
         results += data["items"]
@@ -35,26 +24,6 @@ def search_api(query, pages=int(RESULT_COUNT/10)):
     res_df = res_df[["link", "rank", "snippet", "title"]]
     return res_df
 
-"""
-def scrape_page(links):
-    html = []
-    with sync_playwright() as p:
-        browser = p.chromium.launch()
-        page = browser.new_page()
-        for link in links:
-            print(link)
-            try:
-                page.goto(link, wait_until="load")
-                html.append(page.content())
-            except PlaywrightTimeoutError:
-                html.append("")
-            except PlaywrightError:
-                time.sleep(.5)
-                html.append(page.content())
-        browser.close()
-    return html
-"""
 
 def scrape_page(links):
     html = []
     for link in links:
@@ -72,8 +41,6 @@ def __init__(self):
         self.storage = DBStorage()
         self.columns = ["query", "rank", "link", "title", "snippet", "html", "created"]
 
-
-
     def search(self, query):
         stored_results = self.storage.query_results(query)
         if stored_results.shape[0] > 0:
3 changes: 2 additions & 1 deletion search/settings.py
@@ -1,6 +1,7 @@
 SEARCH_KEY = ""
 SEARCH_ID = ""
-SEARCH_URL = "https://www.googleapis.com/customsearch/v1?key={key}&cx={cx}&q={query}&start={start}&gl=us&num=10"
+COUNTRY = "us"
+SEARCH_URL = "https://www.googleapis.com/customsearch/v1?key={key}&cx={cx}&q={query}&start={start}&num=10&gl=" + COUNTRY
 RESULT_COUNT = 20
 
 import os
1 change: 1 addition & 0 deletions search/storage.py
@@ -18,6 +18,7 @@ def setup_tables(self):
             snippet TEXT,
             html TEXT,
             created DATETIME,
+            relevance INTEGER,
             UNIQUE(query, link)
         );
         """
