Skip to content

Commit

Permalink
Initial Commit
Browse files Browse the repository at this point in the history
  • Loading branch information
dag157 committed Feb 17, 2021
1 parent 0c5a4b7 commit 4604230
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 0 deletions.
Binary file added chromedriver
Binary file not shown.
Binary file added chromedriver_mac64 (1).zip
Binary file not shown.
61 changes: 61 additions & 0 deletions email_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import time
import requests
import os
import datetime
import heapq
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options



def scrape_craiglist(
    url="https://newyork.craigslist.org/d/resumes/search/rrr",
    total_amt=500,
    output_path="test_emails.txt",
):
    """Collect the reply addresses of Craigslist listings into a text file.

    The search page at *url* is downloaded with ``requests`` and every
    ``li.result-row`` inside the ``#searchform`` element contributes the
    ``href`` of its first anchor. Each collected link is then opened in a
    headless Chrome session, the reply button is clicked, and the ``value``
    of the first ``input.anonemail`` element is written to *output_path*,
    one address per line.

    Args:
        url: Search-results page to read listing links from.
        total_amt: Maximum number of listing links to visit.
        output_path: File that receives the addresses; it is overwritten.

    Raises:
        requests.RequestException: If the search page cannot be fetched or
            the server answers with an error status.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from selenium.common.exceptions import NoSuchElementException

    chromedriver = os.path.join(os.getcwd(), "chromedriver")
    os.environ["webdriver.chrome.driver"] = chromedriver

    # Fetch and parse the listing page before starting a browser, so a
    # network failure does not leave a Chrome process behind.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(id='searchform')
    # A missing search form means there is nothing to scrape, not a crash.
    job_elems = [] if results is None else results.find_all('li', class_='result-row')

    links = []
    for job_elem in job_elems:
        # Check the limit before appending so at most total_amt links are kept.
        if len(links) >= total_amt:
            break
        anchor = job_elem.find('a')
        if anchor is None or not anchor.get('href'):
            continue
        links.append(anchor['href'])

    options = Options()
    # "--headless" as an argument works on both old and new Selenium releases,
    # unlike the `options.headless` attribute.
    options.add_argument("--headless")
    # NOTE(review): newer Selenium releases (4.10+, as far as I know) no longer
    # accept the driver path positionally and expect a Service object instead;
    # confirm against the Selenium version pinned for this project.
    driver = webdriver.Chrome(chromedriver, options=options)

    try:
        with open(output_path, "w") as out_file:
            for link in links:
                driver.get(link)
                try:
                    driver.find_element(
                        By.CSS_SELECTOR,
                        "button[role='button'][class='reply-button js-only']",
                    ).click()
                except NoSuchElementException:
                    # Listing without a reply button: skip it, keep going.
                    continue

                # Give the page time to update after the click.
                time.sleep(3)

                listing_soup = BeautifulSoup(driver.page_source, 'html.parser')
                email_inputs = listing_soup.find_all('input', class_='anonemail')
                if not email_inputs or not email_inputs[0].get('value'):
                    # No relay address was exposed for this listing.
                    continue
                out_file.write(email_inputs[0]['value'])
                out_file.write("\n")
    finally:
        # Always shut the browser down, even when a listing fails.
        driver.quit()

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    scrape_craiglist()
54 changes: 54 additions & 0 deletions test_emails.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
e5a5c6c77e5c36cea095d4108b68b8b7@res.craigslist.org
8efa646bf29a3ab0b754bdf01c8c7933@res.craigslist.org
6318d8fb214e33cea508001bf533753f@res.craigslist.org
ebd42e00b83b3c35b58d88f640f9e416@res.craigslist.org
7aa06ac54ebf3ceeb9c9eb577c8da7bf@res.craigslist.org
f0356f8fcdde3af6b6e0753d9e81a8db@res.craigslist.org
0774f5ee29b23754ab17d5db0a989a63@res.craigslist.org
53eca06960de391ebc73c6772ed295b6@res.craigslist.org
573787492dbf375c97228928ed1b2622@res.craigslist.org
885804d795eb38d383a6fed48d4f1a3c@res.craigslist.org
5c5745b9a8693fceb6f9017073250a43@res.craigslist.org
e076f321661d3f62b2e670f67bbd76f4@res.craigslist.org
8e3731f144363d8fb81fc30d99c94767@res.craigslist.org
c5e4af2c85443af8908aeebde30c897a@res.craigslist.org
b6048e82bc54319c928f83858e28d981@res.craigslist.org
50bfa4132d3a33dea4be0eefefa90cee@res.craigslist.org
c9f4cdbf8a653d8da4fb419941a9b94f@res.craigslist.org
1b569a28d7b43fc6a87ee006b16334ea@res.craigslist.org
10a86c3664bb360eaca6fb3390c7fffc@res.craigslist.org
b1403860ab2c3e1dae6bd2a4f213cb4a@res.craigslist.org
d996d389e6c13ef29ffc007a9f316f05@res.craigslist.org
5469f107d8c33f5095f26c3c8d47a540@res.craigslist.org
e57d8aa0110e3e0d81f9fedcb1a2ba72@res.craigslist.org
b9a78e94f4b93ee197ce3869ef82e13a@res.craigslist.org
e8b2f91de340399ca29257fdfe982af0@res.craigslist.org
15e26664630234378b8f2313a06509f0@res.craigslist.org
b86c5dfa9bf531a7be88ad41353d8c81@res.craigslist.org
8e47603127613646bdfe26d589001dc7@res.craigslist.org
5a2a57f30a313e59b8a92685c0096192@res.craigslist.org
d08ce0bc18273217a296d2018b0ef081@res.craigslist.org
d2a9677b7c77347997038692f56806a2@res.craigslist.org
aa82c67ea0c530e1a64278ee75a41c2c@res.craigslist.org
91d2949bf8a83120b857e667b0fb3ba7@res.craigslist.org
c9b4dd03dc7e3a86b51981e81048fb9e@res.craigslist.org
4f2caf24192234ef94674f44b68ef61f@res.craigslist.org
210099263e203780b582513ec2554646@res.craigslist.org
94edafa98a6a32bd94513857ef9a5492@res.craigslist.org
973a8af30b913b8baf9c3ed084619553@res.craigslist.org
b32ba4213bd832f0a87a67d32fe30a9b@res.craigslist.org
164f0c29429438869284e7fb27771f46@res.craigslist.org
6471c440ce123650a758f0a6bdf941a6@res.craigslist.org
d80ba2f7528d314488dbeb5679a932d0@res.craigslist.org
71fe0895a58138148d4f0eb8cf6fe11a@res.craigslist.org
8107955d4727388eaba1db0415be89e6@res.craigslist.org
3a823698386a39668b0f915088b8d62a@res.craigslist.org
4f741910e8c5351c9504cc251a1b5208@res.craigslist.org
32672ec31a1f3407a0af8de914ba6a4a@res.craigslist.org
4a963fb622b23caa85181240ee37f0db@res.craigslist.org
023805ea58da35208f8bd10a287eab28@res.craigslist.org
11622a3c063033a79db3f01c7ed9e1f1@res.craigslist.org
7c19675699993d0f90529334b2e89004@res.craigslist.org
de093f5dd6003bfe9079bb715b94dacc@res.craigslist.org
a7ac5598a1323cc788243c6309bd90e7@res.craigslist.org
f2b1556032113876a8e0cac35016a2a2@res.craigslist.org
40 changes: 40 additions & 0 deletions wsb_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import time
import requests
import os
import datetime
import heapq
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options



def scrape_wsb():
    """Download the old-Reddit r/wallstreetbets front page and print its HTML.

    A headless Chrome session is started from the ``chromedriver`` binary in
    the current working directory, but the page itself is fetched with
    ``requests`` and parsed with BeautifulSoup; the parsed document is
    printed to stdout. The browser is closed before the function returns.
    """
    chromedriver = os.getcwd() + "/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver

    options = Options()
    # "--headless" as an argument works on both old and new Selenium releases,
    # unlike the `options.headless` attribute.
    options.add_argument("--headless")
    options.add_argument('--disable-extensions')
    options.add_argument('--profile-directory=Default')
    options.add_argument("--incognito")
    options.add_argument("--disable-plugins-discovery")
    options.add_argument("--start-maximized")
    # NOTE(review): newer Selenium releases (4.10+, as far as I know) no longer
    # accept the driver path positionally and expect a Service object instead;
    # confirm against the Selenium version pinned for this project.
    driver = webdriver.Chrome(chromedriver, options=options)

    try:
        URL = "https://old.reddit.com/r/wallstreetbets/"
        # A timeout keeps the script from hanging on an unresponsive server.
        page = requests.get(URL, timeout=30)

        soup = BeautifulSoup(page.content, 'html.parser')
        print(soup)
    finally:
        # The browser is not used for navigation yet; make sure the process
        # is not left running once the function finishes.
        driver.quit()

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    scrape_wsb()

0 comments on commit 4604230

Please sign in to comment.