import os
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def scrape_craiglist(max_results=500, output_path="test_emails.txt"):
    """Scrape anonymized reply e-mail addresses from NYC Craigslist resume posts.

    Fetches the resume search listing, collects up to ``max_results`` post
    links, then drives a headless Chrome through each post's "reply" button to
    reveal the anonymized address, writing one address per line to
    ``output_path``.

    Args:
        max_results: Maximum number of listing links to process
            (the original counter was off by one and processed 502).
        output_path: File the scraped addresses are written to.
    """
    chromedriver = os.path.join(os.getcwd(), "chromedriver")
    os.environ["webdriver.chrome.driver"] = chromedriver

    options = Options()
    options.headless = True
    # Selenium 4 style: executable path goes through Service, options by
    # keyword (the old positional-path / chrome_options form was removed).
    driver = webdriver.Chrome(service=Service(chromedriver), options=options)

    try:
        URL = "https://newyork.craigslist.org/d/resumes/search/rrr"
        page = requests.get(URL, timeout=30)
        page.raise_for_status()  # fail loudly instead of parsing an error page

        soup = BeautifulSoup(page.content, "html.parser")
        results = soup.find(id="searchform")
        if results is None:
            return  # layout changed or the request was blocked — nothing to do
        # limit= caps the scan up front, replacing the original post-append
        # `if i > total_amt` counter that collected max_results + 2 links.
        job_elems = results.find_all("li", class_="result-row", limit=max_results)
        links = [elem.find("a")["href"] for elem in job_elems]

        # `with` guarantees the output file is closed even if a page errors out.
        with open(output_path, "w") as out:
            for link in links:
                driver.get(link)
                driver.find_element(
                    "css selector",
                    "button[role='button'][class='reply-button js-only']",
                ).click()
                time.sleep(3)  # give the reply widget time to load the address

                detail = BeautifulSoup(driver.page_source, "html.parser")
                fields = detail.find_all("input", class_="anonemail")
                if fields:  # some posts disable e-mail replies — skip, don't crash
                    out.write(fields[0]["value"])
                    out.write("\n")
    finally:
        driver.quit()  # always release the headless-Chrome process


if __name__ == "__main__":
    scrape_craiglist()
a/test_emails.txt b/test_emails.txt new file mode 100644 index 0000000..e0bdf9b --- /dev/null +++ b/test_emails.txt @@ -0,0 +1,54 @@ +e5a5c6c77e5c36cea095d4108b68b8b7@res.craigslist.org +8efa646bf29a3ab0b754bdf01c8c7933@res.craigslist.org +6318d8fb214e33cea508001bf533753f@res.craigslist.org +ebd42e00b83b3c35b58d88f640f9e416@res.craigslist.org +7aa06ac54ebf3ceeb9c9eb577c8da7bf@res.craigslist.org +f0356f8fcdde3af6b6e0753d9e81a8db@res.craigslist.org +0774f5ee29b23754ab17d5db0a989a63@res.craigslist.org +53eca06960de391ebc73c6772ed295b6@res.craigslist.org +573787492dbf375c97228928ed1b2622@res.craigslist.org +885804d795eb38d383a6fed48d4f1a3c@res.craigslist.org +5c5745b9a8693fceb6f9017073250a43@res.craigslist.org +e076f321661d3f62b2e670f67bbd76f4@res.craigslist.org +8e3731f144363d8fb81fc30d99c94767@res.craigslist.org +c5e4af2c85443af8908aeebde30c897a@res.craigslist.org +b6048e82bc54319c928f83858e28d981@res.craigslist.org +50bfa4132d3a33dea4be0eefefa90cee@res.craigslist.org +c9f4cdbf8a653d8da4fb419941a9b94f@res.craigslist.org +1b569a28d7b43fc6a87ee006b16334ea@res.craigslist.org +10a86c3664bb360eaca6fb3390c7fffc@res.craigslist.org +b1403860ab2c3e1dae6bd2a4f213cb4a@res.craigslist.org +d996d389e6c13ef29ffc007a9f316f05@res.craigslist.org +5469f107d8c33f5095f26c3c8d47a540@res.craigslist.org +e57d8aa0110e3e0d81f9fedcb1a2ba72@res.craigslist.org +b9a78e94f4b93ee197ce3869ef82e13a@res.craigslist.org +e8b2f91de340399ca29257fdfe982af0@res.craigslist.org +15e26664630234378b8f2313a06509f0@res.craigslist.org +b86c5dfa9bf531a7be88ad41353d8c81@res.craigslist.org +8e47603127613646bdfe26d589001dc7@res.craigslist.org +5a2a57f30a313e59b8a92685c0096192@res.craigslist.org +d08ce0bc18273217a296d2018b0ef081@res.craigslist.org +d2a9677b7c77347997038692f56806a2@res.craigslist.org +aa82c67ea0c530e1a64278ee75a41c2c@res.craigslist.org +91d2949bf8a83120b857e667b0fb3ba7@res.craigslist.org +c9b4dd03dc7e3a86b51981e81048fb9e@res.craigslist.org +4f2caf24192234ef94674f44b68ef61f@res.craigslist.org 
+210099263e203780b582513ec2554646@res.craigslist.org +94edafa98a6a32bd94513857ef9a5492@res.craigslist.org +973a8af30b913b8baf9c3ed084619553@res.craigslist.org +b32ba4213bd832f0a87a67d32fe30a9b@res.craigslist.org +164f0c29429438869284e7fb27771f46@res.craigslist.org +6471c440ce123650a758f0a6bdf941a6@res.craigslist.org +d80ba2f7528d314488dbeb5679a932d0@res.craigslist.org +71fe0895a58138148d4f0eb8cf6fe11a@res.craigslist.org +8107955d4727388eaba1db0415be89e6@res.craigslist.org +3a823698386a39668b0f915088b8d62a@res.craigslist.org +4f741910e8c5351c9504cc251a1b5208@res.craigslist.org +32672ec31a1f3407a0af8de914ba6a4a@res.craigslist.org +4a963fb622b23caa85181240ee37f0db@res.craigslist.org +023805ea58da35208f8bd10a287eab28@res.craigslist.org +11622a3c063033a79db3f01c7ed9e1f1@res.craigslist.org +7c19675699993d0f90529334b2e89004@res.craigslist.org +de093f5dd6003bfe9079bb715b94dacc@res.craigslist.org +a7ac5598a1323cc788243c6309bd90e7@res.craigslist.org +f2b1556032113876a8e0cac35016a2a2@res.craigslist.org diff --git a/wsb_scraper.py b/wsb_scraper.py new file mode 100644 index 0000000..71d763b --- /dev/null +++ b/wsb_scraper.py @@ -0,0 +1,40 @@ +import time +import requests +import os +import datetime +import heapq +from bs4 import BeautifulSoup +from selenium import webdriver +from selenium.webdriver.support.ui import Select +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.chrome.options import Options + + + +def scrape_wsb(): + chromedriver = os.getcwd() + "/chromedriver" + os.environ["webdriver.chrome.driver"] = chromedriver + #driver = webdriver.Chrome(chromedriver) + #driver.get("https://newyork.craigslist.org/d/resumes/search/rrr") + + options = Options() + options.headless = True + options.add_argument('--disable-extensions') + options.add_argument('--profile-directory=Default') + options.add_argument("--incognito") + options.add_argument("--disable-plugins-discovery") + 
options.add_argument("--start-maximized") + driver = webdriver.Chrome(chromedriver, chrome_options=options) + + links = [] + + URL = "https://old.reddit.com/r/wallstreetbets/" + page = requests.get(URL) + + #print(page.content) + soup = BeautifulSoup(page.content, 'html.parser') + print(soup) + +if __name__ == "__main__": + scrape_wsb() \ No newline at end of file