commit 5a4eea1 (0 parents)
Showing 2 changed files with 282 additions and 0 deletions.
email_extractor.py (per the __thisfile__ URL below)
@@ -0,0 +1,180 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__thisfile__ = "http://www.jaist.ac.jp/~s1010205/email_extractor/email_extractor.py"
""" | ||
Web Data Extractor, extract emails by sitecrawl | ||
Copyright (C) 2011 KATHURIA Pulkit | ||
Contact: pulkit@jaist.ac.jp | ||
Contributors: | ||
Open Source Sitemap Generator sitemap_gen by Vladimir Toncar | ||
http://toncar.cz/opensource/sitemap_gen.html | ||
This program is free software; you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation; either version 3 of the License, or | ||
(at your option) any later version. | ||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
You should have received a copy of the GNU General Public License | ||
along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
""" | ||
# Python 2 standard library: urllib/urllib2, urlparse, HTMLParser, robotparser.
import re
from urllib import urlopen
from collections import defaultdict   # used by the disabled CLI block below
import argparse                       # used by the disabled CLI block below
import urllib2
import urlparse
from HTMLParser import HTMLParser
from HTMLParser import HTMLParseError
import robotparser


def getPage(url):
    """Fetch url; return (contents, last_modified_date, final_url)."""
    try:
        f = urllib2.urlopen(url)
        page = f.read()
        date = f.info().getdate('Last-Modified')
        if date is None:
            date = (0, 0, 0)
        else:
            date = date[:3]   # keep (year, month, day)
        f.close()
        return (page, date, f.url)
    except urllib2.URLError:
        return (None, (0, 0, 0), "")


def joinUrls(baseUrl, newUrl):
    # Strip any #fragment, then resolve the link relative to the base URL.
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)


def getRobotParser(startUrl):
    rp = robotparser.RobotFileParser()
    robotUrl = urlparse.urljoin(startUrl, "/robots.txt")
    page, date, url = getPage(robotUrl)
    if page is None:
        return None
    rp.parse(page.splitlines())   # parse() expects a sequence of lines, not one string
    return rp


class MyHTMLParser(HTMLParser):
    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urlparse.urlsplit(baseUrl)[1]   # netloc
        self.maxUrls = maxUrls
        self.blockExtensions = blockExtensions
        self.robotParser = robotParser

    def hasBlockedExtension(self, url):
        p = urlparse.urlparse(url)
        path = p[2].upper()   # path component
        for i in self.blockExtensions:
            if path.endswith(i):
                return 1
        return 0

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return
        if tag.upper() == "BASE":
            if attrs and attrs[0][0].upper() == "HREF":   # guard against a bare <base>
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
        if tag.upper() == "A":
            url = ""
            for attr in attrs:
                if attr[0].upper() == "REL" and attr[1].upper().find('NOFOLLOW') != -1:
                    return   # honor rel="nofollow"
                elif attr[0].upper() == "HREF" and attr[1].upper().find('MAILTO:') == -1:
                    url = joinUrls(self.baseUrl, attr[1])
            if url == "":
                return
            if urlparse.urlsplit(url)[1] != self.server:
                return   # stay on the starting host
            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return
            if (self.robotParser is not None) and not self.robotParser.can_fetch("*", url):
                return
            if url not in self.pageMap:
                self.pageMap[url] = ()   # empty tuple marks "not yet crawled"


def getUrlToProcess(pageMap):
    # Return any URL not yet crawled (still mapped to the empty tuple).
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None


def parsePages(startUrl, maxUrls, blockExtensions):
    pageMap = {}
    pageMap[startUrl] = ()
    redirects = []
    robotParser = getRobotParser(startUrl)

    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            # The request was redirected; crawl the target instead.
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page)
                parser.close()
            except (HTMLParseError, UnicodeDecodeError):
                pass   # skip pages that cannot be parsed or decoded
    return pageMap


def grab_email(text):
    # `text` is an iterable of lines; returns the unique addresses found.
    mailsrch = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
    found = []
    for line in text:
        found.extend(mailsrch.findall(line))
    return list(set(found))   # de-duplicate

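# Illustrative example (not from the source): passing the single line
#   ["contact: john.doe@example.com or sales@example.com"]
# to grab_email() yields ['john.doe@example.com', 'sales@example.com'],
# in arbitrary order because the de-duplication is set-based.
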
def urltext(url):
    # Return the page at `url` as a list of lines, ready for grab_email().
    return urlopen(url).readlines()


def crawl_site(url, limit):
    # No blocked extensions: parsePages() expects a sequence of extensions
    # here, not the string 'None', which would be tested character by character.
    return parsePages(url, limit, [])


# Command-line interface (disabled in this commit):
'''if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Web Email Extractor')
    parser.add_argument('-l', '--limit', action="store", default=100, dest="limit", type=int, help='-l numUrlsToCrawl')
    parser.add_argument('-u', '--url', action="store", dest="url", help='-u http://sitename.com')
    myarguments = parser.parse_args()
    emails = defaultdict(int)
    for url in crawl_site(myarguments.url, myarguments.limit):
        for email in grab_email(urltext(url)):
            if email not in emails: print email
            emails[email] += 1
'''
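For reference, a minimal usage sketch of the module above. This snippet is not part of the commit; it assumes the module is importable as email_extractor, reuses the placeholder URL from the CLI help text, and picks an arbitrary 10-page crawl limit:

# Hypothetical usage of email_extractor (Python 2):
import email_extractor

for url in email_extractor.crawl_site('http://sitename.com', 10):
    for address in email_extractor.grab_email(email_extractor.urltext(url)):
        print address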
(second file; its name is not shown in this view, but it imports email_extractor, so it lives alongside email_extractor.py)
@@ -0,0 +1,102 @@
#!/usr/bin/env python
import requests
from lxml import html
import json                            # for the Google search
import urllib                          # for the Google search
import time
import email_extractor as e           # for the email extraction
from collections import defaultdict   # for de-duplicating addresses
import os.path                         # for output file deletion


class MipCrawler:
    def __init__(self, starting_url, depth):
        self.starting_url = starting_url
        self.depth = depth   # accepted but currently ignored by crawl()
        self.companies = []
        self.company_link = []
        self.email_out = []

    def crawl(self):
        cmpn = self.get_company_from_link(self.starting_url)
        self.companies.extend(cmpn.company_name)
        print "*Finished extracting the company names"

        link = self.get_company_from_google(self.companies)
        self.company_link.extend(link)
        print "*Finished Google search for company names"

        # Extract emails from each company website.
        email = self.get_email_from_link(link)
        self.email_out.extend(email)
        print "*Finished extracting emails"

        self.put_email_to_file(self.email_out)
        print "*Finished writing emails to file <output.txt>"

    def get_company_from_link(self, link):
        start_page = requests.get(link)
        tree = html.fromstring(start_page.text)
        # Company names live in <h3 class="name"><a>...</a></h3> elements.
        name = tree.xpath('//h3[@class="name"]//a/text()')
        return Company(name)

    def get_company_from_google(self, company_list):
        # Look each company up through the (since-deprecated) Google AJAX
        # Search API and keep the first hit's URL.
        link = []
        for cmpn in company_list:
            query = urllib.urlencode({'q': cmpn})
            url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % query
            search_response = urllib.urlopen(url)
            results = json.loads(search_response.read())
            data = results['responseData']
            hits = data.get('results') if data else None
            if hits:   # skip queries that return no results
                link.append(hits[0]['url'].encode("utf-8"))
            time.sleep(35)   # pause between queries to avoid being rate-limited
        return link

    def get_email_from_link(self, link):
        print "Starting email extraction >>>>>>"
        email_link = []
        emails = defaultdict(int)
        for site in link:
            for url in e.crawl_site(site, 10):
                for email in e.grab_email(e.urltext(url)):
                    if email not in emails:
                        email_link.append(email)
                    emails[email] += 1   # record the sighting so later duplicates are skipped
        return email_link

    def put_email_to_file(self, email):
        data = open("output.txt", 'a')
        for addr in email:   # avoid the name `e`, which is the email_extractor alias
            data.write(addr)
            data.write("\n")
        data.close()


class Company:
    def __init__(self, company_name):
        self.company_name = company_name

    #def __str__(self):
    #    return str(self.company_name)


if __name__ == '__main__':
    # Start each run with a fresh output file.
    if os.path.isfile("output.txt"):
        os.remove("output.txt")
    page_urls = ['http://www.my-mip.com/online-database/mipcom/companies/#search=rpp%3D64',
                 'http://www.my-mip.com/online-database/mipcom/companies/#search=rpp%3D64%26startRecord%3D65',
                 ]
    for url in page_urls:
        crawler = MipCrawler(url, 0)
        crawler.crawl()
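Again for reference, a minimal sketch of driving the crawler from another script. This is not part of the commit; the filename mip_crawler.py is an assumption (this view does not show the file's name), and the listing URL is the first entry of page_urls above:

# Hypothetical usage (Python 2); assumes the file above was saved as
# mip_crawler.py next to email_extractor.py.
from mip_crawler import MipCrawler

crawler = MipCrawler('http://www.my-mip.com/online-database/mipcom/companies/#search=rpp%3D64', 0)
crawler.crawl()   # appends any addresses it finds to output.txt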