Commit 5a4eea1

mknithin committed Oct 27, 2015 · 0 parents
Showing 2 changed files with 282 additions and 0 deletions.
180 changes: 180 additions & 0 deletions email_extractor.py
@@ -0,0 +1,180 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__thisfile__ = "http://www.jaist.ac.jp/~s1010205/email_extractor/email_extractor.py"
"""
Web Data Extractor: extracts email addresses by crawling a site.
Copyright (C) 2011 KATHURIA Pulkit
Contact: pulkit@jaist.ac.jp
Contributors:
    Open Source Sitemap Generator sitemap_gen by Vladimir Toncar
    http://toncar.cz/opensource/sitemap_gen.html
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import re
import argparse                       # used by the commented-out CLI below
from urllib import urlopen
from collections import defaultdict   # used by the commented-out CLI below
import urllib2
import urlparse
from HTMLParser import HTMLParser
from HTMLParser import HTMLParseError
import robotparser

def getPage(url):
    """Fetch url; return (page, last-modified date, final url), or (None, (0, 0, 0), "") on failure."""
    try:
        f = urllib2.urlopen(url)
        page = f.read()
        date = f.info().getdate('Last-Modified')
        if date is None:
            date = (0, 0, 0)
        else:
            date = date[:3]
        f.close()
        return (page, date, f.url)
    except urllib2.URLError:
        return (None, (0, 0, 0), "")

def joinUrls(baseUrl, newUrl):
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)

def getRobotParser(startUrl):
    rp = robotparser.RobotFileParser()
    robotUrl = urlparse.urljoin(startUrl, "/robots.txt")
    page, date, url = getPage(robotUrl)
    if page is None:
        return None
    rp.parse(page.splitlines())  # parse() expects a list of lines, not a single string
    return rp

class MyHTMLParser(HTMLParser):
    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urlparse.urlsplit(baseUrl)[1]  # netloc in python 2.5
        self.maxUrls = maxUrls
        self.blockExtensions = blockExtensions
        self.robotParser = robotParser

    def hasBlockedExtension(self, url):
        p = urlparse.urlparse(url)
        path = p[2].upper()  # path attribute
        for i in self.blockExtensions:
            if path.endswith(i):
                return 1
        return 0

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return
        if tag.upper() == "BASE":
            if attrs[0][0].upper() == "HREF":
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
        if tag.upper() == "A":
            url = ""
            for attr in attrs:
                if (attr[0].upper() == "REL") and (attr[1].upper().find('NOFOLLOW') != -1):
                    return
                elif (attr[0].upper() == "HREF") and (attr[1].upper().find('MAILTO:') == -1):
                    url = joinUrls(self.baseUrl, attr[1])
            if url == "":
                return
            if urlparse.urlsplit(url)[1] != self.server:  # stay on the same host
                return
            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return
            if (self.robotParser is not None) and not self.robotParser.can_fetch("*", url):
                return
            if url not in self.pageMap:
                self.pageMap[url] = ()  # () marks a page not yet crawled

def getUrlToProcess(pageMap):
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None

def parsePages(startUrl, maxUrls, blockExtensions):
    """Crawl from startUrl; return a map of url -> last-modified date."""
    pageMap = {}
    pageMap[startUrl] = ()
    redirects = []
    robotParser = getRobotParser(startUrl)
    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            # the request was redirected; queue the final URL instead
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page)
                parser.close()
            except (HTMLParseError, UnicodeDecodeError):
                pass
    return pageMap

def grab_email(text):
    """Return the unique email addresses found in an iterable of text lines."""
    found = []
    mailsrch = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
    for line in text:
        found.extend(mailsrch.findall(line))
    u = {}
    for item in found:
        u[item] = 1
    return u.keys()
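# Illustration with hypothetical input: grab_email(["mail jane.doe@example.org",
# "or jane.doe@example.org"]) returns ['jane.doe@example.org'] -- the dict u
# collapses duplicates before the keys are returned.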

def urltext(url):
    viewsource = urlopen(url).readlines()
    return viewsource

def crawl_site(url, limit):
    return parsePages(url, limit, [])  # empty list = no blocked extensions

'''if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Web Email Extractor')
    parser.add_argument('-l', '--limit', action="store", default=100, dest="limit", type=int, help='-l numUrlsToCrawl')
    parser.add_argument('-u', '--url', action="store", dest="url", help='-u http://sitename.com')
    myarguments = parser.parse_args()
    emails = defaultdict(int)
    for url in crawl_site(myarguments.url, myarguments.limit):
        for email in grab_email(urltext(url)):
            if email not in emails:
                print email
            emails[email] += 1
'''
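
When the module is imported (as mip.py does below), the same API can be driven from an interactive session. A minimal sketch, assuming a reachable site at the hypothetical address http://example.com:

    import email_extractor as e
    from collections import defaultdict

    emails = defaultdict(int)
    for url in e.crawl_site('http://example.com', 10):  # visit at most 10 pages
        for email in e.grab_email(e.urltext(url)):
            emails[email] += 1
    print emails.keys()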





102 changes: 102 additions & 0 deletions mip.py
@@ -0,0 +1,102 @@
#!/usr/bin/env python
import requests
from lxml import html
import json      # for the Google search
import urllib    # for the Google search
import time
import email_extractor as e            # for the email extraction
from collections import defaultdict    # for the email extraction
import os.path                          # for output-file deletion

class MipCrawler:
    def __init__(self, starting_url, depth):
        self.starting_url = starting_url
        self.depth = depth
        self.companies = []
        self.company_link = []
        self.email_out = []

    def crawl(self):
        cmpn = self.get_company_from_link(self.starting_url)
        self.companies.extend(cmpn.company_name)
        print "*Finished extracting the company names"

        link = self.get_company_from_google(self.companies)
        self.company_link.extend(link)
        print "*Finished Google search for company names"

        # pull the emails from each company website
        email = self.get_email_from_link(link)
        self.email_out.extend(email)
        print "*Finished extracting emails"

        self.put_email_to_file(self.email_out)
        print "*Finished writing emails to file <output.txt>"

    def get_company_from_link(self, link):
        start_page = requests.get(link)
        tree = html.fromstring(start_page.text)
        name = tree.xpath('//h3[@class="name"]//a/text()')
        cmpn = Company(name)
        return cmpn

    def get_company_from_google(self, company_list):
        link = []
        for cmpn in company_list:
            query = urllib.urlencode({'q': cmpn})
            url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % query
            search_response = urllib.urlopen(url)
            search_results = search_response.read()
            results = json.loads(search_results)
            data = results['responseData']
            hits = data['results']
            link.append((hits[0]['url']).encode("utf-8"))
            time.sleep(35)  # throttle so the search endpoint is not hammered
        return link
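
    # The Google AJAX Search API used above replies with JSON of roughly the form
    #   {"responseData": {"results": [{"url": "...", ...}, ...]}}
    # which is what the responseData/results/url indexing assumes. That API was
    # already deprecated when this was written, so a later port would need a
    # different search backend; this is context, not a guaranteed schema.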

    def get_email_from_link(self, link):
        email_link = []
        print "Starting email extraction >>>>>>"
        emails = defaultdict(int)
        for site in link:
            for url in e.crawl_site(site, 10):
                for email in e.grab_email(e.urltext(url)):
                    if email not in emails:
                        email_link.append(email)
                    emails[email] += 1  # count it so duplicates are skipped next time
        return email_link

    def put_email_to_file(self, email):
        # one address per line; the file is opened in append mode, so repeated
        # crawls accumulate into the same output.txt
        with open("output.txt", 'a') as data:
            for addr in email:
                data.write(addr)
                data.write("\n")



class Company:

    def __init__(self, company_name):
        self.company_name = company_name

if __name__ == '__main__':
    if os.path.isfile("output.txt"):
        os.remove("output.txt")
    page_urls = ['http://www.my-mip.com/online-database/mipcom/companies/#search=rpp%3D64',
                 'http://www.my-mip.com/online-database/mipcom/companies/#search=rpp%3D64%26startRecord%3D65',
                 ]
    for url in page_urls:
        crawler = MipCrawler(url, 0)
        crawler.crawl()
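
Because put_email_to_file appends and each MipCrawler run writes its own list, an address reached through both listing pages can land in output.txt twice. A small post-run sketch (a hypothetical helper, not part of this commit) that rewrites the file with unique lines:

    # dedupe output.txt in place, preserving first-seen order
    seen = set()
    with open("output.txt") as f:
        lines = [l.strip() for l in f if l.strip()]
    with open("output.txt", "w") as f:
        for addr in lines:
            if addr not in seen:
                seen.add(addr)
                f.write(addr + "\n")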
