commit 5a4eea1 (0 parents)
Showing 2 changed files with 282 additions and 0 deletions.
email_extractor.py (per the __thisfile__ URL below)
@@ -0,0 +1,180 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__thisfile__ = "http://www.jaist.ac.jp/~s1010205/email_extractor/email_extractor.py"
""" | ||
Web Data Extractor, extract emails by sitecrawl | ||
Copyright (C) 2011 KATHURIA Pulkit | ||
Contact: pulkit@jaist.ac.jp | ||
Contributors: | ||
Open Source Sitemap Generator sitemap_gen by Vladimir Toncar | ||
http://toncar.cz/opensource/sitemap_gen.html | ||
This program is free software; you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation; either version 3 of the License, or | ||
(at your option) any later version. | ||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
You should have received a copy of the GNU General Public License | ||
along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
""" | ||
# Python 2 standard library: urllib/urllib2, urlparse, HTMLParser, robotparser.
import re
from urllib import urlopen
from collections import defaultdict   # used by the disabled CLI block below
import argparse                       # used by the disabled CLI block below
import urllib2
import urlparse
from HTMLParser import HTMLParser
from HTMLParser import HTMLParseError
import robotparser


def getPage(url):
    """Fetch url; return (contents, last_modified_date, final_url)."""
    try:
        f = urllib2.urlopen(url)
        page = f.read()
        date = f.info().getdate('Last-Modified')
        if date is None:
            date = (0, 0, 0)
        else:
            date = date[:3]   # keep (year, month, day)
        f.close()
        return (page, date, f.url)
    except urllib2.URLError:
        return (None, (0, 0, 0), "")


def joinUrls(baseUrl, newUrl):
    # Strip any #fragment, then resolve the link relative to the base URL.
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)


def getRobotParser(startUrl):
    rp = robotparser.RobotFileParser()
    robotUrl = urlparse.urljoin(startUrl, "/robots.txt")
    page, date, url = getPage(robotUrl)
    if page is None:
        return None
    rp.parse(page.splitlines())   # parse() expects a sequence of lines, not one string
    return rp


class MyHTMLParser(HTMLParser):
    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urlparse.urlsplit(baseUrl)[1]   # netloc
        self.maxUrls = maxUrls
        self.blockExtensions = blockExtensions
        self.robotParser = robotParser

    def hasBlockedExtension(self, url):
        p = urlparse.urlparse(url)
        path = p[2].upper()   # path component
        for i in self.blockExtensions:
            if path.endswith(i):
                return 1
        return 0

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return
        if tag.upper() == "BASE":
            if attrs and attrs[0][0].upper() == "HREF":   # guard against a bare <base>
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
        if tag.upper() == "A":
            url = ""
            for attr in attrs:
                if attr[0].upper() == "REL" and attr[1].upper().find('NOFOLLOW') != -1:
                    return   # honor rel="nofollow"
                elif attr[0].upper() == "HREF" and attr[1].upper().find('MAILTO:') == -1:
                    url = joinUrls(self.baseUrl, attr[1])
            if url == "":
                return
            if urlparse.urlsplit(url)[1] != self.server:
                return   # stay on the starting host
            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return
            if (self.robotParser is not None) and not self.robotParser.can_fetch("*", url):
                return
            if url not in self.pageMap:
                self.pageMap[url] = ()   # empty tuple marks "not yet crawled"


def getUrlToProcess(pageMap):
    # Return any URL not yet crawled (still mapped to the empty tuple).
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None


def parsePages(startUrl, maxUrls, blockExtensions):
    pageMap = {}
    pageMap[startUrl] = ()
    redirects = []
    robotParser = getRobotParser(startUrl)

    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            # The request was redirected; crawl the target instead.
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page)
                parser.close()
            except (HTMLParseError, UnicodeDecodeError):
                pass   # skip pages that cannot be parsed or decoded
    return pageMap


def grab_email(text):
    # `text` is an iterable of lines; returns the unique addresses found.
    mailsrch = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
    found = []
    for line in text:
        found.extend(mailsrch.findall(line))
    return list(set(found))   # de-duplicate

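# Illustrative example (not from the source): passing the single line
#   ["contact: john.doe@example.com or sales@example.com"]
# to grab_email() yields ['john.doe@example.com', 'sales@example.com'],
# in arbitrary order because the de-duplication is set-based.
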
def urltext(url):
    # Return the page at `url` as a list of lines, ready for grab_email().
    return urlopen(url).readlines()


def crawl_site(url, limit):
    # No blocked extensions: parsePages() expects a sequence of extensions
    # here, not the string 'None', which would be tested character by character.
    return parsePages(url, limit, [])


# Command-line interface (disabled in this commit):
'''if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Web Email Extractor')
    parser.add_argument('-l', '--limit', action="store", default=100, dest="limit", type=int, help='-l numUrlsToCrawl')
    parser.add_argument('-u', '--url', action="store", dest="url", help='-u http://sitename.com')
    myarguments = parser.parse_args()
    emails = defaultdict(int)
    for url in crawl_site(myarguments.url, myarguments.limit):
        for email in grab_email(urltext(url)):
            if email not in emails: print email
            emails[email] += 1
'''
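For reference, a minimal usage sketch of the module above. This snippet is not part of the commit; it assumes the module is importable as email_extractor, reuses the placeholder URL from the CLI help text, and picks an arbitrary 10-page crawl limit:

# Hypothetical usage of email_extractor (Python 2):
import email_extractor

for url in email_extractor.crawl_site('http://sitename.com', 10):
    for address in email_extractor.grab_email(email_extractor.urltext(url)):
        print address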
(second file; its name is not shown in this view, but it imports email_extractor, so it lives alongside email_extractor.py)
@@ -0,0 +1,102 @@
#!/usr/bin/env python
import requests
from lxml import html
import json                            # for the Google search
import urllib                          # for the Google search
import time
import email_extractor as e           # for the email extraction
from collections import defaultdict   # for de-duplicating addresses
import os.path                         # for output file deletion


class MipCrawler:
    def __init__(self, starting_url, depth):
        self.starting_url = starting_url
        self.depth = depth   # accepted but currently ignored by crawl()
        self.companies = []
        self.company_link = []
        self.email_out = []

    def crawl(self):
        cmpn = self.get_company_from_link(self.starting_url)
        self.companies.extend(cmpn.company_name)
        print "*Finished extracting the company names"

        link = self.get_company_from_google(self.companies)
        self.company_link.extend(link)
        print "*Finished Google search for company names"

        # Extract emails from each company website.
        email = self.get_email_from_link(link)
        self.email_out.extend(email)
        print "*Finished extracting emails"

        self.put_email_to_file(self.email_out)
        print "*Finished writing emails to file <output.txt>"

    def get_company_from_link(self, link):
        start_page = requests.get(link)
        tree = html.fromstring(start_page.text)
        # Company names live in <h3 class="name"><a>...</a></h3> elements.
        name = tree.xpath('//h3[@class="name"]//a/text()')
        return Company(name)

    def get_company_from_google(self, company_list):
        # Look each company up through the (since-deprecated) Google AJAX
        # Search API and keep the first hit's URL.
        link = []
        for cmpn in company_list:
            query = urllib.urlencode({'q': cmpn})
            url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % query
            search_response = urllib.urlopen(url)
            results = json.loads(search_response.read())
            data = results['responseData']
            hits = data.get('results') if data else None
            if hits:   # skip queries that return no results
                link.append(hits[0]['url'].encode("utf-8"))
            time.sleep(35)   # pause between queries to avoid being rate-limited
        return link

    def get_email_from_link(self, link):
        print "Starting email extraction >>>>>>"
        email_link = []
        emails = defaultdict(int)
        for site in link:
            for url in e.crawl_site(site, 10):
                for email in e.grab_email(e.urltext(url)):
                    if email not in emails:
                        email_link.append(email)
                    emails[email] += 1   # record the sighting so later duplicates are skipped
        return email_link

    def put_email_to_file(self, email):
        data = open("output.txt", 'a')
        for addr in email:   # avoid the name `e`, which is the email_extractor alias
            data.write(addr)
            data.write("\n")
        data.close()


class Company:
    def __init__(self, company_name):
        self.company_name = company_name

    #def __str__(self):
    #    return str(self.company_name)


if __name__ == '__main__':
    # Start each run with a fresh output file.
    if os.path.isfile("output.txt"):
        os.remove("output.txt")
    page_urls = ['http://www.my-mip.com/online-database/mipcom/companies/#search=rpp%3D64',
                 'http://www.my-mip.com/online-database/mipcom/companies/#search=rpp%3D64%26startRecord%3D65',
                 ]
    for url in page_urls:
        crawler = MipCrawler(url, 0)
        crawler.crawl()
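Again for reference, a minimal sketch of driving the crawler from another script. This is not part of the commit; the filename mip_crawler.py is an assumption (this view does not show the file's name), and the listing URL is the first entry of page_urls above:

# Hypothetical usage (Python 2); assumes the file above was saved as
# mip_crawler.py next to email_extractor.py.
from mip_crawler import MipCrawler

crawler = MipCrawler('http://www.my-mip.com/online-database/mipcom/companies/#search=rpp%3D64', 0)
crawler.crawl()   # appends any addresses it finds to output.txt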