Skip to content

Commit

Permalink
Revision 3
Browse files Browse the repository at this point in the history
  • Loading branch information
mknithin committed Oct 27, 2015
1 parent c3f3455 commit 698bc0b
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions mip.py
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@ def __init__(self,starting_url,depth):
self.email_out=[]

def crawl(self):

cmpn=self.get_company_from_link(self.starting_url)
self.companies.extend(cmpn.company_name)

@@ -25,7 +26,7 @@ def crawl(self):
link=self.get_company_from_google(self.companies)
#link=['http://www.mediafrance.eu/']
self.company_link.extend(link)
print "*Finished google serach for company names"
print "*Finished google search for company names"
#print(self.company_link)


@@ -59,7 +60,8 @@ def get_company_from_google(self,company_list):
results = json.loads(search_results)
data = results['responseData']
hits = data['results']
link.append((hits[0]['url']).encode("utf-8"))
for h in hits:
link.append((h['url']).encode("utf-8"))
time.sleep(35)
return link

@@ -70,7 +72,11 @@ def get_email_from_link(self,link,depth):
for site in link:
for url in e.crawl_site('%s' %site, depth):
for email in e.grab_email(e.urltext(url)):
if not emails.has_key(email):email_link.append(email)
if not emails.has_key(email):
if('reedmidem.com' in email):
continue
else:
email_link.append(email)
return email_link

def put_email_to_file(self,email):
@@ -170,7 +176,7 @@ def __init__(self,company_name):
batch=0
for url in page_urls:
print "Batch:%d"%(batch+1)
crawler=MipCrawler('%s' %base_url+url,5)
crawler=MipCrawler('%s' %base_url+url,10)
crawler.crawl()
batch+=1

0 comments on commit 698bc0b

Please sign in to comment.