
adding midem.py
mknithin committed Nov 25, 2015
1 parent 80f6668 commit 9e92ee9
Showing 1 changed file with 130 additions and 0 deletions.
130 changes: 130 additions & 0 deletions midem.py
@@ -0,0 +1,130 @@
#!/usr/bin/env python
import requests
from lxml import html
import json #for the google search
import urllib # for the google search
import time
import email_extractor as e #for the email extract
from collections import defaultdict #for the email extract
import os.path #for output file deletion

class MipCrawler:
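    """Scrapes company names from the MIDEM online database, looks each
    name up via Google, and harvests contact emails from the hits."""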
    def __init__(self, starting_url, depth, start_record):
        self.starting_url = starting_url
        self.depth = depth
        self.companies = []
        self.company_link = []
        self.email_out = []
        self.start_record = start_record

    def crawl(self):
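        """Run the pipeline for one batch: scrape names, then Google them."""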

        cmpn = self.get_company_from_link(self.starting_url)
        self.companies.extend(cmpn.company_name)

        print "*Finished extracting the company names"
        #print(self.companies)

        self.get_company_from_google(self.companies)
        # Old single-pass pipeline, kept for reference: collect every
        # link first, then extract emails in one go.
        '''
        #link=['http://www.mediafrance.eu/']
        self.company_link.extend(link)
        print "*Finished google search for company names"
        #print(self.company_link)
        #to get the email from the company website
        #print(self.company_link)
        email=self.get_email_from_link(self.company_link,self.depth)
        self.email_out.extend(email)
        print "*Finished extracting emails"
        #print(self.email_out)
        self.put_email_to_file(self.email_out)
        print "*Finished writing emails to file <output.txt>"
        '''

    def get_company_from_link(self, link):
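        """Extract the company names from a listing page via XPath."""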
        start_page = requests.get(link)
        tree = html.fromstring(start_page.text)
        # Company names sit in <h3 class="name"><a>...</a></h3> elements.
        name = tree.xpath('//h3[@class="name"]//a/text()')
        #for item in name:
        #    print(item)
        cmpn = Company(name)
        return cmpn

    def get_company_from_google(self, company_list):
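        """Google each company name and mine the result pages for emails.

        Note: the Google AJAX web search API used here was deprecated long
        before this was written, so responseData may come back as None.
        """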
        link = []
        cmpn_no = 1
        #loc_list=['"MCFIVA (THAILAND) CO.,LTD."','"MIR" INTERGOVERNMENTAL TV AND RADIO.']
        for cmpn in company_list:
            print "--------------------------------------------------------"
            print "Google Search for : %d.%s(track:%d)" % (cmpn_no, cmpn, (self.start_record + (cmpn_no - 1)))
            query = urllib.urlencode({'q': cmpn.encode("utf-8")})
            url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % query
            search_response = urllib.urlopen(url)
            search_results = search_response.read()
            results = json.loads(search_results)
            if results is not None and results.get('responseData') is not None:
                hits = results['responseData']['results']
                for h in hits:
                    #link.append((hits[0]['url']).encode("utf-8"))
                    #link.append((hits[1]['url']).encode("utf-8"))
                    link = (h['url']).encode("utf-8")
                    # Skip aggregator and social sites; they won't carry the company's own contact page.
                    if ("imdb" in link or "facebook" in link or "youtube" in link
                            or "linkedin" in link or "wikipedia" in link or "my-mip" in link):
                        continue
                    print link
                    email = self.get_email_from_link(link, self.depth)
                    if email:
                        self.put_email_to_file(email)
            cmpn_no += 1

    def get_email_from_link(self, link, depth):
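        """Crawl a site up to `depth` links deep and collect unique emails."""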
        email_link = []
        print "Extracting emails from %s" % link
        try:
            seen = defaultdict(int)
            for url in e.crawl_site('%s' % link, depth):
                for email in e.grab_email(e.urltext(url)):
                    if email in seen:
                        continue
                    seen[email] += 1
                    # Drop the organiser's own domain and image names that match the pattern.
                    if 'reedmidem.com' in email or '.png' in email or '.jpg' in email:
                        continue
                    email_link.append(email)
        except Exception:
            print "Socket connection error"
            email_link = []
        return email_link

    def put_email_to_file(self, email):
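        """Append the collected addresses to output.txt, one per line."""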
        data = open("output.txt", 'a')
        for addr in email:  # 'addr', not 'e': 'e' is the email_extractor module alias
            data.write(addr)
            data.write("\n")
        data.close()



class Company:
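    """Thin wrapper around the list of scraped company names."""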

    def __init__(self, company_name):
        self.company_name = company_name

    #def __str__(self):
    #    return str(self.company_name)

if __name__ == '__main__':
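    # The listing serves 64 records per page (rpp=64), so step
    # start_record by 64 until the whole database (< 2203 records) is covered.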
    if os.path.isfile("output.txt"):
        os.remove("output.txt")
    base_url = 'http://my.midem.com/en/online-database/companies/?rpp=64&startRecord='
    batch = 0
    start_record = 1
    while start_record < 2203:
        print "Batch:%d Start_record:%d" % ((batch + 1), start_record)
        crawler = MipCrawler(base_url + str(start_record), 10, start_record)
        crawler.crawl()
        batch += 1
        start_record += 64
