From dd08e02235177c61679e885191c77d8ee1f12997 Mon Sep 17 00:00:00 2001 From: nithin-mk Date: Mon, 2 Nov 2015 20:04:30 +0530 Subject: [PATCH] Revision 9 Socket handling --- cannes.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/cannes.py b/cannes.py index 3ce94aa..d61aa96 100755 --- a/cannes.py +++ b/cannes.py @@ -53,7 +53,9 @@ def get_company_from_file(self,file_name): def get_company_from_google(self,company_list): #link=[] #loc_list=['"MCFIVA (THAILAND) CO.,LTD."','"MIR" INTERGOVERNMENTAL TV AND RADIO.'] + black_list=['http://www.imdb.com','https://www.facebook.com',' http://www.youtube.com/','https://www.linkedin.com/',' https://en.wikipedia.org'] for cmpn in company_list: + print "Searching emails for : %s" %cmpn query = urllib.urlencode({'q': cmpn}) url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % query search_response = urllib.urlopen(url) @@ -66,6 +68,8 @@ def get_company_from_google(self,company_list): #print h['url'] #link.append((h['url']).encode("utf-8")) link=(h['url']).encode("utf-8") + if link in black_list: + continue print link email=self.get_email_from_link(link,self.depth) self.put_email_to_file(email) @@ -76,12 +80,15 @@ def get_email_from_link(self,link,depth): print "Extracting emails >>>>>>" emails = defaultdict(int) for url in e.crawl_site('%s' %link, depth): - for email in e.grab_email(e.urltext(url)): - if not emails.has_key(email): - if('reedmidem.com' in email): - continue - else: - email_link.append(email) + try: + for email in e.grab_email(e.urltext(url)): + if not emails.has_key(email): + if('reedmidem.com' in email): + continue + else: + email_link.append(email) + except: + continue return email_link def put_email_to_file(self,email):