#!/usr/bin/env python
# -*- coding: utf-8 -*-
__thisfile__ = "http://www.jaist.ac.jp/~s1010205/email_extractor/email_extractor.py"
"""
Web Data Extractor, extract emails by sitecrawl
Copyright (C) 2011 KATHURIA Pulkit
Contact: pulkit@jaist.ac.jp
Contributors:
Open Source Sitemap Generator sitemap_gen by Vladimir Toncar
http://toncar.cz/opensource/sitemap_gen.html
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import sys
import re
import commands
from urllib import urlopen
from collections import defaultdict
import argparse
import string
import urllib2
import urlparse
from HTMLParser import HTMLParser
from HTMLParser import HTMLParseError
import robotparser
import httplib
def getPage(url):
    # Fetch a URL and return (page contents, Last-Modified date, final URL);
    # on failure return (None, (0, 0, 0), "").
    try:
        f = urllib2.urlopen(url)
        page = ""
        for i in f.readlines():
            page += i
        date = f.info().getdate('Last-Modified')
        if date is None:
            date = (0, 0, 0)
        else:
            date = date[:3]
        f.close()
        return (page, date, f.url)
    except (urllib2.URLError, httplib.IncompleteRead):
        pass
    return (None, (0, 0, 0), "")
def joinUrls(baseUrl, newUrl):
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)

def getRobotParser(startUrl):
    rp = robotparser.RobotFileParser()
    robotUrl = urlparse.urljoin(startUrl, "/robots.txt")
    page, date, url = getPage(robotUrl)
    if page is None:
        return None
    rp.parse(page)
    return rp
class MyHTMLParser(HTMLParser):

    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urlparse.urlsplit(baseUrl)[1]  # netloc in python 2.5
        self.maxUrls = maxUrls
        self.blockExtensions = blockExtensions
        self.robotParser = robotParser

    def hasBlockedExtension(self, url):
        p = urlparse.urlparse(url)
        path = p[2].upper()  # path attribute
        for i in self.blockExtensions:
            if path.endswith(i):
                return 1
        return 0

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return
        if tag.upper() == "BASE":
            if attrs[0][0].upper() == "HREF":
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
        if tag.upper() == "A":
            url = ""
            for attr in attrs:
                if (attr[0].upper() == "REL") and (attr[1].upper().find('NOFOLLOW') != -1):
                    return
                elif (attr[0].upper() == "HREF") and (attr[1].upper().find('MAILTO:') == -1):
                    url = joinUrls(self.baseUrl, attr[1])
            if url == "":
                return
            if urlparse.urlsplit(url)[1] != self.server:
                return
            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return
            if (self.robotParser is not None) and not self.robotParser.can_fetch("*", url):
                return
            if url not in self.pageMap:
                self.pageMap[url] = ()
def getUrlToProcess(pageMap):
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None
def parsePages(startUrl, maxUrls, blockExtensions):
    # Crawl from startUrl: repeatedly pick a not-yet-fetched URL, download it,
    # and let MyHTMLParser add newly discovered same-server links to pageMap.
    pageMap = {}
    pageMap[startUrl] = ()
    redirects = []
    robotParser = getRobotParser(startUrl)
    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        #print " ", url
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            #print newUrl
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page)
                parser.close()
            except HTMLParseError:
                pass
            except UnicodeDecodeError:
                pass
    return pageMap
def grab_email(text):
    # Scan each line for email-like strings and return the unique matches
    # (a dict is used as a set to drop duplicates).
    found = []
    mailsrch = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
    for line in text:
        found.extend(mailsrch.findall(line))
    u = {}
    for item in found:
        u[item] = 1
    return u.keys()
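# A hypothetical illustration (not from the original file) of what grab_email
# returns for an assumed list of lines; the order of the result may vary
# because it comes from dict keys:
#
#   grab_email(["contact alice.smith@example.org or bob@example.org",
#               "again: bob@example.org"])
#   # -> ['alice.smith@example.org', 'bob@example.org'] (order not guaranteed)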
def urltext(url):
    viewsource = urlopen(url).readlines()
    return viewsource
def crawl_site(url, limit):
    # An empty tuple means no blocked file extensions.
    return parsePages(url, limit, ())
'''
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Web Email Extractor', add_help=True)
    parser.add_argument('-l', '--limit', action="store", default=100, dest="limit", type=int, help='-l numUrlsToCrawl')
    parser.add_argument('-u', '--url', action="store", dest="url", help='-u http://sitename.com')
    myarguments = parser.parse_args()
    emails = defaultdict(int)
    for url in crawl_site(myarguments.url, myarguments.limit):
        for email in grab_email(urltext(url)):
            if email not in emails:
                print email
            emails[email] += 1
'''
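# A minimal usage sketch (an assumption, not part of the original program): with
# this file importable as email_extractor, the crawl-and-extract pipeline could
# be driven much like the commented-out __main__ block above, e.g.:
#
#   import email_extractor
#   for url in email_extractor.crawl_site("http://example.com", 50):  # placeholder site and page limit
#       for email in email_extractor.grab_email(email_extractor.urltext(url)):
#           print email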