threataggregator.py

#!/bin/python
#
#
# The MIT License (MIT)
#
# Copyright (c) 2016 JP Senior jp.senior@gmail.com
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# "This product includes GeoLite data created by MaxMind, available from
# http://maxmind.com/"
#
# Additional thanks to Seth Golub for heatmap - a fun little diversion with the resultant data.
# Copyright 2010 Seth Golub
# http://www.sethoscope.net/heatmap/
#
#
# RepDB
# This program scrapes reputation database information from a number of internet sources
# and creates CEF entries to update, add, or remove new, expired, and similar information
# from free threat sources on the Internet.
# These CEF entries are then forwarded via a simple syslog to a centralized server for later
# SIEM analysis (eg Splunk, Arcsight, Alienvault)
# Correlating these reputation entries agaisnt firewall, web proxy, IPS/IDS, etc logs enables
# an administrator to drill down into problem areas in their network.
#
# Some mechanisms are in place to assist with geolocation of asset information 
# unique IP addresses.
#

import sys
import difflib
import re
import os
import csv
import socket
import time
import datetime
import geoip2.database
import geoip2.errors
import gzip
import maxminddb.errors
import netaddr
import json
import requests

from feeds import feeds
import config

FACILITY = dict(kern=0, user=1, mail=2, daemon=3, auth=4, syslog=5, lpr=6, news=7, uucp=8, cron=9, authpriv=10, ftp=11,
                local0=16, local1=17, local2=18, local3=19, local4=20, local5=21, local6=22, local7=23)

LEVEL = dict(emerg=0, alert=1, crit=2, err=3, warning=4, notice=5, info=6, debug=7)

# Regular Expression for dotted-quad IP addresses with or without CIDR suffixes
re_ipcidr = (r'^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.)'
             '{3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])'
             '((/([0-9]|[1-2][0-9]|3[0-2]){0,2})?)')

if not re.match(re_ipcidr, config.host):
    raise Exception(ValueError, "Syslog host %s is not valid" % config.host)


def download_file(url, filename):
    """
    :param url: URL of file to download
    :param filename: Filename to write the result object to
    :return:
    """
    r = requests.get(url, stream=True, proxies=config.proxies, verify=config.verifySSL)

    with open(filename, 'wb') as fd:
        for chunk in r.iter_content(1024):
            fd.write(chunk)


class RepDB(list):
    """ Reputation database class to store entries

    """

    def __init__(self):
        super(RepDB, self).__init__()
        self.entries = []

    def add(self, ip, source, description, priority=1, reputation=1, latitude=0.000000, longitude=0.000000, city='',
            country=''):
        """

        :param string ip: IP is a dotted quad x.x.x.x or CIDR x.x.x.x/yy
        :param string source: Source for ThreatDB entry URL
        :param string description: Description of this individual entry
        :param int priority: Priority of threat
        :param int reputation: Reputation of threat
        :param float latitude: Latitude of threat entry
        :param float longitude: Longitude of threat entry
        :param string city: City of located threat
        :param string country: Country of located threat
        :return:
        """

        if not re.match(re_ipcidr, ip):
            # How did we get here?
            raise Exception(ValueError, "IP %s is not valid" % ip)

        try:
            reader = get_geo_db()

            # Use netaddr to convert CIDR to a network ID, allowing us to extract cities properly.
            response = reader.city(netaddr.IPNetwork(ip).network)
            if not city:
                city = response.city.name
            if not country:
                country = response.country.name
            if not latitude:
                if response.location.latitude:
                    latitude = response.location.latitude
                else:
                    latitude = 0
            if not longitude:
                if response.location.longitude:
                    longitude = response.location.longitude
                else:
                    longitude = 0
            # Close the GeoDB reader
            reader.close()

        # Not all IP addresses will be in the Maxmind database
        except geoip2.errors.AddressNotFoundError:
            pass

        except maxminddb.errors.InvalidDatabaseError as e:
            raise "Invalid GeoIP database %s" % e

        # Signed degrees format for latitude and longitude
        # Represented DDD.ddd with maximum 8 decimal places.
        # Longitudes range from -90 to 90
        # Latitudes range from -180 to 180
        if latitude < -90.0 or latitude > 90.0:
            latitude = 0
        if longitude < -180.0 or longitude > 180.0:
            longitude = 0
        # Translate CIDR to a list of IP addresses for Arcsight
        for i in netaddr.IPNetwork(ip):
            self.entries.append(
                {'ip': i, 'source': source, 'description': description, 'priority': priority,
                 'reputation': reputation, 'latitude': latitude, 'longitude': longitude, 'city': city,
                 'country': country})

    def __count__(self):
        """ Returns count of RepDB entries.

        :return:
        """
        return len(self)

    def __iter__(self):
        """ Custom iterator to use entries instead of the object itself
        :return:
        """
        for e in self.entries:
            # print("Entry: {}".format(e))
            yield e

    def __getitem__(self, item):
        """
        :param int item: Integer index of entry item
        :return: Returns selected item slice
        """
        return self.entries[item]

    def __len__(self):
        return len(self.entries)

    def search(self, ip, top=False):
        """ Allows you to search the reputation database on a destination IP address
            If found, returns a list of RepDB entries containing information about IP.
            Specifying top=true only returns the first entry

        :param ip: IP to search RepDB for
        :param BOOL top: Returns the 'first' entry if true or 'all' matching entries if false
        :return: Returns a list of results, or False if no results
        """
        results = []
        for entry in self:
            if netaddr.IPNetwork(ip).network in netaddr.IPNetwork(entry['ip']):
                if top:
                    results.append(entry)
                    return results
                results.append(entry)
        # list of results
        return results


class BuildCompare:
    """ Uses difflib.SequenceMatcher to compare list 'a' and list 'b' and return results accordingly
    c = buildcompare(list(a),list(b))
    c.add() returns a list of items 'new' to add
    c.delete() returns a list of items 'old' to remove
    end state is to send CEF-based syslog packets to Arcsight for adding and removing threat events from a feed
    """

    def __init__(self, old, new):
        """
        :param list old: List of 'old' lines to compare to new
        :param list new: List of 'new' lines to compare to old
        :return:
        """

        # Compares best when items are sorted
        old.sort()
        new.sort()
        self.add = []
        self.delete = []
        self.equal = []
        s = difflib.SequenceMatcher(None, old, new)
        for tag, i1, i2, j1, j2 in s.get_opcodes():

            # This helps to understand what we're adding and removing. From difflib documentation
            if config.debug:
                print("%7s a[%d:%d] (%s) b[%d:%d] (%s)" % (tag, i1, i2, old[i1:i2], j1, j2, new[j1:j2]))

            # replace takes out items from list A[i1:i2] and adds from list B[j1:j2]
            if tag == 'replace':
                for i in old[i1:i2]:
                    self.delete.append(i)
                for i in new[j1:j2]:
                    self.add.append(i)
            # delete records are not seen in list b. Remove items from list a[i1:i2]
            elif tag == 'delete':
                for i in old[i1:i2]:
                    self.delete.append(i)
            # insert records are not seen in list a. Add items from list b.
            elif tag == 'insert':
                for i in new[j1:j2]:
                    self.add.append(i)
            elif tag == 'equal':
                for i in old[i1:i2]:
                    self.equal.append(i)

    def add(self):
        """ Returns a list of items to add

        :return: Returns a list of items to ADD
        """
        return self.add

    def delete(self):
        """ Returns a list of items to delete

        :return: Returns a list of items to delete
        """
        return self.delete

    def equal(self):
        """ Returns a list of unchanged items

        :return:Returns a list of unchanged items
        """
        return self.equal


def syslog(message):
    """ Send a UDP syslog packet

    :param string message: Sends a raw message to syslog
    :return:
    """
    level = LEVEL['info']
    facility = FACILITY['local0']

    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    # We have to encode as UTF8 for non-ascii characters.

    data = u'<%d>%s' % (level + facility * 8, message)
    s.sendto(data.encode('utf-8'),(config.host, config.port))
    s.close()


def get_geo_db():
    """ Finds and caches a maxmind database for GeoIP2
    from http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz

    :return: geoip2.database.Reader object
    """

    # Pull everything off the internet if it isn't cached
    geofilename = 'cache/GeoLite2-City.mmdb'
    url = 'http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz'
    gzipfile = 'cache/GeoLite2-City.mmdb.gz'

    if os.path.isfile(geofilename):
        try:
            reader = geoip2.database.Reader(geofilename)
        except ValueError as e:
            raise Exception("Error accessing GeoLite database: %s" % e)
        except maxminddb.errors.InvalidDatabaseError as e:
            raise Exception("Invalid DB error %s - %s " % (geofilename, e))
        return reader
    else:

        try:
            print("Maxmind database not cached. Attempting to pull from {0}".format(url))
            download_file(url, gzipfile)
        except requests.ConnectionError:
            e = sys.exc_info()[0]
            print('Connection interrupted while downloading Maxmind Database: {0} - {1}'.format(url, e))
        except IOError:
            e = sys.exc_info()[0]
            print('Error downloading Maxmind Database: %s - %s'.format(url, e))

        # Open Gzip
        f = gzip.open(gzipfile, 'rb')
        maxmind = f.read()
        f.close()

        # Re-Write uncompressed format
        f = open(geofilename, 'wb')
        f.write(maxmind)
        f.close()

        # Wrap up
        reader = geoip2.database.Reader(geofilename)
        return reader


def emergingthreat(url, data):
    """ Builds an emergingthreat.net specific Block IP list with special parsing mechanisms

    Emergingthreat.net Block-IP list is formatted in a special way so we have to parse it
    differently.  Each category (description) is separated by two whitespaces and a hash
    and entries following are individual reputation entries.
    EG:

    #header
    #header

    #Spam

    1.2.3.4
    2.3.4.5

    #Malware

    9.9.9.9
    10.10.10.10

    :param string url: URL for Emergingthreat.net to include in db entry
    :param list data: list() of lines to parse
    :return: RepDB: A RepDB() instance containing threat information
    """
    repdb = RepDB()
    re_section = r'^#(.*)'
    iptype = ''
    for line in data:
        typematch = re.match(re_section, line)
        ipmatch = re.match(re_ipcidr, line)
        if typematch:
            # Get rid of extra whitespace. Match group '1'.
            iptype = ' '.join(typematch.group(1).split())
        elif ipmatch:
            # Spamhaus are too big and too annoying.  They break RepDB later when we parse out CIDR
            if iptype != 'Spamhaus DROP Nets':
                ip = ipmatch.group(0)
                repdb.add(ip, url, iptype)
    return repdb


def ipfeed(url, description, data):
    """ Builds reputation DB based on one IP per line
    Only imports valid IPs

    Format is one IP per line with no further details. EG:

    1.2.3.4
    3.4.5.2
    9.9.9.9

    :param string url: URL for generic IP feed to include in DB entry
    :param string description: Description of DB entry
    :param list data: List of lines to parse
    :return: RepDB: A RepDB() instance containing threat information
    """
    repdb = RepDB()
    for line in data:
        ipmatch = re.match(re_ipcidr, line)
        if ipmatch:
            ip = ipmatch.group(0)
            repdb.add(ip, url, description)
    return repdb


def sslblacklist(url, data):
    """ Parse SSLBlacklist CSV entries
    Format is:
    ip,port,description


    :param string url: URL for generic IP feed to include in DB entry
    :param list data: List of lines to parse
    :return:RepDB: A RepDB() instance containing threat information
    """
    repdb = RepDB()
    reader = csv.reader(data, delimiter=',')

    for line in reader:
        ipmatch = re.match(re_ipcidr, line[0])
        if ipmatch:
            ip = ipmatch.group(0)
            repdb.add(ip, url, line[2])
    return repdb


def autoshun(url, data):
    """ Parse Autoshun CSV entries
    Format is:
    ip,port,description


    :param string url: URL for generic IP feed to include in DB entry
    :param list data: List of lines to parse
    :return: RepDB: A RepDB() instance containing threat information
    """
    repdb = RepDB()
    reader = csv.reader(data, delimiter=',')

    for line in reader:
        ipmatch = re.match(re_ipcidr, line[0])
        if ipmatch:
            ip = ipmatch.group(0)
            repdb.add(ip, url, line[2])
    return repdb


def alienvault(url, data):
    """ Parse alienvault reputation db entries. These are pretty complicated so a simpler parser is used.

    Format is:
    #<IP>#<PRIORITY>#<CONFIDENCE>#<Description>#<COUNTRY>#<CITY>#<LATITUDE>,<LONGITUDE>#??

    :param string url: URL for generic IP feed to include in DB entry
    :param list data: List of lines to parse
    :return: RepDB: A RepDB() instance containing threat information
    """
    repdb = RepDB()

    def check_reputation_format(ln):
        r = re.compile("^[+-]?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}#\d\d?#\d\d?#.*#.*#.*#.*#.*$")
        if ln != "":
            if not r.match(ln):
                return False
        return True

    for d in data:
        if check_reputation_format(d) and d != "":
            if d[0] == "-":
                continue
            if d[0] == "+":
                d = d[1:]
            fs = d.split("#")
            if len(fs) == 8:
                # Check parameters
                # Some variables are unsed; Alienvault includes its own repDB entry for maxmind lookups
                # but we prefer to do it ourselves
                ip = fs[0]
                rel = int(fs[1])
                prio = int(fs[2])
                desc = fs[3]

                repdb.add(ip, url, desc, priority=prio, reputation=rel)
    return repdb


def build_db(dbtype, url, description, db_add, db_del, db_equal):
    """ Builds reputation database entry based on type
    Assumes default type 'ipfeed'

    :param string dbtype: User-specified 'type' for feed name. Constructs filename
    :param string url: URLLib http GET url to obtain threat entries
    :param string description: User description of threat feed
    :param db_add: Entry database for 'new' items
    :param db_del Entry database for 'new' items
    :param db_equal: Entry database for 'new' items
    :return:
    """

    old_filename = 'cache/%s.txt' % dbtype
    new_filename = 'cache/%s.txt.compare_add' % dbtype

    if not os.path.exists('cache'):
        os.makedirs('cache')

    try:
        download_file(url, new_filename)
    except requests.ConnectionError as e:
        print('Connection interrupted while downloading: {0} - {1}'.format(url, e))
        # If there's a problem just keep going.
        return

    except IOError:
        e = sys.exc_info()[0]
        print('Error downloading: {0} - {1}'.format(url, e))
        raise IOError('Something happened {0}'.format(e))

    if os.path.isfile(new_filename):
        with open(new_filename, 'r') as fn:
            compare_add = fn.read().splitlines()
    else:
        compare_add = []

    if os.path.isfile(old_filename):
        with open(old_filename, 'r') as fn:
            compare_delete = fn.read().splitlines()
    else:
        compare_delete = []
    print('Comparing {0} downloaded to {1} cached lines'.format(len(compare_add), len(compare_delete)))

    compare = BuildCompare(compare_delete, compare_add)
    compare_delete = compare.delete
    compare_add = compare.add
    compare_equal = compare.equal
    print("{0} new, {1} deleted, {2} unchanged lines".format(len(compare_add), len(compare_delete),
                                                             len(compare_equal)))

    if dbtype == 'alienvault':
        db_del.append(alienvault(url, compare_delete))
        db_add.append(alienvault(url, compare_add))
        db_equal.append(alienvault(url, compare_equal))
    elif dbtype == 'emerging-block':
        db_del.append(emergingthreat(url, compare_delete))
        db_add.append(emergingthreat(url, compare_add))
        db_equal.append(emergingthreat(url, compare_equal))

    elif dbtype == 'ssl-blacklist':
        db_del.append(sslblacklist(url, compare_delete))
        db_add.append(sslblacklist(url, compare_add))
        db_equal.append(sslblacklist(url, compare_equal))
    elif dbtype == 'ssl-blacklist':
        db_del.append(autoshun(url, compare_delete))
        db_add.append(autoshun(url, compare_add))
        db_equal.append(autoshun(url, compare_equal))
    else:
        db_del.append(ipfeed(url, description, compare_delete))
        db_add.append(ipfeed(url, description, compare_add))
        db_equal.append(ipfeed(url, description, compare_equal))

    if not os.path.exists('cache'):
        os.makedirs('cache')

    if os.path.isfile(old_filename):
        try:
            os.remove(old_filename)
        except (IOError, OSError) as e:
            raise OSError('Could not remove file: {0}- {1}'.format(old_filename, e))
    try:
        os.rename(new_filename, old_filename)
    except (IOError, OSError) as e:
        raise OSError('Could not rename {0} to {1} - {2}'.format(new_filename, old_filename, e))


def printjson(action, entry):
    """ Prints a JSON-formatted object for an action and entry
   
    :param string action:  add remove or delete
    :param entry: One RepDB entry to print JSON output for
    :return: null
    """
    outjson = json.dumps({
        action: {
            'ip': str(entry['ip']),
            'source': entry['source'],
            'description': entry['description'],
            'priority': entry['priority'],
            'reputation': entry['reputation'],
            'city': entry['city'],
            'country': entry['country'],
            'latitude': entry['latitude'],
            'longitude': entry['longitude'],
        }
    })
    print(outjson)


def buildcef(action, entry):
    """ Builds a CEF-formatted string based on reputation entry from RepDB

    :param string action:  add remove or delete
    :param entry: One RepDB entry to parse
    :return: Returns a CEF-formatted string with timestamp
    """
    ip = entry['ip']
    source = entry['source']
    description = entry['description']
    priority = entry['priority']
    reputation = entry['reputation']
    city = entry['city']
    country = entry['country']
    latitude = entry['latitude']
    longitude = entry['longitude']

    timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%b %d %Y %H:%M:%S')
    return ('%s %s CEF:0|%s|%s|1.0|100|Threat Entry %s|1|act=%s reason=%s src=%s '
            'cs1Label=Source cs1=%s cs2Label=City cs2=%s cs3Label=Country cs3=%s '
            'cfp1Label=Latitude cfp1=%.8f cfp2Label=Longitude cfp2=%.8f cfp3Label=Priority '
            'cfp3=%d cfp4Label=Reputation cfp4=%d') % (
               timestamp, config.deviceHost, config.deviceVendor, config.deviceProduct, action, action,
               description, ip, source, city, country, latitude, longitude, priority, reputation
           )


def start(feedlist, db_add, db_del, db_equal):
    """ Begins scraping URLs and building reputation DB entities.
    :param feedlist: list of dictionary elements containing type, url, description
    :param db_add: RepDB of new entries
    :param db_del: RepDB of removed entries
    :param db_equal: RepDB of unchanged entries (updated)
    :return:
    """
    for i in feedlist:
        print("Processing {0} from {1}".format(i['description'], i['url']))
        build_db(i['type'], i['url'], i['description'], db_add, db_del, db_equal)


def process(db_add, db_del, db_equal):
    """ Processes RepDB entries in order for syslog, stdout, csv file, etc

    :param repDB db_add: RepDB entry to show added items
    :param repDB db_del: RepDB entry to show deleted items
    :param repDB db_equal: RepDB entry to show unchanged values
    """

    # fun toy for heatmaps later
    f = open('cache/coords.txt', 'w')
    count_add = 0
    count_del = 0
    count_equal = 0

    for line in db_add:

        for i in line:
            count_add += 1
            msg = buildcef('add', i)
            syslog(msg)
            if config.debug:
                printjson('add', i)
            f.write("%s %s\n" % (i['latitude'], i['longitude']))

    for line in db_del:
        for i in line:
            count_del += 1
            msg = buildcef('delete', i)
            if config.debug:
                printjson('delete', i)
            syslog(msg)

    for line in db_equal:
        for i in line:
            count_equal += 1
            msg = buildcef('update', i)
            syslog(msg)
            if config.debug:
                printjson('update', i)
            f.write("%s %s\n" % (i['latitude'], i['longitude']))

    f.close()
    print("Sent {0} New, {1} deleted, and {2} unchanged entries to {3}:{4}".format(
        count_add, count_del, count_equal, config.host, config.port))

# Only run code if invoked directly: This allows a user to import modules without having to run through everything
if __name__ == "__main__":
    _db_add = []
    _db_del = []
    _db_equal = []

    start(feeds, _db_add, _db_del, _db_equal)
    process(_db_add, _db_del, _db_equal)