#!/usr/bin/python
import sys
import getopt
import datetime

#
# Parse Nepenthes logged_submissions file (from standard input) and generate statistics
# 	Version: 0.5 (WIP)
# 	Date:	 2009-05-17	
#
# Author: Andrew Waite (aka RoleReversal)
# http://www.infosanity.co.uk
#

# Change Log:
#	Version 0.1 (2009-05-17):
#		Import logged_submissions from stdin
#
#	Version 0.2 (2009-05-17):
#		Added support for commandline control via getopt
#
#	Version 0.3 (2009-05-17):
#		Added basic HTML output of statistics
#
#	Version 0.4 (2009-05-17):
#		Added additional statistics:
#			first/last date in log
#			number of days
#			avg submissions per day
#
#	Version 0.5 (2009-05-17):
#		Refactored statistic generation into stats class
#		Added output of 'recent' malware samples
#		update HTML to include all stats
#		Cleaned up help and version output
	
#Global vars
# output: report format selector. parseOpts() stores the value given to the
#	output option here; Stats.out() writes HTML when it equals "html" and
#	plain text for any other value (including the default False).
output = False
# verbose: verbosity flag. Nothing in this file reads it.
verbose = False

#class stores logged submission entries
class Submission:
	"""A single entry taken from a logged_submissions file.

	The constructor stores its five arguments unchanged as attributes:
	date, time, sourceIP, sourceURL and malwareMD5.
	"""

	def __init__( self, date, time, sourceIP, sourceURL, malwareMD5):
		self.date, self.time = date, time
		self.sourceIP, self.sourceURL = sourceIP, sourceURL
		self.malwareMD5 = malwareMD5

	def out(self):
		"""Return the five stored fields as one string separated by ', '."""
		fields = (
			self.date,
			self.time,
			self.sourceIP,
			self.sourceURL,
			self.malwareMD5,
		)
		return ', '.join(fields)

#/End class def

#class  generates and stores submission stats
class Stats:
	"""Collects logged_submissions entries and derives summary statistics.

	Intended call order (as used by main()): parseLogged_Submissions() to
	load entries from stdin, generateStats() to count unique samples and
	source IPs, then out() to write the report to stdout.
	"""

	def __init__(self):
		self.submissions = 0  # number of entries parsed
		self.samples = 0  # number of distinct MD5 hashes
		self.sourceIPs = 0  # number of distinct source IPs
		self.firstDate = None  # earliest entry date as 'YYYY-MM-DD', or None
		self.lastDate = None  # latest entry date as 'YYYY-MM-DD', or None
		self.submissionList = []  # parsed entries, in input order

	def incNumSubmissions(self):
		"""Add one to the submission counter."""
		self.submissions += 1

	def getNumSubmissions(self):
		"""Return the number of submissions counted so far."""
		return (self.submissions)

	def incNumSamples(self):
		"""Add one to the unique-sample counter."""
		self.samples += 1

	def getNumSamples(self):
		"""Return the number of unique samples counted so far."""
		return (self.samples)

	def incNumSourceIPs(self):
		"""Add one to the unique-source-IP counter."""
		self.sourceIPs += 1

	def getNumSourceIPs(self):
		"""Return the number of unique source IPs counted so far."""
		return (self.sourceIPs)

	def _toDate(self, text):
		"""Convert a 'YYYY-MM-DD' string into a datetime.date.

		Raises ValueError when a part is missing, non-numeric or out of range.
		"""
		year, month, day = [int(part) for part in text.split('-')[:3]]
		return datetime.date(year, month, day)

	def getNumDays(self):
		"""Return the number of days between firstDate and lastDate.

		The value is returned as a string (e.g. '10') because callers format
		it with %s or pass it to int(). Returns '0' when no dates have been
		recorded yet or when both dates are the same day.
		"""
		if not self.firstDate or not self.lastDate:
			return '0'
		span = self._toDate(self.lastDate) - self._toDate(self.firstDate)
		# Use timedelta.days rather than parsing str(timedelta): for a zero-day
		# span the string form is '0:00:00', which contains no day count.
		return str(span.days)

	def getAvgSubmissionsPerDay(self):
		"""Return the whole-number average of submissions per day.

		A span shorter than one day is treated as one day so that a log
		covering a single date does not divide by zero. Floor division keeps
		the integer result this script produces under Python 2.
		"""
		days = int(self.getNumDays())
		if days < 1:
			days = 1
		return self.submissions // days

	def _parseLine(self, fields):
		"""Build a (Submission, datetime.date) pair from one split log line.

		fields is the whitespace-split line: field 0 is the bracketed
		'<date>T<time>' timestamp, field 1 the source IP, field 4 the source
		URL and field 5 the MD5 hash. Returns None when the line has too few
		fields or its timestamp cannot be parsed.
		"""
		if len(fields) < 6:
			return None
		date, sep, time = fields[0].strip('[]').partition('T')
		if not sep:
			return None
		try:
			day = self._toDate(date)
		except ValueError:
			return None
		return Submission(date, time, fields[1], fields[4], fields[5]), day

	def parseLogged_Submissions(self):
		"""Read log lines from stdin until EOF and record each entry.

		Every well-formed line is appended to submissionList and counted;
		firstDate/lastDate track the earliest and latest date seen, so the
		input does not have to be in chronological order. Blank lines are
		ignored; malformed lines are reported on stderr and skipped.
		"""
		firstDay = None
		lastDay = None
		if self.firstDate:
			firstDay = self._toDate(self.firstDate)
		if self.lastDate:
			lastDay = self._toDate(self.lastDate)

		# readline() returns '' only at EOF, which ends the iteration.
		for line in iter(sys.stdin.readline, ''):
			# split() with no argument also drops the trailing newline, so the
			# MD5 field compares equal whether or not the line ends in one.
			fields = line.split()
			if not fields:
				continue

			parsed = self._parseLine(fields)
			if parsed is None:
				sys.stderr.write("Skipping malformed log line: %s\n" % line.rstrip())
				continue
			sub, day = parsed

			self.submissionList.append(sub)
			self.incNumSubmissions()
			if firstDay is None or day < firstDay:
				firstDay = day
				self.firstDate = sub.date
			if lastDay is None or day > lastDay:
				lastDay = day
				self.lastDate = sub.date

	def generateStats(self):
		"""Count distinct MD5 hashes and distinct source IPs in submissionList.

		The counters are assigned rather than incremented, so calling this
		more than once does not inflate the totals.
		"""
		uniqueSamples = set()
		sourceIPs = set()
		for entry in self.submissionList:
			uniqueSamples.add(entry.malwareMD5)
			sourceIPs.add(entry.sourceIP)
		self.samples = len(uniqueSamples)
		self.sourceIPs = len(sourceIPs)

	def getRecent(self, entries=5):
		"""Return up to `entries` of the latest submissions, newest first.

		Similar to tail, but reversed. Returns fewer items when the list is
		shorter than `entries`, and an empty list when `entries` is not
		positive or nothing has been recorded.
		"""
		if entries <= 0:
			return []
		return list(reversed(self.submissionList[-entries:]))

	def out(self):
		"""Write the report to stdout, as HTML when the global output is "html"."""
		if output == "html":
			self._outHtml()
		else:
			self._outText()

	def _outHtml(self):
		"""Write the statistics to stdout as a small HTML page."""
		# Imported here so the file's import block is untouched. Log fields
		# originate from remote hosts, so they are escaped before being
		# embedded in markup.
		from xml.sax.saxutils import escape
		quoteEntities = {'"': '&quot;', "'": '&#39;'}

		write = sys.stdout.write
		write("<html>\n")
		write("<head>\n")
		write("<title>Nepenthes Stats - By InfoSanity.co.uk</title>\n")
		write("</head>\n")
		write("<body>\n")
		write("<H1>Nepenthes Honeypot statistics</H1>\n")
		write("Total number of submissions: %i<br/>\n" % self.getNumSubmissions())
		write("Number of unique* malware samples: %i<br/>\n" % self.getNumSamples())
		write("Number of unique source IP address: %i<br/>\n" % self.getNumSourceIPs())
		write("<br/>\n")
		write("Earliest sample seen: %s<br/>\n" % escape(str(self.firstDate)))
		write("Latest sample seen: %s<br/>\n" % escape(str(self.lastDate)))
		write("Days running: %s<br/>\n" % self.getNumDays())
		write("Average number of daily submissions: %s<br/>\n" % self.getAvgSubmissionsPerDay())
		write("<br/>\n")

		write("Most recent submissions:<br/>\n")
		for sub in self.getRecent():
			write("%s<br/>\n" % escape(sub.out().rstrip('\n'), quoteEntities))

		write("<br/>\n")
		write("N.B. Unique samples based on unique md5 hashes<br/>\n")
		write('Statistics engine by Andrew Waite - <a href="http://www.infosanity.co.uk">Infosanity</a>\n')
		write("</body>\n")
		write("</html>\n")

	def _outText(self):
		"""Write the statistics to stdout as plain text."""
		write = sys.stdout.write
		write("Statistics engine written by Andrew Waite - www.InfoSanity.co.uk\n\n")
		write("Number of submissions: %i\n" % self.getNumSubmissions())
		write("Number of unique samples: %i\n" % self.getNumSamples())
		write("Number of unique source IPs: %i\n" % self.getNumSourceIPs())

		write("\n")
		write("First sample seen on %s\n" % self.firstDate)
		write("Last sample seen on %s\n" % self.lastDate)
		write("Days running: %s\n" % self.getNumDays())
		write("Average daily submissions: %s\n" % self.getAvgSubmissionsPerDay())

		write("\n")
		write("Most recent submissions:\n")
		for sub in self.getRecent():
			# Entries no longer carry the log line's newline, so end each
			# line explicitly (rstrip guards against entries that still do).
			write("\t %s\n" % sub.out().rstrip('\n'))
#/End class def

#parse commandline options
def parseOpts():
	"""Parse sys.argv and set the module-level `output` and `verbose` flags.

	Recognised options:
		-h, --help              print the usage text and exit
		-V, --version           print version information and exit
		                        (--Version is still accepted)
		-o FMT, --output=FMT    store FMT in the global `output`
		-v                      set the global `verbose` to True

	An unrecognised option prints the getopt error followed by the usage
	text and exits with status 2.
	"""
	#Tutorial at http://docs.python.org/library/getopt.html used as basis
	global output, verbose
	try:
		# "o:" - the colon marks -o as taking an argument. The long option is
		# listed in both spellings: "version" matches the usage text,
		# "Version" is what earlier releases of this script accepted.
		opts, args = getopt.getopt(sys.argv[1:], "ho:vV", ["help", "output=", "version", "Version"])
	except getopt.GetoptError as err:
		print(str(err))
		usage()
		sys.exit(2)

	for o, a in opts:
		if o == "-v":
			verbose = True
		elif o in ("-h", "--help"):
			usage()
			sys.exit()
		elif o in ("-o", "--output"):
			output = a
		elif o in ("-V", "--version", "--Version"):
			version()
			sys.exit()
		else:
			# getopt only returns declared options, so this is a programming
			# error; raise explicitly so it is not stripped under -O.
			raise AssertionError("unhandled option: %s" % o)

def usage():
	"""Write the help text (purpose, typical invocation, options) to stdout."""
	helpLines = (
		"Parses Nepenthes logged_submissions file (read from stdin) to generate statistics\n",
		"Written by Andrew Waite, www.infosanity.co.uk\n",
		"\n",
		"Typical usage:\n",
		"\tcat /var/log/nepenthes/logged_submissions | ./submissions2stats.py\n",
		"\n",
		"Options:\n",
		"\t -h, --help \t\t You're reading it\n",
		"\t -V, --version \t\t Display version info\n",
		"\t --output=[html] \t Force HTML output for web presentation, default is text\n",
	)
	sys.stdout.write("".join(helpLines))

def version():
	"""Write the script name, version number, author and homepage to stdout."""
	sys.stdout.write("submissions2stats.py version 0.5\n")
	sys.stdout.write("--\n")
	# The author line needs its own newline, otherwise the URL is glued to it.
	sys.stdout.write("Andrew Waite\n")
	# Forward slashes: "\\" in a string literal prints a backslash, not "//".
	sys.stdout.write("http://www.infosanity.co.uk\n")
	

def main():
	"""Script entry point: read options, load the log, then print the report.

	Steps run in order: parseOpts() handles the command line, then a Stats
	object parses stdin, generates its statistics and writes them out.
	"""
	parseOpts()

	report = Stats()
	report.parseLogged_Submissions()
	report.generateStats()
	report.out()

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
	main()
