Created
June 25, 2015 16:42
-
-
Save shoaibi/9c5a08ef6bf49e71ce32 to your computer and use it in GitHub Desktop.
A small little thttpd log parser script I did as an assignment.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# My very first python script :) | |
""" Pre-res: | |
require python-sqlite | |
""" | |
""" Possible Improvments | |
Support for long arguments switches | |
""" | |
import sqlite3, os, sys, getopt, time, datetime, tempfile | |
# Global variables | |
callDictionary = None | |
callIndex = None | |
class parser: | |
# class level variables used throughout the query functions | |
limit = None | |
hours = None | |
days = None | |
lastEpoch = 1234 | |
conn = None | |
cursor = None | |
# initlizations | |
def __init__(self, serverLog, db): | |
global callDictionary | |
try: | |
# initialize call dictionary | |
callDictionary = { 'gMCBIP' : self.getMostConnectionsByIP, 'gMSCBIP': self.getMostSuccessfulConnectionsByIP, 'gMCRCWIP' : self.getMostCommonResultCodesWithIPs, 'gMCERCWIP' : self.getMostCommonErrorResultCodesWithIPs , 'gMBSTWIP' : self.getMostBytesSentToWithIPs} | |
# connect to database | |
self.conn = sqlite3.connect(db) | |
self.cursor = self.conn.cursor() | |
# creating the table that keep track of files with mtime to not keep populating db again and again for same file with no change | |
self.cursor.execute('create table if not exists logfiles (path, mtime)') | |
stats = os.stat(serverLog) | |
#print 'Modification time:',stats.st_mtime | |
filePath = os.path.abspath(serverLog) | |
#print 'File Path: ' , filePath | |
self.cursor.execute('select count(path) from logfiles where path=? and mtime=?', (filePath, stats.st_mtime)) | |
data = self.cursor.fetchall()[0][0] | |
#print data | |
if data == 0 : | |
""" Ok, seems like | |
- This log file wasn't used with the script before | |
- Someone changed something in log file | |
- Some deleted the db file or modified the logfiles table | |
so now we need to re-populate the db. | |
""" | |
print 'Populating Database' | |
print 'Depending upon the size of log file this could take several minutes' | |
self.cursor.execute('drop table if exists log') | |
self.cursor.execute('create table log (id, ip, username, date, time, gmt, epoch, method, urlrequested, statuscode, bytes, referer, useragent)') | |
self.conn.commit() | |
numLog = len(open(serverLog).readlines()) | |
recordId = 0 | |
for line in open(serverLog, 'r'): | |
#print 'Inserting record id # ' , recordId | |
#print 'Line' | |
#print line | |
if line.strip() == '': | |
continue | |
data = [] | |
a = line.split('"') | |
#print a | |
line = line.split() | |
#print 'Line after split on space' | |
#print line | |
data.append(recordId) #record number | |
data.append(line[0]) #ip | |
data.append(line[2]) #username | |
data.append(line[3][1:line[3].index(":")]) #date | |
data.append(line[3][line[3].index(":") + 1:]) #time | |
data.append(line[4][:-1]) #gmt | |
data.append(self.getEpochCustom(line[3][1:])) #epoch | |
data.append(line[5][1:]) #method | |
data.append(line[6]) #urlrequested | |
data.append(line[8]) #statuscode | |
data.append(line[9]) #bytes | |
data.append(line[10][1:-1]) #referer | |
data.append(a[-2]) #useragent | |
#print 'Data' | |
#print data | |
self.cursor.execute('insert into log values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9], data[10], data[11], data[12])) | |
self.conn.commit() | |
recordId += 1 | |
# once the data has been parsed and saved in db, log the filename with mtime in db for saving time in future | |
self.cursor.execute('delete from logfiles') | |
self.cursor.execute('insert into logfiles values(?, ?)', (filePath, stats.st_mtime)) | |
self.conn.commit() | |
print 'Database table populated with latest log entries from log file specified' | |
# get the lastEpoch from db | |
self.cursor.execute('select max(epoch) from log') | |
self.lastEpoch = self.cursor.fetchall()[0][0] | |
# intentionally not closing connection here, it would be closed by the specific functions | |
except sqlite3.Error, error: | |
print str(error) | |
exit() | |
# get epoch from thttpd date time format | |
def getEpochCustom(self, dateTime): | |
#print 'got ' + dateTime | |
dateFormat = '%d/%b/%Y:%H:%M:%S' | |
timeTuple = time.strptime(dateTime, dateFormat) | |
return self.getEpoch(timeTuple) | |
#get epoch from standard datetime | |
def getEpoch(self, dateTime): | |
return time.mktime(dateTime) | |
# Return the epoch for lookup | |
def getEpochwithDifference( self, value, isHours): | |
value = value * 60 * 60 | |
if isHours == False : | |
value = value * 24 | |
return self.lastEpoch - value | |
# Sets criteria such as limit of records returned, hours and days limits to consider | |
def setCriteria(self, limit, hours, days): | |
self.limit = limit | |
self.hours = hours | |
self.days = days | |
self.sanityChecks() | |
# Checks critera values for sanity | |
def sanityChecks(self): | |
error = 0 | |
if self.limit < 1 and self.limit != None : | |
print 'Invalid value specified for limit' | |
error = True | |
if (self.hours > 23 or self.hours < 1) and self.hours != None : | |
print 'Invalid value specified for hours' | |
error = True | |
if self.days < 1 and self.days != None : | |
print 'Invalid value specified for days' | |
error = True | |
if error == True : | |
exit() | |
# returns the sql critera strings against the time ones set already | |
def getTimeCriteria(self): | |
criteria = '' | |
epochSearch = self.lastEpoch+1 | |
if self.hours != None : | |
epochSearch = self.getEpochwithDifference(self.hours , True) | |
elif self.days != None : | |
epochSearch = self.getEpochwithDifference(self.days , False) | |
criteria += ' epoch < ' + str(epochSearch) | |
return criteria | |
# returns the sql criteria strings against limit | |
def getLimitCriteria(self): | |
if self.limit != None : | |
return ' LIMIT 0,'+ str(self.limit) | |
return '' | |
# prints most Connections with IPs | |
def getMostConnectionsByIP(self, successfulOnly = False): | |
statusCriteria = '' | |
if successfulOnly == True : | |
statusCriteria = ' statuscode = \'200\' AND ' | |
query = 'SELECT ip, COUNT(ip) AS connections FROM log WHERE ' + statusCriteria + self.getTimeCriteria() | |
query += ' GROUP BY ip ORDER BY connections DESC ' + self.getLimitCriteria() | |
#print query | |
self.cursor.execute(query) | |
self.printNice(self.cursor.fetchall()) | |
self.closeDb() | |
# prints most Successful connections with IPs | |
def getMostSuccessfulConnectionsByIP(self): | |
self.getMostConnectionsByIP(True) | |
# prints most common results codes with IPs | |
def getMostCommonResultCodesWithIPs(self, errorOnly = False): | |
statusCriteria = '' | |
if errorOnly == True : | |
statusCriteria = ' statuscode LIKE \'4%\' OR statuscode LIKE \'5%\' AND ' | |
query = 'SELECT ip, statuscode FROM log WHERE ' + statusCriteria + self.getTimeCriteria() | |
query += ' GROUP BY ip HAVING COUNT(statuscode)>0 ORDER BY statuscode DESC ' + self.getLimitCriteria() | |
#print query | |
self.cursor.execute(query) | |
self.printNice(self.cursor.fetchall()) | |
self.closeDb() | |
# prints most Common error result code with IPs | |
def getMostCommonErrorResultCodesWithIPs(self): | |
self.getMostCommonResultCodesWithIPs(True) | |
# prints most bytes sent to an IP | |
def getMostBytesSentToWithIPs(self): | |
query = 'SELECT ip, SUM(bytes) AS data FROM log WHERE ' + self.getTimeCriteria() | |
query += ' GROUP BY ip HAVING data > 0 ORDER BY data DESC ' + self.getLimitCriteria() | |
#print query | |
self.cursor.execute(query) | |
self.printNice(self.cursor.fetchall()) | |
self.closeDb() | |
# prints output | |
def printNice(self, data) : | |
for ip, value in data: | |
print '%s \t\t=>\t\t %s ' % ( ip, value) | |
# closes database | |
def closeDb(self): | |
self.cursor.close() | |
self.conn.close() | |
# calls the function required | |
def fire(self): | |
global callIndex | |
global callDictionary | |
callDictionary[callIndex]() | |
def usage(): | |
print '\n\tUsage:' | |
print '\tlog_sum.py [-n N] [-h H|-d D] [-c|-2|-r|-F|-t] <filename>' | |
def qualiferUsage(): | |
print '\tError: Argument collision or Invalid number of arguments specified' | |
usage() | |
exit() | |
# Sets callIndex later used to determine which function to fire | |
def setCallIndex(opt): | |
global callIndex | |
global callDictionary | |
# if callIndex is already set then user must have passed two from the [-c|-2|-r|-F|-t] | |
if callIndex != None : | |
qualiferUsage() | |
# assign approperiate Index crossponding to CallDictionary | |
if opt == '-c': | |
callIndex = 'gMCBIP' | |
elif opt == '-2': | |
callIndex = 'gMSCBIP' | |
elif opt == '-r': | |
callIndex = 'gMCRCWIP' | |
elif opt == '-F': | |
callIndex = 'gMCERCWIP' | |
elif opt == '-t': | |
callIndex = 'gMBSTWIP' | |
def main(argv): | |
log = None | |
db = os.path.dirname(os.path.abspath(__file__)) + '/log.db' | |
limit = None | |
hours = None | |
days = None | |
tmpLogFile = None | |
global callIndex | |
global callDictionary | |
# get the commandline args | |
try: | |
opts, args = getopt.getopt(argv, "n:h:d:c2rFt", []) # pass the comma separate list of long argument switches if required | |
# unsupported arg supplied | |
except getopt.GetoptError: | |
qualiferUsage() | |
for opt, arg in opts: | |
if opt == '-n': | |
limit = arg | |
elif opt == '-h': | |
hours = arg | |
elif opt == '-d': | |
days = arg | |
elif opt in ('-c', '-2', '-r', '-F', '-t'): | |
setCallIndex(opt) | |
# user either didn't specify any of the output qualifiers or specified day as well as hour | |
if callIndex == None or (hours != None and days != None) : | |
qualiferUsage() | |
# get the log file name specified at the end | |
log = ''.join(args) | |
# if log file name is empty, complain about it | |
if log.strip() == '': | |
qualiferUsage() | |
# just some debugging information | |
#print '\tSource: ' + str(log) | |
#print '\tLimit: ' + str(limit) | |
#print '\tHours: ' + str(hours) | |
#print '\tDays: ' + str(days) | |
#print '\tcallIndex: ' + str(callIndex) | |
#print '\tlog: ' + str(log) | |
# if user specific -, read from stdin | |
if log.strip() == '-': | |
# reading data from user and saving in a temp file | |
userInputData = sys.stdin.read() | |
tmpLogFile = tempfile.NamedTemporaryFile() | |
log = tmpLogFile.name | |
tmpLogFile.write(userInputData) | |
# start the real work here | |
p = parser(log, db) | |
p.fire() | |
# making sure that tmp file gets deleted at the end | |
if tmpLogFile != None : | |
tmpLogFile.close() | |
# send args to main, don't send first e.g. index 0 as thats script name | |
if __name__ == "__main__": | |
main(sys.argv[1:]) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment