Skip to content

Instantly share code, notes, and snippets.

@shoaibi
Created June 25, 2015 16:42
Show Gist options
  • Save shoaibi/9c5a08ef6bf49e71ce32 to your computer and use it in GitHub Desktop.
Save shoaibi/9c5a08ef6bf49e71ce32 to your computer and use it in GitHub Desktop.
A small little thttpd log parser script I did as an assignment.
#!/usr/bin/python
# My very first python script :)
""" Pre-res:
require python-sqlite
"""
""" Possible Improvments
Support for long arguments switches
"""
import sqlite3, os, sys, getopt, time, datetime, tempfile
# Global variables
callDictionary = None
callIndex = None
class parser:
# class level variables used throughout the query functions
limit = None
hours = None
days = None
lastEpoch = 1234
conn = None
cursor = None
# initlizations
def __init__(self, serverLog, db):
global callDictionary
try:
# initialize call dictionary
callDictionary = { 'gMCBIP' : self.getMostConnectionsByIP, 'gMSCBIP': self.getMostSuccessfulConnectionsByIP, 'gMCRCWIP' : self.getMostCommonResultCodesWithIPs, 'gMCERCWIP' : self.getMostCommonErrorResultCodesWithIPs , 'gMBSTWIP' : self.getMostBytesSentToWithIPs}
# connect to database
self.conn = sqlite3.connect(db)
self.cursor = self.conn.cursor()
# creating the table that keep track of files with mtime to not keep populating db again and again for same file with no change
self.cursor.execute('create table if not exists logfiles (path, mtime)')
stats = os.stat(serverLog)
#print 'Modification time:',stats.st_mtime
filePath = os.path.abspath(serverLog)
#print 'File Path: ' , filePath
self.cursor.execute('select count(path) from logfiles where path=? and mtime=?', (filePath, stats.st_mtime))
data = self.cursor.fetchall()[0][0]
#print data
if data == 0 :
""" Ok, seems like
- This log file wasn't used with the script before
- Someone changed something in log file
- Some deleted the db file or modified the logfiles table
so now we need to re-populate the db.
"""
print 'Populating Database'
print 'Depending upon the size of log file this could take several minutes'
self.cursor.execute('drop table if exists log')
self.cursor.execute('create table log (id, ip, username, date, time, gmt, epoch, method, urlrequested, statuscode, bytes, referer, useragent)')
self.conn.commit()
numLog = len(open(serverLog).readlines())
recordId = 0
for line in open(serverLog, 'r'):
#print 'Inserting record id # ' , recordId
#print 'Line'
#print line
if line.strip() == '':
continue
data = []
a = line.split('"')
#print a
line = line.split()
#print 'Line after split on space'
#print line
data.append(recordId) #record number
data.append(line[0]) #ip
data.append(line[2]) #username
data.append(line[3][1:line[3].index(":")]) #date
data.append(line[3][line[3].index(":") + 1:]) #time
data.append(line[4][:-1]) #gmt
data.append(self.getEpochCustom(line[3][1:])) #epoch
data.append(line[5][1:]) #method
data.append(line[6]) #urlrequested
data.append(line[8]) #statuscode
data.append(line[9]) #bytes
data.append(line[10][1:-1]) #referer
data.append(a[-2]) #useragent
#print 'Data'
#print data
#print
#print
self.cursor.execute('insert into log values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9], data[10], data[11], data[12]))
self.conn.commit()
recordId += 1
# once the data has been parsed and saved in db, log the filename with mtime in db for saving time in future
self.cursor.execute('delete from logfiles')
self.cursor.execute('insert into logfiles values(?, ?)', (filePath, stats.st_mtime))
self.conn.commit()
print 'Database table populated with latest log entries from log file specified'
# get the lastEpoch from db
self.cursor.execute('select max(epoch) from log')
self.lastEpoch = self.cursor.fetchall()[0][0]
# intentionally not closing connection here, it would be closed by the specific functions
except sqlite3.Error, error:
print str(error)
exit()
# get epoch from thttpd date time format
def getEpochCustom(self, dateTime):
#print 'got ' + dateTime
dateFormat = '%d/%b/%Y:%H:%M:%S'
timeTuple = time.strptime(dateTime, dateFormat)
return self.getEpoch(timeTuple)
#get epoch from standard datetime
def getEpoch(self, dateTime):
return time.mktime(dateTime)
# Return the epoch for lookup
def getEpochwithDifference( self, value, isHours):
value = value * 60 * 60
if isHours == False :
value = value * 24
return self.lastEpoch - value
# Sets criteria such as limit of records returned, hours and days limits to consider
def setCriteria(self, limit, hours, days):
self.limit = limit
self.hours = hours
self.days = days
self.sanityChecks()
# Checks critera values for sanity
def sanityChecks(self):
error = 0
if self.limit < 1 and self.limit != None :
print 'Invalid value specified for limit'
error = True
if (self.hours > 23 or self.hours < 1) and self.hours != None :
print 'Invalid value specified for hours'
error = True
if self.days < 1 and self.days != None :
print 'Invalid value specified for days'
error = True
if error == True :
exit()
# returns the sql critera strings against the time ones set already
def getTimeCriteria(self):
criteria = ''
epochSearch = self.lastEpoch+1
if self.hours != None :
epochSearch = self.getEpochwithDifference(self.hours , True)
elif self.days != None :
epochSearch = self.getEpochwithDifference(self.days , False)
criteria += ' epoch < ' + str(epochSearch)
return criteria
# returns the sql criteria strings against limit
def getLimitCriteria(self):
if self.limit != None :
return ' LIMIT 0,'+ str(self.limit)
return ''
# prints most Connections with IPs
def getMostConnectionsByIP(self, successfulOnly = False):
statusCriteria = ''
if successfulOnly == True :
statusCriteria = ' statuscode = \'200\' AND '
query = 'SELECT ip, COUNT(ip) AS connections FROM log WHERE ' + statusCriteria + self.getTimeCriteria()
query += ' GROUP BY ip ORDER BY connections DESC ' + self.getLimitCriteria()
#print query
self.cursor.execute(query)
self.printNice(self.cursor.fetchall())
self.closeDb()
# prints most Successful connections with IPs
def getMostSuccessfulConnectionsByIP(self):
self.getMostConnectionsByIP(True)
# prints most common results codes with IPs
def getMostCommonResultCodesWithIPs(self, errorOnly = False):
statusCriteria = ''
if errorOnly == True :
statusCriteria = ' statuscode LIKE \'4%\' OR statuscode LIKE \'5%\' AND '
query = 'SELECT ip, statuscode FROM log WHERE ' + statusCriteria + self.getTimeCriteria()
query += ' GROUP BY ip HAVING COUNT(statuscode)>0 ORDER BY statuscode DESC ' + self.getLimitCriteria()
#print query
self.cursor.execute(query)
self.printNice(self.cursor.fetchall())
self.closeDb()
# prints most Common error result code with IPs
def getMostCommonErrorResultCodesWithIPs(self):
self.getMostCommonResultCodesWithIPs(True)
# prints most bytes sent to an IP
def getMostBytesSentToWithIPs(self):
query = 'SELECT ip, SUM(bytes) AS data FROM log WHERE ' + self.getTimeCriteria()
query += ' GROUP BY ip HAVING data > 0 ORDER BY data DESC ' + self.getLimitCriteria()
#print query
self.cursor.execute(query)
self.printNice(self.cursor.fetchall())
self.closeDb()
# prints output
def printNice(self, data) :
print
for ip, value in data:
print '%s \t\t=>\t\t %s ' % ( ip, value)
# closes database
def closeDb(self):
self.cursor.close()
self.conn.close()
# calls the function required
def fire(self):
global callIndex
global callDictionary
callDictionary[callIndex]()
def usage():
print '\n\tUsage:'
print '\tlog_sum.py [-n N] [-h H|-d D] [-c|-2|-r|-F|-t] <filename>'
def qualiferUsage():
print '\tError: Argument collision or Invalid number of arguments specified'
usage()
exit()
# Sets callIndex later used to determine which function to fire
def setCallIndex(opt):
global callIndex
global callDictionary
# if callIndex is already set then user must have passed two from the [-c|-2|-r|-F|-t]
if callIndex != None :
qualiferUsage()
# assign approperiate Index crossponding to CallDictionary
if opt == '-c':
callIndex = 'gMCBIP'
elif opt == '-2':
callIndex = 'gMSCBIP'
elif opt == '-r':
callIndex = 'gMCRCWIP'
elif opt == '-F':
callIndex = 'gMCERCWIP'
elif opt == '-t':
callIndex = 'gMBSTWIP'
def main(argv):
log = None
db = os.path.dirname(os.path.abspath(__file__)) + '/log.db'
limit = None
hours = None
days = None
tmpLogFile = None
global callIndex
global callDictionary
# get the commandline args
try:
opts, args = getopt.getopt(argv, "n:h:d:c2rFt", []) # pass the comma separate list of long argument switches if required
# unsupported arg supplied
except getopt.GetoptError:
qualiferUsage()
for opt, arg in opts:
if opt == '-n':
limit = arg
elif opt == '-h':
hours = arg
elif opt == '-d':
days = arg
elif opt in ('-c', '-2', '-r', '-F', '-t'):
setCallIndex(opt)
# user either didn't specify any of the output qualifiers or specified day as well as hour
if callIndex == None or (hours != None and days != None) :
qualiferUsage()
# get the log file name specified at the end
log = ''.join(args)
# if log file name is empty, complain about it
if log.strip() == '':
qualiferUsage()
# just some debugging information
#print '\tSource: ' + str(log)
#print '\tLimit: ' + str(limit)
#print '\tHours: ' + str(hours)
#print '\tDays: ' + str(days)
#print '\tcallIndex: ' + str(callIndex)
#print '\tlog: ' + str(log)
# if user specific -, read from stdin
if log.strip() == '-':
# reading data from user and saving in a temp file
userInputData = sys.stdin.read()
print
tmpLogFile = tempfile.NamedTemporaryFile()
log = tmpLogFile.name
tmpLogFile.write(userInputData)
# start the real work here
p = parser(log, db)
p.fire()
# making sure that tmp file gets deleted at the end
if tmpLogFile != None :
tmpLogFile.close()
# send args to main, don't send first e.g. index 0 as thats script name
if __name__ == "__main__":
main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment