Skip to content

Instantly share code, notes, and snippets.

@pmeyerson
Last active November 14, 2017 22:05
Show Gist options
  • Save pmeyerson/fefea9c608bf2de37661e7a0a49f0ebc to your computer and use it in GitHub Desktop.
Save pmeyerson/fefea9c608bf2de37661e7a0a49f0ebc to your computer and use it in GitHub Desktop.
Parse Exchange 2010 SMTP receive connector logs into CSV format.
#!/usr/bin/python
import os
import csv
import sys
# Parse Exchange 2010 SMTP receive connector logs into one CSV file per log.
#
# Input line format (see the "#Fields:" header inside each log):
#   date-time,connector-id,session-id,sequence-number,local-endpoint,
#   remote-endpoint,event,data,context
# Note that the CONTEXT field can contain additional commas.
# Event markers: "+" session start, "-" session end, ">" / "<" echo of sent or
# received data, "*" explanatory log output (ignored by this script).
logdir = "exhub_logs/"
outdir = "output/"

input_headers = ["date-time", "connector-id", "session-id", "sequence-number",
                 "local-endpoint", "remote-endpoint", "event", "data"]
output_headers = ["date-time", "connector-id", "session-id",
                  "local-endpoint", "remote-endpoint", "data"]

# Log preamble lines that carry no session data.
HEADER_MARKERS = ("#Software:", "#Version:", "#Log-type:", "#Date:", "#Fields:")
SESSION_FIELD = 2   # index of session-id in a split line
EVENT_FIELD = 6     # index of the event marker in a split line
MIN_FIELDS = EVENT_FIELD + 1  # a usable line must reach the event field


def find_log_files(directory):
    """Return the paths of all files below *directory*.

    Directories are visited in os.walk order and file names are sorted within
    each directory. Full paths are returned so that files found in
    subdirectories can actually be opened.
    """
    paths = []
    for root, _dirs, files in os.walk(directory):
        for filename in sorted(files):
            paths.append(os.path.join(root, filename))
    return paths


def is_header_line(ln):
    """Return True when *ln* is one of the '#...' preamble lines of a log."""
    return any(marker in ln for marker in HEADER_MARKERS)


def clean_line(ln):
    """Strip the line terminator and all double quotes from a raw log line."""
    # rstrip handles both "\n" and "\r\n"; a literal "\r\n" replace never
    # matches when the file is read with universal newlines.
    return ln.rstrip('\r\n').replace('"', '')


def process_line(ln, partialevents, eventlist, throwout, source=""):
    """Fold one raw log line into the session dictionaries.

    ln            -- raw line as read from the log file
    partialevents -- dict session-id -> fields of sessions still open; it is
                     shared across files because a session may span two logs
    eventlist     -- dict session-id -> fields of sessions completed in the
                     current file
    throwout      -- dict collecting lines that could not be attributed
    source        -- name of the file being parsed, used in warnings only

    A session is stored as the first seven fields of its "+" line followed by
    one list (event, data, context...) per subsequent "<", ">" or "-" line.
    """
    ln = clean_line(ln)
    line_parts = ln.split(',')

    if len(line_parts) >= MIN_FIELDS:
        session_id = line_parts[SESSION_FIELD]
        event = line_parts[EVENT_FIELD]
        if '+' in event:
            # First sequence of a new session.
            partialevents[session_id] = line_parts[:MIN_FIELDS]
        elif '-' in event:
            if session_id in partialevents:
                # Closing sequence: the session is complete, move it over.
                session = partialevents.pop(session_id)
                session.append(line_parts[EVENT_FIELD:])
                eventlist[session_id] = session
            else:
                # Session end without a recorded start; keep it so that it is
                # reported as incomplete at the end of the run.
                partialevents[session_id] = line_parts
        elif '>' in event or '<' in event:
            # Append to an existing session; data for unknown sessions is
            # dropped.
            if session_id in partialevents:
                partialevents[session_id].append(line_parts[EVENT_FIELD:])
    elif len(line_parts) < 3:
        print("no session id? + " + str(source) + ln)
        throwout.setdefault('no_session_id', []).append(ln)
    else:
        # Too short to contain an event field; report it and carry on.
        print("no session id")
        print(ln + '\n')


def session_row(session):
    """Flatten one completed session into a row matching output_headers.

    Every event/data/context token of the session is joined with single
    spaces into the final "data" column; empty tokens are skipped.
    """
    date_time, connector_id, session_id, _sequence, local_ep, remote_ep = \
        session[:EVENT_FIELD]
    tokens = []
    for entry in session[MIN_FIELDS:]:
        tokens.extend(part.strip() for part in entry)
    data = " ".join(token for token in tokens if token)
    return [date_time, connector_id, session_id, local_ep, remote_ep, data]


def write_sessions(path, eventlist):
    """Write the completed sessions in *eventlist* to the CSV file *path*."""
    parent = os.path.dirname(path)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    with open(path, 'w') as csvfile:
        writer = csv.writer(csvfile, lineterminator='\n')
        writer.writerow(output_headers)
        for session_id in eventlist:
            writer.writerow(session_row(eventlist[session_id]))


def main():
    """Parse every log under logdir and write one CSV per log into outdir."""
    filelist = find_log_files(logdir)
    print("found " + str(len(filelist)) + " log files to parse\n")

    partialevents = {}  # sessions still open, carried over between files
    throwout = {}       # lines that could not be attributed to a session
    session_count = 0

    for path in filelist:
        eventlist = {}  # sessions completed while reading this file
        print("opening file " + path)
        with open(path) as openfile:
            for ln in openfile:
                if is_header_line(ln):
                    continue
                try:
                    process_line(ln, partialevents, eventlist, throwout, path)
                except Exception:
                    exc_type, _exc_obj, exc_tb = sys.exc_info()
                    # Report the innermost frame, where the error occurred.
                    while exc_tb.tb_next is not None:
                        exc_tb = exc_tb.tb_next
                    print("error " + str(exc_type) + " at line "
                          + str(exc_tb.tb_lineno))
                    print(ln)
                    sys.exit(1)
        print("finished parsing")

        print("\ngetting ready to write file\n")
        out_path = os.path.join(outdir, os.path.basename(path) + '.csv')
        write_sessions(out_path, eventlist)

        session_count += len(eventlist)
        print("will start next file with " + str(len(partialevents))
              + " incomplete sessions")

    print("\nfinished wrote " + str(session_count)
          + " smtp sessions across all output files\n")
    print("found " + str(len(partialevents)) + " incomplete sessions")
    print(partialevents)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment