Skip to content

Instantly share code, notes, and snippets.

@8bitben
Created September 19, 2018 17:46
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save 8bitben/689175621c621ac4caea529363962f57 to your computer and use it in GitHub Desktop.
Save 8bitben/689175621c621ac4caea529363962f57 to your computer and use it in GitHub Desktop.
import csv
import calendar
'''
CSV to CLF log file formatter
Ben Shepherd
Go Fish Digital
August 2018
'''
#Default locations, matching the file names this script has always used
DEFAULT_CSV_PATH = 'client-logs.csv'
DEFAULT_LOG_PATH = 'client-logs-formatted.log'

#Combined Log Format: the Common Log Format fields (including the response
#size after the status code) followed by the quoted referrer and user agent
COMBINED_LOG_LINE = (
    '{request_ip} {user_identifier} {user_id} {request_timestamp} '
    '"{http_request}" {http_status_code} {response_size} '
    '"{referrer}" "{user_agent}"\n'
)


def _clf_timestamp(date_text, time_text):
    '''
    Build the bracketed CLF timestamp from the separate date and time columns.

    date_text is split on '-' and read as year, month number, day; the month
    number is turned into its abbreviated name via calendar.month_abbr.
    time_text is inserted unchanged and the offset is always written as -0000.

    Raises ValueError if the month part is not an integer, and IndexError if
    the date has fewer than three parts or the month number is out of range.
    '''
    date_split = date_text.split('-')
    year = date_split[0]
    month = calendar.month_abbr[int(date_split[1])]
    day = date_split[2]
    return '[{}/{}/{}:{} {}]'.format(day, month, year, time_text, '-0000')


def _http_request(row):
    '''
    Build the request string from the method, uri and querystring columns.

    A querystring of '-' means there is none, so no '?' is appended.
    '''
    if row['querystring'] != '-':
        return '{} {}?{}'.format(row['method'], row['uri'], row['querystring'])
    return '{} {}'.format(row['method'], row['uri'])


def _format_log_line(row):
    '''
    Render one CSV row (a dict keyed by column name) as a single
    Combined Log Format line, including the trailing newline.

    Reads the columns requestip, date, time, method, uri, querystring,
    status, referrer and useragent; raises KeyError if one is missing.
    '''
    log_file_object = {
        'request_ip': row['requestip'],
        #User identifier not relevant for this project, no authentication used
        'user_identifier': '-',
        #Same as above
        'user_id': '-',
        #Date and time are separate columns in the source data, combine them
        'request_timestamp': _clf_timestamp(row['date'], row['time']),
        'http_request': _http_request(row),
        'http_status_code': row['status'],
        #Response size in bytes is not taken from the source data; it is
        #written as '-' so the field is still present in every line
        'response_size': '-',
        #Referrer -- this is specific to Combined LF, not spec'd in Common LF
        'referrer': row['referrer'],
        #UserAgent -- this is specific to Combined LF, not spec'd in Common LF
        #Data anomaly: spaces in the user agent show up as the literal text
        #'%2520' (a double-URL-encoded space), turn them back into spaces
        'user_agent': row['useragent'].replace('%2520', ' '),
    }
    return COMBINED_LOG_LINE.format(**log_file_object)


def convert_logs(csv_path=DEFAULT_CSV_PATH, log_path=DEFAULT_LOG_PATH, echo=True):
    '''
    Convert every row of the CSV at csv_path into a Combined Log Format line
    and write the lines to log_path (the file is overwritten).

    csv_path -- CSV file with a header row naming the columns listed in
                _format_log_line
    log_path -- output log file, opened for writing before the CSV is opened
    echo     -- when true, also print each line as it is written (debug aid)
    '''
    #newline='' lets the csv module do its own newline handling, as its
    #documentation requires for quoted fields that contain line breaks
    with open(log_path, 'w') as log_out, open(csv_path, newline='') as log_csv:
        for row in csv.DictReader(log_csv):
            combined_log_line_format = _format_log_line(row)
            if echo:
                #debug
                print(combined_log_line_format)
            #write
            log_out.write(combined_log_line_format)


#Only run the conversion when executed as a script, not when imported
if __name__ == '__main__':
    convert_logs()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment