Skip to content

Instantly share code, notes, and snippets.

@rhamaa
Created May 13, 2021 14:06
Show Gist options
  • Save rhamaa/4443da506885b36eb64e521a10a14244 to your computer and use it in GitHub Desktop.
Save rhamaa/4443da506885b36eb64e521a10a14244 to your computer and use it in GitHub Desktop.
Read Apache logs using pandas
# Source : https://mmas.github.io/read-apache-access-log-pandas
# https://mmas.github.io/analyze-apache-access-log-pandas
import re
import pandas as pd
from datetime import datetime
import pytz
from IPython import embed
def parse_str(x):
    """
    Return `x` without its first and last characters (its delimiters).

    Example:
    `>>> parse_str('[my string]')`
    `'my string'`

    A missing value (`None`) is returned as `None` so that it stays a missing
    value in the resulting data instead of being replaced by a placeholder
    string. Strings shorter than two characters yield `''`.
    """
    if x is None:
        return None
    return x[1:-1]
def parse_datetime(x):
    '''
    Parse a bracketed access-log timestamp into a timezone-aware datetime.

    The expected input is formatted as:
    `[day/month/year:hour:minute:second zone]`
    Example:
    `>>> parse_datetime('[13/Nov/2015:11:45:42 +0000]')`
    `datetime.datetime(2015, 11, 13, 11, 45, 42, tzinfo=datetime.timezone.utc)`

    The UTC offset is parsed by `datetime.strptime` itself (`%z`), so its sign
    applies to the hours and the minutes alike (`-0530` is minus 5h30m). The
    returned `tzinfo` is a standard-library `datetime.timezone`.

    Returns `None` when `x` is missing or does not match the format, rather
    than substituting a made-up timestamp.
    '''
    try:
        # x[1:-1] drops the surrounding square brackets.
        return datetime.strptime(x[1:-1], '%d/%b/%Y:%H:%M:%S %z')
    except (TypeError, ValueError):
        # TypeError: x is not a string (e.g. None).
        # ValueError: the text does not match the timestamp format.
        return None
def parse_int(x):
    """
    Convert `x` to an int when it consists solely of decimal digits.

    Anything else (for example the `-` placeholder, an empty string, or a
    non-string value) is returned unchanged.

    `str.isdecimal` is used rather than `str.isnumeric` because the latter is
    also true for characters such as fractions and superscripts, which
    `int()` cannot convert.
    """
    if isinstance(x, str) and x.isdecimal():
        return int(x)
    return x
# Load the access log into a DataFrame.
# The separator is a whitespace character that is neither inside a
# double-quoted field (an even number of `"` follows it up to the end of the
# line) nor inside a square-bracketed field (no `]` follows it without a `[`
# in between). A regex separator needs the python parsing engine.
data = pd.read_csv(
    'logs-example/pasarsedekah_access.log-20210503',
    engine='python',
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    header=None,
    # Columns 1 and 2 of each line are not loaded.
    usecols=[0, 3, 4, 5, 6, 7, 8],
    names=['ip', 'time', 'request', 'status', 'size', 'referer', 'user_agent'],
    na_values='-',
    converters=dict(
        time=parse_datetime,
        request=parse_str,
        status=parse_int,
        size=parse_int,
        referer=parse_str,
        user_agent=parse_str,
    ),
)

# Open an interactive IPython shell with `data` in scope for exploration.
embed()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment