Skip to content

Instantly share code, notes, and snippets.

@ksn-developer
Last active April 10, 2023 10:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ksn-developer/4072a9e092bccf68559c21f1c5ac2de2 to your computer and use it in GitHub Desktop.
Save ksn-developer/4072a9e092bccf68559c21f1c5ac2de2 to your computer and use it in GitHub Desktop.
import re
import shlex
import pandas as pd
class Parser:
    """Extract selected fields from a single access-log line.

    A line is tokenised with ``shlex.split`` (so double-quoted fields stay
    together as one token) after the brackets around the first ``[...]``
    span have been removed. The class constants below are the positions of
    the fields of interest in the resulting token list.

    NOTE(review): the indices look like they target the Apache/Nginx
    "combined" log format (ip, ident, user, [time zone], "request", status,
    size, "referer", "user agent") - confirm against the logs being parsed.
    """

    # Token positions after the timestamp brackets have been removed.
    IP = 0
    TIME = 3
    TIME_ZONE = 4  # not read by parse_line; kept for callers that index tokens
    REQUESTED_URL = 5
    STATUS_CODE = 6
    USER_AGENT = 9

    # A "[...]" span; group 1 is the text between the brackets.
    _BRACKETED = re.compile(r"\[([^\]]*)\]")

    def parse_line(self, line):
        """Return a dict of the fields of interest found in ``line``.

        Only the first bracketed span on the line is unwrapped, so brackets
        that appear later (for example inside a quoted URL or user agent)
        are preserved instead of being stripped from the data.

        Args:
            line: One raw log line; a trailing newline is fine.

        Returns:
            A dict with the string-valued keys ``ip``, ``time``,
            ``status_code``, ``requested_url`` and ``user_agent``.

        Raises:
            IndexError: If the line has fewer tokens than ``USER_AGENT + 1``.
            ValueError: If ``shlex.split`` rejects the line, e.g. because of
                an unterminated quote.
        """
        # count=1 restricts the substitution to the first "[...]" span.
        unwrapped = self._BRACKETED.sub(r"\1", line, count=1)
        data = shlex.split(unwrapped)
        return {
            "ip": data[self.IP],
            "time": data[self.TIME],
            "status_code": data[self.STATUS_CODE],
            "requested_url": data[self.REQUESTED_URL],
            "user_agent": data[self.USER_AGENT],
        }
if __name__ == '__main__':
    # Script entry point: parse access.log into a DataFrame and print a few
    # summary views of it.
    parser = Parser()
    LOG_FILE = "access.log"
    with open(LOG_FILE, "r") as f:
        # Skip blank lines; they have no tokens and would make parse_line
        # raise IndexError.
        log_entries = [parser.parse_line(line) for line in f if line.strip()]

    # Fix the columns explicitly so the lookups below also work when the log
    # file is empty.
    logs_df = pd.DataFrame(
        log_entries,
        columns=["ip", "time", "status_code", "requested_url", "user_agent"],
    )
    print(logs_df.head())

    # All requests with status code 404 (status codes are kept as strings)
    not_found = logs_df.loc[logs_df["status_code"] == "404"]
    print(not_found)

    # Requests from unique ip addresses
    unique_ips = logs_df["ip"].unique()
    print(unique_ips)

    # Get all distinct user agents
    user_agents = logs_df["user_agent"].unique()
    print(user_agents)

    # Get most requested urls (value_counts sorts by descending count)
    url_counts = logs_df["requested_url"].value_counts().to_dict()
    print(url_counts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment