Skip to content

Instantly share code, notes, and snippets.

@francisco-ltech
Created May 15, 2019 07:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save francisco-ltech/db580f19482f2542ce620b871a7e265d to your computer and use it in GitHub Desktop.
Save francisco-ltech/db580f19482f2542ce620b871a7e265d to your computer and use it in GitHub Desktop.
Search for patterns in log files
# Check the last field of the tuple returned by the parse function:
# it returns (line, 0) for an unsuccessful parse,
# or
# (Row, 1) for a successful parse.
%python
# Regex for Apache common-log-format access lines. Groups:
#   1 host, 2 client identd, 3 user id, 4 timestamp, 5 HTTP method,
#   6 endpoint, 7 protocol, 8 response code (3 digits), 9 content size.
# Raw string so the regex escapes (\S, \d, \[, ...) are not treated as
# (invalid) Python string escapes.
APACHE_ACCESS_LOG_PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
import re
from pyspark.sql import Row
def parse(line):
    """Parse one Apache access-log line.

    Returns (Row, 1) when the line matches APACHE_ACCESS_LOG_PATTERN,
    or (line, 0) when it does not, so callers can split successes from
    failures on the second tuple element.
    """
    match = re.search(APACHE_ACCESS_LOG_PATTERN, line)
    if match is None:
        # Bug fix: the original returned the undefined name `logline`,
        # which raised NameError on every non-matching line.
        return (line, 0)
    # Group 9 is the content size; Apache logs '-' when no body was sent.
    size_field = match.group(9)
    size = 0 if size_field == '-' else int(size_field)
    return (Row(
        host=match.group(1),
        client_identd=match.group(2),
        user_id=match.group(3),
        date_time=match.group(4),
        method=match.group(5),
        endpoint=match.group(6),
        protocol=match.group(7),
        response_code=int(match.group(8)),
        content_size=size,
    ), 1)
# parseLogs builds on parse(): filter(lambda s: s[1] == 1) keeps the
# successfully parsed lines (access_logs), and filter(lambda s: s[1] == 0)
# keeps the lines that failed to parse (failed_logs).
def parseLogs():
    """Read and parse the log file.

    Returns a 3-tuple of RDDs:
      parsed_logs - every (value, flag) pair produced by parse()
      access_logs - Row objects for lines that parsed (flag == 1)
      failed_logs - raw lines that failed to parse (flag == 0)
    """
    # NOTE(review): relies on module-level `sc` (SparkContext) and
    # `logFile` (input path) being defined elsewhere in the notebook.
    parsed_logs = (sc
                   .textFile(logFile)
                   .map(parse)
                   .cache())
    access_logs = (parsed_logs
                   .filter(lambda s: s[1] == 1)
                   .map(lambda s: s[0]))
    failed_logs = (parsed_logs
                   .filter(lambda s: s[1] == 0)
                   .map(lambda s: s[0]))
    return parsed_logs, access_logs, failed_logs
# Entry point: build the cached RDD of parse results plus the
# success (access_logs) and failure (failed_logs) views.
parsed_logs, access_logs, failed_logs = parseLogs()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment