Created
May 15, 2019 07:27
-
-
Save francisco-ltech/db580f19482f2542ce620b871a7e265d to your computer and use it in GitHub Desktop.
Search for patterns in log files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Check the last field of the tuple returned by the parse function:
#   (logline, 0) is returned when the line could not be parsed,
#   or
#   (parsed log line, 1) is returned on success.
%python | |
# Regular expression for one Apache access-log line. Nine capture groups, in
# order: three whitespace-free tokens, the bracketed timestamp with its
# +/-HHMM offset, the three parts of the quoted request (the third may be
# empty), the 3-digit status code and the size token.
# Written as a raw string so the regex escapes (\S, \[, \d, ...) are not
# treated as (invalid) Python string escapes.
APACHE_ACCESS_LOG_PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
import re | |
from pyspark.sql import Row | |
def parse(line):
    """Parse one Apache access-log line.

    Args:
        line: A single raw line from the access log.

    Returns:
        A 2-tuple whose last field flags the outcome:

        * ``(Row, 1)`` when ``line`` matches ``APACHE_ACCESS_LOG_PATTERN``.
          The Row has the fields host, client_identd, user_id, date_time,
          method, endpoint, protocol, response_code (int) and
          content_size (int; a ``-`` size is stored as 0).
        * ``(line, 0)`` when ``line`` does not match the pattern.

    Raises:
        ValueError: if the size token is neither ``-`` nor an integer.
    """
    match = re.search(APACHE_ACCESS_LOG_PATTERN, line)
    if match is None:
        # Hand back the raw input so failed lines can be inspected later.
        return (line, 0)

    size_field = match.group(9)
    # A size of '-' is recorded as zero bytes.
    size = 0 if size_field == '-' else int(size_field)

    return (Row(
        host=match.group(1),
        client_identd=match.group(2),
        user_id=match.group(3),
        date_time=match.group(4),
        method=match.group(5),
        endpoint=match.group(6),
        protocol=match.group(7),
        response_code=int(match.group(8)),
        content_size=size,
    ), 1)
# This function builds on the previous one: access_logs keeps the lines that parsed correctly, via filter(lambda s: s[1] == 1), and failed_logs keeps the lines that did not, via filter(lambda s: s[1] == 0).
def parseLogs():
    """Load the log file, parse every line, and split the results by outcome.

    Reads ``logFile`` through ``sc`` (both are looked up as globals), maps
    each line through ``parse`` and caches the mapped result, since it is
    filtered twice below.

    Returns:
        A 3-tuple ``(parsed_logs, access_logs, failed_logs)``:
        ``parsed_logs`` holds every ``(payload, flag)`` pair produced by
        ``parse``; ``access_logs`` holds the payloads whose flag is 1;
        ``failed_logs`` holds the payloads whose flag is 0.
    """
    raw_lines = sc.textFile(logFile)
    parsed_logs = raw_lines.map(parse).cache()

    # Flag 1 marks a successful parse; keep only the payload.
    succeeded = parsed_logs.filter(lambda pair: pair[1] == 1)
    access_logs = succeeded.map(lambda pair: pair[0])

    # Flag 0 marks a line that did not parse; keep only the payload.
    rejected = parsed_logs.filter(lambda pair: pair[1] == 0)
    failed_logs = rejected.map(lambda pair: pair[0])

    return parsed_logs, access_logs, failed_logs
# And finally, the call: run parseLogs() and bind its three results
# (all parsed pairs, the successful payloads, the failed payloads).
parsed_logs, access_logs, failed_logs = parseLogs()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment