Skip to content

Instantly share code, notes, and snippets.

@0x4E0x650x6F
Last active June 30, 2018 17:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 0x4E0x650x6F/870a83e4068425b97b61664d6acfce30 to your computer and use it in GitHub Desktop.
Save 0x4E0x650x6F/870a83e4068425b97b61664d6acfce30 to your computer and use it in GitHub Desktop.
import re
import unittest
class LogParser(object):
    """Parse access-log lines into a dictionary of named fields.

    Some of the regular expressions are based on Django's validators.

    A line is expected to look like::

        <ipv4> - - [<date>] "<METHOD> /<host>[:port][/path][?query] HTTP/1.x" <status> <bytes>

    Note that the request target is expected to start with the host
    (IPv4 address, bracketed IPv6 literal or domain name) right after
    the leading slash.
    """

    # Dotted-quad IPv4 address; every octet is limited to 0-255.
    IPV4_RE = (
        r"(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)"
        r"(?:\.(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3}"
    )

    # Loose, lower-case, bracketed IPv6 literal (not fully validated).
    IPV6_RE = r"\[[0-9a-f:\.]+\]"

    # Lower-case domain name: first label, optional middle labels, TLD.
    # The middle-label character class used to contain a stray
    # "' + ul + r'" fragment, which made quotes, spaces and '+' valid
    # label characters; it is now restricted to [a-z0-9-].
    DOMAIN_RE = (
        r"[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?"
        r"(?:\.(?!-)[a-z0-9-]{1,63}(?<!-))*"
        r"\."
        r"(?!-)"
        r"(?:[a-z-]{2,63}"
        r"|xn--[a-z0-9]{1,59})"
        r"(?<!-)"
        r"\.?"
    )

    SERVER_RE = r"(?P<host>" + IPV4_RE + "|" + IPV6_RE + "|" + DOMAIN_RE + ")"

    # host, optional port, optional path, optional query/fragment.
    # Path and query are independent optional groups so that a URL with
    # a path but no query still matches; the path stops at the first
    # '?' or '#' so the split happens at the real query delimiter.
    URL_RE = (
        SERVER_RE
        + r"(?::\d{2,5})?"
        + r"(?P<path>/[^\s?#]*)?"
        + r"(?:[?#](?P<query>[^\s]*))?"
    )

    # One or more whitespace characters ("\s" already covers tabs; a
    # "|" inside a character class would be a literal pipe).
    SEPARATOR_RE = r"\s+"

    # Timestamp between the square brackets; '-' is allowed so that
    # negative timezone offsets such as "-0500" are accepted.
    DATE_RE = r"[a-zA-Z0-9/\s:+-]+"

    LOG_RE = (
        r"^(?P<ip>" + IPV4_RE + r")"
        + r"\s-\s-\s"
        + r"\[(?P<date>" + DATE_RE + r")\]"
        + SEPARATOR_RE
        + r"\"(?P<method>[A-Z]{3,7})"
        + SEPARATOR_RE
        + r"/(?P<url>" + URL_RE + r")"
        + SEPARATOR_RE
        + r"(?P<code>HTTP/1\.[0-1])\""
        + SEPARATOR_RE
        + r"(?P<status>[0-9]{3})"
        + SEPARATOR_RE
        + r"(?P<bytes>[0-9]{1,})"
    )

    def __init__(self):
        """Compile the log-line pattern once for reuse by ``parse``."""
        self.log_pattern = re.compile(LogParser.LOG_RE)

    def parse(self, line):
        """Split one log line into its named components.

        Args:
            line: the raw log line (a string).

        Returns:
            A dict with the keys ``ip``, ``date``, ``method``, ``url``,
            ``host``, ``path``, ``query``, ``code``, ``status`` and
            ``bytes`` (regex groups; optional ones are ``None`` when
            absent), plus ``paths`` (list of non-empty path segments,
            empty when there is no path) and ``raw`` (the input line).

        Raises:
            ValueError: if the line does not match the expected format.
        """
        match = self.log_pattern.match(line)
        if match is None:
            raise ValueError(
                "line does not match the expected log format: %r" % (line,)
            )

        found = match.groupdict()
        path = found.get("path")
        # Build a real list (not a lazy filter object) and drop the empty
        # segments produced by leading, trailing or doubled slashes.
        if path:
            paths = [segment for segment in path.split("/") if segment]
        else:
            paths = []
        found.update(paths=paths)
        found.update(raw=line)
        return found
class TestStringParsing(unittest.TestCase):
    """Check ``LogParser.parse`` against two sample access-log lines.

    The expected values are asserted instead of printed, so a regression
    makes the test fail rather than silently changing the output.
    """

    def test_parsing(self):
        """A domain host with a path and a query string is fully split."""
        line = ('11.22.33.44 - - [27/Jun/2018:01:19:44 +0000] '
                '"POST /api-lb.babel.photobox.com/babel/a/a?d=f&a=42 HTTP/1.1" 200 1160')
        log = LogParser().parse(line)

        self.assertEqual(log["ip"], "11.22.33.44")
        self.assertEqual(log["date"], "27/Jun/2018:01:19:44 +0000")
        self.assertEqual(log["method"], "POST")
        self.assertEqual(log["url"], "api-lb.babel.photobox.com/babel/a/a?d=f&a=42")
        self.assertEqual(log["host"], "api-lb.babel.photobox.com")
        self.assertEqual(log["path"], "/babel/a/a")
        # list() keeps the comparison valid whether "paths" is a list or
        # some other iterable of segments.
        self.assertEqual(list(log["paths"]), ["babel", "a", "a"])
        self.assertEqual(log["query"], "d=f&a=42")
        self.assertEqual(log["code"], "HTTP/1.1")
        self.assertEqual(log["status"], "200")
        self.assertEqual(log["bytes"], "1160")
        self.assertEqual(log["raw"], line)

    def test_parsing_ip_host(self):
        """An IPv4 host without a path yields no path and no segments."""
        line = ('11.22.33.44 - - [27/Jun/2018:01:19:44 +0000] '
                '"POST /11.22.33.44?d=f&a=42 HTTP/1.1" 401 1160')
        log = LogParser().parse(line)

        self.assertEqual(log["ip"], "11.22.33.44")
        self.assertEqual(log["date"], "27/Jun/2018:01:19:44 +0000")
        self.assertEqual(log["method"], "POST")
        self.assertEqual(log["url"], "11.22.33.44?d=f&a=42")
        self.assertEqual(log["host"], "11.22.33.44")
        self.assertIsNone(log["path"])
        self.assertEqual(list(log["paths"]), [])
        self.assertEqual(log["query"], "d=f&a=42")
        self.assertEqual(log["code"], "HTTP/1.1")
        self.assertEqual(log["status"], "401")
        self.assertEqual(log["bytes"], "1160")
        self.assertEqual(log["raw"], line)
if __name__ == "__main__":
    # Executed as a script: hand control to unittest's command-line runner.
    unittest.main()
@0x4E0x650x6F
Copy link
Author

output:

=============== TEST 1 ================

status :200
paths :['babel', 'a', 'a']
code :HTTP/1.1
url :api-lb.babel.photobox.com/babel/a/a?d=f&a=42
ip :11.22.33.44
bytes :1160
raw :11.22.33.44 - - [27/Jun/2018:01:19:44 +0000] "POST /api-lb.babel.photobox.com/babel/a/a?d=f&a=42 HTTP/1.1" 200 1160
host :api-lb.babel.photobox.com
path :/babel/a/a
date :27/Jun/2018:01:19:44 +0000
query :d=f&a=42
method :POST

=============== TEST 2 ================

status :401
paths :[]
code :HTTP/1.1
url :11.22.33.44?d=f&a=42
ip :11.22.33.44
bytes :1160
raw :11.22.33.44 - - [27/Jun/2018:01:19:44 +0000] "POST /11.22.33.44?d=f&a=42 HTTP/1.1" 401 1160
host :11.22.33.44
path :None
date :27/Jun/2018:01:19:44 +0000
query :d=f&a=42
method :POST

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment