Created
October 24, 2016 01:18
-
-
Save discobean/62cc6a89167d80822add34ee7d0adb29 to your computer and use it in GitHub Desktop.
Extracts nginx hostname from nginx log files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import re, sys, fileinput | |
class NginxLogParser:
    """Parse nginx access-log lines according to a ``log_format`` string.

    The constructor converts the format string into a regular expression:
    every ``$variable`` becomes one capture group, and the non-space
    character directly on each side of it (quotes, brackets, ...) is treated
    as that field's delimiter. ``parse_line`` then applies the regex to a log
    line and returns the captured values keyed by variable name.

    Attributes:
        parser: the format string with every directive replaced by its
            capture group (spaces still literal).
        regex: ``parser`` with each space replaced by ``\\s+``; this is the
            pattern used for matching.
        directive: maps variable name -> zero-based capture-group index.
        result: before the first successful parse, a copy of ``directive``;
            afterwards, a copy of the most recent successful parse.
    """

    # One ``$variable`` plus the optional single non-space character
    # immediately before and after it, e.g. the quotes in ``"$request"``.
    format_directive = r'(\S)?\$([\w_]+)(\S)?'

    def __init__(self, formatter):
        """Build the matching regex from the log format string *formatter*."""
        self.parser = formatter
        self.directive = {}
        matches = re.finditer(self.format_directive, formatter)
        for index, re_matched in enumerate(matches):
            left, variable, right = re_matched.groups()
            self.directive[variable] = index
            left = self.escape(left) if left else ''
            right = self.escape(right) if right else ''
            if left or right:
                # The field value is anything up to the next delimiter.
                excluded = right if left == right else left + right
                regex = left + '([^' + excluded + ']+)' + right
            else:
                # No delimiter: the field is a run of non-space characters.
                regex = r'([^\s]+)'
            self.parser = self.parser.replace(re_matched.group(0), regex, 1)
        # Tolerate any amount of whitespace wherever the format has a space.
        self.regex = self.parser.replace(' ', '\\s+')
        self.result = self.directive.copy()

    def escape(self, string):
        """Return *string* with regex metacharacters backslash-escaped.

        Backslash, ``^`` and ``$`` are escaped as well so that they can be
        used as field delimiters without corrupting the generated pattern.
        """
        return re.sub(r'[.*+?|()\[\]{}^$\\]', r'\\\g<0>', string)

    def parse_line(self, line):
        """Extract the fields of one log *line*.

        Returns:
            A new dict mapping each variable name to the text captured for
            it by the first match in *line*, or ``{}`` if the line does not
            match the format.
        """
        # re.search + groups() always yields a tuple; re.findall collapses to
        # plain strings when the pattern has exactly one group, which made
        # single-directive formats return only the first character.
        matched = re.search(self.regex, line)
        if not matched:
            return {}
        values = matched.groups()
        parsed = {}
        for name, index in self.directive.items():
            parsed[name] = values[index]
        # Keep ``result`` up to date for code that reads the attribute, but
        # hand the caller its own dict so later parses cannot overwrite it.
        self.result = parsed.copy()
        return parsed
# nginx log_format that the input lines are expected to follow.
# The X-Forwarded-For field is written as a real directive
# ("$http_x_forwarded_for"); without the "$" it was matched as literal text,
# so lines carrying an actual forwarded-for value never matched. As a
# directive it accepts any non-empty quoted value, the literal text included.
logformat = '$remote_addr $request_time $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" $request_length $bytes_sent "$http_x_forwarded_for" "$http_host" "$upstream_addr" "$http_origin"'
parser = NginxLogParser(logformat)

# Count requests per Host header across every file named on the command line
# (or stdin when none is given).
hosts = {}
for line in fileinput.input():
    parsed = parser.parse_line(line)
    # Lines that do not match the format (or have no host) are tallied
    # under the string 'None'.
    host = parsed.get('http_host') or 'None'
    hosts[host] = hosts.get(host, 0) + 1

# One "<host> <count>" row per host, host right-aligned in 30 columns.
for host, count in hosts.items():
    print("%30s %d" % (host, count))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment