Skip to content

Instantly share code, notes, and snippets.

@discobean
Created October 24, 2016 01:18
Show Gist options
  • Save discobean/62cc6a89167d80822add34ee7d0adb29 to your computer and use it in GitHub Desktop.
Save discobean/62cc6a89167d80822add34ee7d0adb29 to your computer and use it in GitHub Desktop.
Extracts the HTTP host name (`$http_host`) from nginx access log files and counts the requests seen per host
#!/usr/bin/python
import re, sys, fileinput
class NginxLogParser(object):
    """Parse nginx access-log lines according to an nginx ``log_format`` string.

    The constructor turns the log format into a regular expression with one
    capture group per ``$variable`` directive; ``parse_line`` then applies
    that expression to a log line and returns the captured values by name.

    Attributes:
        parser: the format string with every directive replaced by its
            capture group (spaces still literal).
        regex: ``parser`` with each space replaced by ``\\s+``; this is the
            pattern text used for matching.
        directive: maps each variable name (without the ``$``) to the
            zero-based index of its capture group. If a name occurs more
            than once in the format, the last occurrence wins.
        result: the dict produced by the most recent successful
            ``parse_line`` call (initially a copy of ``directive``).
    """

    # Optional non-space character, "$name", optional non-space character.
    # The neighbouring characters act as delimiters (quotes, brackets, ...)
    # of the value being captured.
    format_directive = r'(\S)?\$([\w_]+)(\S)?'

    def __init__(self, formatter):
        """Build the matching regex from the nginx log format *formatter*."""
        self.parser = formatter
        self.directive = {}
        matches = re.finditer(self.format_directive, formatter)
        for index, re_matched in enumerate(matches):
            left, variable, right = re_matched.groups()
            self.directive[variable] = index
            left = self.escape(left) if left else ''
            right = self.escape(right) if right else ''
            if left or right:
                # The value runs up to the next delimiter, so the capture
                # group excludes the delimiter character(s) themselves.
                excluded = left + right if left != right else right
                regex = left + '([^' + excluded + ']+)' + right
            else:
                # Undelimited value: a run of non-whitespace characters.
                regex = r'([^\s]+)'
            # Replace only the first remaining occurrence so that repeated
            # directives are substituted one at a time, in order.
            self.parser = self.parser.replace(re_matched.group(0), regex, 1)
        # Tolerate any amount of whitespace wherever the format has a space.
        self.regex = self.parser.replace(' ', '\\s+')
        # Compile once here instead of on every parse_line call.
        self._compiled = re.compile(self.regex)
        self.result = self.directive.copy()

    def escape(self, string):
        """Return *string* with the characters ``.*+?|()[]{}`` backslash-escaped."""
        return re.sub(r'[.*+?|()\[\]{}]', r'\\\g<0>', string)

    def parse_line(self, line):
        """Extract the directive values from one log line.

        Returns a new dict mapping each variable name to the text captured
        for it, taken from the first place in *line* where the pattern
        matches. Returns an empty dict when the line does not match.
        """
        found = self._compiled.search(line)
        if not found:
            return {}
        # groups() always yields a tuple, even for a single capture group
        # (re.findall would return bare strings in that case, so indexing
        # it gave single characters). '' for non-participating groups keeps
        # the values strings, as findall did.
        values = found.groups('')
        # Build a fresh dict per call, keyed from self.directive, so that
        # results handed out earlier are never overwritten and caller-side
        # mutation of a result cannot break later calls.
        self.result = dict(
            (name, values[index]) for name, index in self.directive.items()
        )
        return self.result
# nginx log_format that the input files are expected to follow.
# NOTE(review): "http_x_forwarded_for" has no leading "$", so it is matched
# as literal text rather than captured as a variable -- confirm this mirrors
# the log_format in the nginx configuration that produced the logs.
logformat = '$remote_addr $request_time $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" $request_length $bytes_sent "http_x_forwarded_for" "$http_host" "$upstream_addr" "$http_origin"'
parser = NginxLogParser(logformat)

# Number of log lines seen per Host header value.
hosts = {}
# fileinput reads the files named on the command line, or stdin if none.
for line in fileinput.input():
    parsed = parser.parse_line(line)
    # Lines that do not match the format are counted under the host 'None'.
    host = parsed.get('http_host') or 'None'
    hosts[host] = hosts.get(host, 0) + 1

for host, count in hosts.items():
    # Single parenthesised argument: prints identically on Python 2 and 3.
    print("%30s %d" % (host, count))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment