A customized script (aka hack) to parse apache server logs and count stuff
## Parse gaggle apache server logs to compile usage stats
## ...with thanks to: https://github.com/lethain/apache-log-parser
## Track the number of accesses by IP address, accesses to Java Web Start launch
## files (.jnlp), and Subversion access.
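##
## Example usage (assuming this script is saved as parse_logs.py; the flags
## correspond to the argparse options defined in main() below):
##   python parse_logs.py access.log --whois --show-ip-cutoff 50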
import sys
import re
import subprocess
import argparse
class Counter:
    """A dictionary that keeps a counter for each key."""

    def __init__(self):
        self.counts = {}

    def incr(self, key):
        """Increment the counter"""
        if key in self.counts:
            self.counts[key] += 1
        else:
            self.counts[key] = 1
        return self.counts[key]

    def get(self, key):
        """Get the value of the counter for a key or 0 for a key we've never seen before"""
        if key in self.counts:
            return self.counts[key]
        else:
            return 0
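# Note: Python 2.7+ ships collections.Counter, which behaves much like the class
# above (unseen keys read as 0 and counts can be bumped with c[key] += 1).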
# a nasty regex to parse a line of apache server log
log_re = re.compile(r'(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) [-\w\d"\\]+ [-\w\d"\\]+\s+\[(?P<time>.*?)\] "(?P<cmd>.*?) (?P<uri>.*?) HTTP/\d.\d" (?P<status>\d+).*')
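# Example of a combined-format log line this regex is intended to match
# (the standard sample line from the Apache docs):
#   127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326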
def read_log(filename, whois_flag, show_ip_cutoff=-1):
    jnlp = 0
    svn = 0
    internal_jnlp = 0
    internal_svn = 0
    ip_counter = Counter()

    with open(filename, 'r') as f:
        for line in f:
            m = log_re.match(line)
            if not m:
                print "???:" + line
                continue
            ip = m.group('ip')
            uri = m.group('uri')
            time = m.group('time')
            status = m.group('status')

            # throw out search engine traffic
            if ip.startswith("66.249.68."): continue   # google
            if ip.startswith("66.249.67."): continue   # google
            if ip.startswith("66.249.72."): continue   # google
            if ip.startswith("67.195."): continue      # yahoo
            if ip.startswith("207.46."): continue      # msft
            if ip.startswith("65.52."): continue       # msft
            if ip.startswith("65.53."): continue       # msft
            if ip.startswith("65.54."): continue       # msft
            if ip.startswith("65.55."): continue       # msft
            if ip.startswith("157.54."): continue      # msft
            if ip.startswith("157.55."): continue      # msft
            if ip.startswith("157.56."): continue      # msft
            if ip.startswith("157.57."): continue      # msft
            if ip.startswith("157.58."): continue      # msft
            if ip.startswith("157.59."): continue      # msft
            if ip.startswith("157.60."): continue      # msft
            if ip.startswith("208.115.111."): continue # dotnetdotcom.org
            # spider75.yandex.ru
            if ip.startswith("87.250.252.") or ip.startswith("95.108.158.") or ip.startswith("87.250.254."): continue

            # internal traffic
            if ip.startswith("10.10") or ip.startswith('10.0.'):
                if uri.endswith(".jnlp"):
                    internal_jnlp += 1
                if uri.startswith("/svn/gaggle"):
                    internal_svn += 1
                ip_counter.incr("10.x.x.x")
                continue

            # count visits from each unique IP
            ip_counter.incr(ip)

            # count access to .jnlp's (java webstart launch)
            if uri.endswith(".jnlp"):
                jnlp += 1

            # count accesses to SVN from outside ISB
            if uri.startswith("/svn/gaggle"):
                svn += 1

    # sort IPs with the most hits to the top
    sorted_ips = sorted(ip_counter.counts.keys(), key=lambda k: ip_counter.counts[k], reverse=True)

    if show_ip_cutoff > -1:
        for ip in sorted_ips:
            if ip_counter.counts[ip] > show_ip_cutoff:
                if whois_flag:
                    org = whois(ip)
                    print "%s: %d (%s)" % (ip, ip_counter.counts[ip], org)
                else:
                    print "%s: %d" % (ip, ip_counter.counts[ip])

    print "-" * 90
    print "unique IPs: %d" % (len(ip_counter.counts))
    print "jnlp accesses: %d" % (jnlp)
    print "svn accesses: %d" % (svn)
    print "internal accesses: %d" % (ip_counter.get('10.x.x.x'))
    print "internal jnlp accesses: %d" % (internal_jnlp)
    print "internal svn accesses: %d" % (internal_svn)
# this just wraps the command line whois utility and greps for "orgname:"
# this only works about 1/2 the time
# there's also a pywhois library http://code.google.com/p/pywhois/
def whois(ip):
    p1 = subprocess.Popen(['whois', ip], stdout=subprocess.PIPE)
    p2 = subprocess.Popen(['grep', '-i', 'orgname:'], stdin=p1.stdout, stdout=subprocess.PIPE)
    p1.stdout.close()
    return p2.communicate()[0].strip()
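# If shelling out to grep is undesirable, the same filtering can be done in
# Python; a minimal sketch using a hypothetical helper whois_py (not wired
# into the script above):
#
#   def whois_py(ip):
#       out = subprocess.Popen(['whois', ip], stdout=subprocess.PIPE).communicate()[0]
#       return '\n'.join(l for l in out.splitlines() if 'orgname:' in l.lower()).strip()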
def main():
    parser = argparse.ArgumentParser(description='Process an apache log file and generate some usage statistics.')
    parser.add_argument('filename', metavar='FILENAME', help='an apache log file')
    parser.add_argument('--whois', action='store_true', default=False, help='run whois on high-usage IP addresses')
    parser.add_argument('--show-ip-cutoff', '-c', metavar='CUTOFF', type=int, default=-1, help='cutoff for showing high-usage IP addresses')
    args = parser.parse_args()

    print "Reading log file(s) at: " + args.filename
    read_log(args.filename, args.whois, args.show_ip_cutoff)

if __name__ == "__main__":
    sys.exit(main())