Skip to content

Instantly share code, notes, and snippets.

@danbirken
Created October 18, 2013 20:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danbirken/7047504 to your computer and use it in GitHub Desktop.
Save danbirken/7047504 to your computer and use it in GitHub Desktop.
import collections
import csv
import re
import subprocess
import sys
# Matches one access-log line and captures four groups, in order:
#   1. the leading client IP (digits and dots),
#   2. the HTTP status code (only the codes listed in the alternation match),
#   3. the byte-count field that follows the status (digits, or '-'),
#   4. the contents of the last double-quoted field on the line, which the
#      rest of this script treats as the user agent.
# The literals are raw strings so the regex escapes \. and \- are not parsed
# as (invalid) string escapes; the compiled pattern text is unchanged.
USER_AGENT_RE = re.compile(
    r'^([0-9\.]+) .* (206|301|304|200|404|416|500) '
    r'([0-9\-]+) ".*" "(.*)"$'
)
# Cache of completed DNS verifications: IP address -> bool verdict.
# NOTE(review): the key is the IP alone, so a verdict reached for one
# reverse_dns_ending is reused for any other ending -- confirm this is intended.
IP_TO_VALID = {}


def _run_host(name):
    """Run ``host <name>`` and return everything it wrote to stdout as text."""
    process = subprocess.Popen(
        ['host', name],
        stdout=subprocess.PIPE,
        # Text mode, so the output is str (not bytes) on Python 3 as well.
        universal_newlines=True,
    )
    return process.communicate()[0]


def _last_tokens(output):
    """Return the last whitespace-separated token of each non-blank line."""
    return [line.split()[-1] for line in output.splitlines() if line.strip()]


def validate_ip(ip, reverse_dns_ending, known_good_prefix=None, lookup=None):
    """Check that *ip* belongs to a crawler via forward-confirmed reverse DNS.

    The IP is looked up to obtain its hostname(s); each hostname ending in
    *reverse_dns_ending* is looked up in turn, and the IP is accepted only if
    one of those lookups lists the original IP again.

    Args:
        ip: The address to verify, as a string.
        reverse_dns_ending: Suffix the reverse-DNS name must end with
            (e.g. ``'.googlebot.com.'``, including the trailing dot that
            ``host`` prints).
        known_good_prefix: Optional string prefix; an uncached IP starting
            with it is accepted without any lookup (and is not cached).
        lookup: Optional callable ``name -> output text`` used instead of
            running the ``host`` command. Defaults to running ``host``.

    Returns:
        True if the IP is verified, False otherwise. Results of real lookups
        are memoised in ``IP_TO_VALID``.
    """
    if ip in IP_TO_VALID:
        return IP_TO_VALID[ip]

    if known_good_prefix and ip.startswith(known_good_prefix):
        return True

    if lookup is None:
        lookup = _run_host

    # Reverse step: keep only names under the expected domain. Empty or
    # error output simply yields no candidates instead of raising IndexError.
    hostnames = [
        token for token in _last_tokens(lookup(ip))
        if token.endswith(reverse_dns_ending)
    ]

    # Forward step: a name may resolve to several records (one per output
    # line), so accept the IP if it appears on any of them, not just the last.
    verdict = any(
        ip in _last_tokens(lookup(hostname)) for hostname in hostnames
    )
    IP_TO_VALID[ip] = verdict
    return verdict
def tally_user_agents(lines, line_re=None, validate=None):
    """Aggregate request counts and byte totals per user agent.

    Only lines with status ``'200'`` and a numeric byte count are counted.
    User agents claiming to be bingbot or Googlebot whose IP fails DNS
    verification are relabelled with a ``'SPOOF, '`` prefix. Every counted
    request is also added to the synthetic ``'TOTAL'`` entry.

    Args:
        lines: Iterable of raw log lines.
        line_re: Compiled pattern yielding the groups (ip, status, bytes,
            user agent). Defaults to the module's ``USER_AGENT_RE``.
        validate: Callable with ``validate_ip``'s signature. Defaults to the
            module's ``validate_ip``.

    Returns:
        Dict mapping user agent -> ``(request_count, total_bytes)``.

    Raises:
        ValueError: If a non-blank line does not match *line_re*.
    """
    # Resolved at call time so this function has no import-order dependency.
    if line_re is None:
        line_re = USER_AGENT_RE
    if validate is None:
        validate = validate_ip

    # Running [count, bytes] per key; avoids keeping every size in memory.
    totals = collections.defaultdict(lambda: [0, 0])
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            # A stray blank line (e.g. at end of file) is not an error.
            continue

        match = line_re.match(line)
        if match is None:
            # Explicit raise: an assert would be stripped under ``python -O``.
            raise ValueError('unparseable log line: %r' % line)

        ip, response, request_bytes, user_agent = match.groups()
        if response != '200' or request_bytes == '-':
            continue

        if 'bingbot' in user_agent:
            if not validate(ip, '.search.msn.com.', '157.5'):
                user_agent = 'SPOOF, %s' % user_agent
        if 'www.google.com/bot.html' in user_agent:
            if not validate(ip, '.googlebot.com.'):
                user_agent = 'SPOOF, %s' % user_agent

        size = int(request_bytes)
        for key in (user_agent, 'TOTAL'):
            totals[key][0] += 1
            totals[key][1] += size

    return {key: tuple(pair) for key, pair in totals.items()}


def main(log_path='bus_logs'):
    """Read *log_path* and write a per-user-agent CSV summary to stdout.

    Rows are ordered by (requests, bytes), largest first.
    """
    # The context manager guarantees the log file is closed.
    with open(log_path) as log_file:
        user_agent_to_totals = tally_user_agents(log_file)

    writer = csv.writer(sys.stdout)
    writer.writerow(['requests', 'bytes', 'user agent'])
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    ranked = sorted(
        user_agent_to_totals.items(), key=lambda item: item[1], reverse=True)
    for user_agent, (requests, byte_count) in ranked:
        writer.writerow([requests, byte_count, user_agent])


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment