Skip to content

Instantly share code, notes, and snippets.

@marsam
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marsam/df4b02f92644afbee2b1 to your computer and use it in GitHub Desktop.
Save marsam/df4b02f92644afbee2b1 to your computer and use it in GitHub Desktop.
topvisits: a solution to the MaxMind dev-hire-homework.
*.gz
*.log
*.mmdb
from setuptools import setup
# Distribution metadata for the single-module ``topvisits`` project.
setup(
    name='topvisits',
    version='0.0.1',
    license='MIT',
    # The project ships one top-level module rather than a package.
    py_modules=['topvisits'],
    include_package_data=True,
    install_requires=['Click', 'geoip2'],
    tests_require=['pytest'],
    # Registers a ``topvisits`` console command bound to ``topvisits.main``.
    entry_points="""
[console_scripts]
topvisits=topvisits:main
""",
)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# flake8: noqa
import pytest
import topvisits
# Each case pairs a raw access-log line with the (ip, request-line) tuple
# that the parametrized test expects ``topvisits.ip_reqline`` to return.
ip_expectations = [
    (
        '183.60.212.148 - - [26/Aug/2014:06:26:39 -0600] "GET /entry/15205 HTTP/1.1" 200 4865 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"',
        ('183.60.212.148', 'GET /entry/15205 HTTP/1.1'),
    ),
    (
        '65.55.219.79 - - [26/Aug/2014:06:26:57 -0600] "GET /robots.txt HTTP/1.1" 301 178 "-" "msnbot-UDiscovery/2.0b (+http://search.msn.com/msnbot.htm)"',
        ('65.55.219.79', 'GET /robots.txt HTTP/1.1'),
    ),
    (
        '37.58.100.142 - - [26/Aug/2014:06:27:08 -0600] "GET /entry/near/0%2C0/filter?unit=mile;distance=25;sort_order=ASC;page=;order_by=distance;address=34034;limit= HTTP/1.1" 200 6192 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.0; +http://ahrefs.com/robot/)"',
        ('37.58.100.142', 'GET /entry/near/0%2C0/filter?unit=mile;distance=25;sort_order=ASC;page=;order_by=distance;address=34034;limit= HTTP/1.1'),
    ),
    (
        # A line whose authuser field is a name rather than "-".
        '104.131.236.236 - mailto [26/Aug/2014:10:40:21 -0600] "GET / HTTP/1.1" 301 178 "-" "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0"',
        ('104.131.236.236', 'GET / HTTP/1.1'),
    ),
]
@pytest.mark.parametrize("line, expected", ip_expectations)
def test_ip_reqline(line, expected):
    """Check that a sample log line parses to its expected (ip, request) pair."""
    parsed = topvisits.ip_reqline(line)
    assert parsed == expected
# (path, expected verdict) pairs for ``topvisits.is_ignorable``.
# Negative cases are included so that an implementation which flags every
# path as ignorable cannot pass the test.
ignorable_expectations = [
    ('/favicon.ico', True),
    ('/entry-images/4372/4372-2495-small.jpg', True),
    ('/robots.txt', True),
    ('/entry/15205', False),
    ('/', False),
]
@pytest.mark.parametrize("path, ignorable", ignorable_expectations)
def test_is_ignorable(path, ignorable):
    """Check the ignore verdict that ``is_ignorable`` gives for a request path."""
    verdict = topvisits.is_ignorable(path)
    assert verdict == ignorable
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
topvisits
=========
Given a logfile with the Apache `combined log format`_ return the top countries for visitors.
Solution of maxmind `dev-hire-homework`_.
.. _combined log format: https://httpd.apache.org/docs/1.3/logs.html#combined
.. _dev-hire-homework: https://github.com/maxmind/dev-hire-homework/
"""
from __future__ import print_function
import re
from collections import Counter
import click
import geoip2.database
# Matches the leading fields of a Common/Combined Log Format line:
#
#     host ident authuser [timestamp] "request-line" ...
#
# * ``ip``  - the client address; accepts dotted IPv4 and colon-separated
#             IPv6 literals (the character class no longer admits parentheses).
# * ident and authuser are matched as arbitrary non-space tokens, so lines
#   with a real ident or with user names such as ``john.doe`` or an e-mail
#   address are no longer rejected.
# * ``req`` - the text between the first pair of double quotes after the
#             bracketed timestamp.
REGEXP = re.compile(
    r'(?P<ip>[0-9A-Fa-f:.]+) \S+ \S+ \[[^\]]*\] "(?P<req>.*?)"'
)
# Compiled patterns for request paths that should not count as visits
# (static assets, images, feeds, robots.txt, favicon).
#
# This is materialized as a tuple on purpose: on Python 3 ``map()`` returns a
# one-shot iterator, which would be exhausted by the first scan and make every
# later lookup see an empty pattern collection.
IGNORE_PATH_REGEXP = tuple(
    re.compile(pattern)
    for pattern in (
        r'^/[a-f0-9]+/css/',
        r'^/[a-f0-9]+/images/',
        r'^/[a-f0-9]+/js/',
        r'^/entry-images/',
        r'^/images/',
        r'^/user-images/',
        r'^/static/',
        # Dots are escaped so they match a literal "." only.
        r'^/robots\.txt',
        r'^/favicon\.ico',
        r'.*\.atom',
        r'.*\.rss',
    )
)
def ip_reqline(line):
    """Return the ``(ip, request-line)`` pair parsed from a log line.

    The line is matched against ``REGEXP`` from its start; ``None`` is
    returned when it does not match.
    """
    found = REGEXP.match(line)
    if found is None:
        return None
    return found.group('ip', 'req')
def is_ignorable(path):
    """Tell whether *path* matches any pattern in ``IGNORE_PATH_REGEXP``.

    Patterns are applied with ``match``, i.e. starting at the beginning of
    *path*. Returns ``True`` on the first hit and ``False`` if none match.
    """
    return any(pattern.match(path) for pattern in IGNORE_PATH_REGEXP)
def country_from_ip(reader, ip):
    """Return the ``country.name`` of the record that *reader* yields for *ip*.

    *reader* only needs a ``country(ip)`` method whose result exposes
    ``.country.name``.
    """
    return reader.country(ip).country.name
@click.command(help=__doc__)
@click.option('--mmdb', default='GeoLite2-Country.mmdb', type=click.Path(exists=True, dir_okay=False), help='Path to the MaxMind database.')
@click.option('--top', default=10, type=click.INT, help='Number of top countries.')
# Text mode: the parsing regexes are str patterns, so the log must be read as
# text. Undecodable bytes are replaced instead of aborting the run.
@click.argument('logfile', type=click.File('r', encoding='utf-8', errors='replace'))
def main(mmdb, top, logfile):
    """Tally visits per country from *logfile* and print the *top* countries.

    :param mmdb: path to the MaxMind country database file.
    :param top: how many countries to print.
    :param logfile: open text stream of access-log lines.

    Lines that cannot be parsed, whose request line has no path, or whose
    path is ignorable are skipped. Addresses the database cannot resolve are
    counted under ``'Unknown'``.
    """
    reader = geoip2.database.Reader(mmdb)
    country_counter = Counter()
    try:
        for line in logfile:
            parsed = ip_reqline(line)
            if parsed is None:
                # Blank or malformed line: nothing to count.
                continue
            ip, reqline = parsed

            # A request line is "<method> <path> <protocol>"; tolerate a
            # missing protocol, but skip lines that carry no path at all.
            parts = reqline.split(None, 2)
            if len(parts) < 2:
                continue
            path = parts[1]
            if is_ignorable(path):
                continue

            try:
                country = country_from_ip(reader, ip)
            except (geoip2.errors.AddressNotFoundError, ValueError):
                # Address absent from the database, or not a valid IP.
                country = None
            country_counter[country or 'Unknown'] += 1
    finally:
        # Release the database file even if processing fails midway.
        reader.close()

    click.echo(click.style('Top {0} countries for visitors'.format(top), fg='green'))
    for country, count in country_counter.most_common(top):
        click.echo('{0}: {1}'.format(country, count))
# Run the command-line interface when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment