Last active March 23, 2016 16:39
You can use this Python script (plus pandas) to analyze file download counts from S3 access logs.
#!/usr/bin/env python
from __future__ import print_function
import os
import xml.etree.ElementTree as ET
from multiprocessing.dummy import Pool
import boto3
import requests
import pandas as pd
# --- Where the access logs live in S3 ---
BUCKET = 'mah-bukkit'
PREFIX = 'podcast_logs'

# --- Local artefacts, all resolved against the directory holding this script ---
PWD = os.path.dirname(__file__)
# LOG_DIR: directory of raw log files; PODCAST_DF: CSV built from those logs;
# GEOIP_DF: CSV of geocoded coordinates.
LOG_DIR, PODCAST_DF, GEOIP_DF = (
    os.path.abspath(os.path.join(PWD, _name))
    for _name in ('podcast_logs', 'podcast_logs.csv', 'geoip.csv')
)

# Number of worker threads used by the pools below. Tweak this as you see fit.
POOL_SIZE = 20
def get_object(work_tuple):
    """Fetch a single object from ``BUCKET``.

    ``work_tuple`` is an ``(s3_client, key)`` pair, packed into one argument
    so the function can be mapped over a pool of workers.

    Returns ``(key, response)``, where ``response`` is whatever the client's
    ``get_object`` call returned for that key.
    """
    client, object_key = work_tuple
    response = client.get_object(Bucket=BUCKET, Key=object_key)
    return object_key, response
def download_logs():
    """Download logs from S3 using Boto.

    Lists every object under ``PREFIX`` in ``BUCKET`` and fetches them
    concurrently with a pool of ``POOL_SIZE`` threads.

    Yields ``(key, response)`` tuples as produced by ``get_object``, in
    whatever order the downloads complete.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects')
    # A result page carries no 'Contents' entry when nothing matches the
    # prefix, so fall back to an empty list instead of raising KeyError.
    work_items = [(s3, obj['Key'])
                  for chunk in paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)
                  for obj in chunk.get('Contents', [])]

    pool = Pool(POOL_SIZE)
    try:
        for result in pool.imap_unordered(get_object, work_items):
            yield result
    finally:
        # Release the worker threads even if the consumer stops iterating
        # early or a download raises.
        pool.terminate()
def create_dataframe(log_dir=None):
    """Build one DataFrame from every log file in a directory.

    Each file is parsed as space-separated text without a header row and the
    columns are named after the access-log fields listed in ``cols``.

    Parameters:
        log_dir: directory containing the log files. Defaults to ``LOG_DIR``.

    Returns:
        The concatenation of all per-file frames, or an empty DataFrame with
        the same columns when the directory contains no files.
    """
    cols = ['BucketOwner', 'Bucket', 'Time', 'RemoteIP', 'Requester', 'RequestID',
            'Operation', 'Key', 'RequestURI', 'HTTPStatus', 'ErrorCode', 'BytesSent',
            'ObjectSize', 'TotalTime', 'TurnAroundTime', 'Referrer', 'UserAgent', 'VersionID']
    if log_dir is None:
        log_dir = LOG_DIR

    # os.listdir returns entries in arbitrary order; sort so the row order is
    # reproducible from run to run.
    filelist = sorted(os.listdir(log_dir))
    if not filelist:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame(columns=cols)

    df_list = [pd.read_table(os.path.join(log_dir, log_file), sep=' ', header=None,
                             index_col=False, names=cols)
               for log_file in filelist]
    return pd.concat(df_list)
def extract_lat_long(ip):
    """Look up the latitude and longitude of *ip* via the GeoIP web service.

    Sends a GET request to ``GEOIP_LOOKUP + ip`` and reads the
    ``./result/latitude`` and ``./result/longitude`` elements of the XML
    reply.

    Returns:
        A ``(lat, lng)`` tuple of floats, or ``None`` when the lookup fails
        (request error, non-200 status, unparseable XML, or missing or
        non-numeric coordinates).
    """
    import re

    # NOTE(review): GEOIP_LOOKUP is not defined in the visible part of this
    # file; it is assumed to be a module-level URL prefix -- confirm.
    try:
        # The timeout keeps one stalled lookup from blocking a pool worker
        # forever.
        response = requests.get(GEOIP_LOOKUP + ip, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None

    # Escape ampersands that are not already part of an entity; a bare '&'
    # makes the document unparseable. Work on bytes so the XML parser still
    # honours any encoding declared in the reply.
    xml_bytes = re.sub(br'&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)',
                       b'&amp;', response.content)
    try:
        result_xml = ET.fromstring(xml_bytes)
        lat = float(result_xml.find('./result/latitude').text)
        lng = float(result_xml.find('./result/longitude').text)
    except (ET.ParseError, AttributeError, TypeError, ValueError):
        # AttributeError: element missing; TypeError: element empty;
        # ValueError: text is not a number.
        return None
    return (lat, lng)
def geocode_ip_addresses(ips):
    """Geocode an iterable of IP addresses concurrently.

    Runs ``extract_lat_long`` over *ips* on a pool of ``POOL_SIZE`` threads.

    Yields:
        One ``(lat, lng)`` tuple per successfully geocoded address, in
        completion order. Addresses whose lookup returned ``None`` are
        skipped, so consumers can always unpack the result.
    """
    pool = Pool(POOL_SIZE)
    try:
        for result in pool.imap_unordered(extract_lat_long, ips):
            if result is not None:
                yield result
    finally:
        # Release the worker threads even if the consumer stops early or a
        # lookup raises.
        pool.terminate()
def build_geoip_map(df, rebuild_df=False):
    """Plot the geocoded location of every request in *df* on a world map.

    Coordinates are cached in the CSV file at ``GEOIP_DF``. They are
    geocoded again from ``df.RemoteIP`` when that file does not exist or
    when *rebuild_df* is true; otherwise the cached file is loaded.

    Parameters:
        df: DataFrame with a ``RemoteIP`` column.
        rebuild_df: force the coordinates to be geocoded again.

    matplotlib and basemap are imported here, not at module level, so
    the rest of the script works without them installed.
    """
    import matplotlib.pyplot as plt
    from mpl_toolkits.basemap import Basemap

    if rebuild_df or not os.path.exists(GEOIP_DF):
        raw_data = {'latitude': [], 'longitude': []}
        for coords in geocode_ip_addresses(df.RemoteIP):
            if coords is None:
                # Lookup failed for this address; nothing to plot.
                continue
            lat, lng = coords
            raw_data['latitude'].append(lat)
            raw_data['longitude'].append(lng)
        geodf = pd.DataFrame(raw_data)
        # Cache the coordinates so later runs can skip the lookups.
        geodf.to_csv(GEOIP_DF, index=False)
    else:
        geodf = pd.read_csv(GEOIP_DF)

    # Create a map, using the Gall stereographic cylindrical projection
    bmap = Basemap(projection='gall',
                   # with low resolution
                   resolution='l',
                   # And threshold 100000
                   area_thresh=100000.0,
                   # Centered at 0,0 (i.e null island)
                   lat_0=0, lon_0=0)
    # Draw the coastlines on the map
    bmap.drawcoastlines()
    # Draw country borders on the map
    bmap.drawcountries()
    # Fill the land with grey
    bmap.fillcontinents(color='#888888')
    # Draw the map boundaries
    bmap.drawmapboundary()
    # Define our longitude and latitude points
    # We have to use .values because of a weird bug when passing pandas data
    # to basemap.
    x, y = bmap(geodf['longitude'].values, geodf['latitude'].values)
    # Plot them using round markers of size 6
    bmap.plot(x, y, 'ro', markersize=6)
    # Show the map
    plt.show()
# Command-line entry point: optionally download the S3 access logs, build or
# load the podcast dataframe, then print download statistics and optionally
# show the geoip map.
if __name__ == '__main__':
    import argparse
    import io

    parser = argparse.ArgumentParser()
    parser.add_argument('--download-logs', '-d', dest='download_logs', action='store_true',
                        help='Download S3 access logs and create/update podcast dataframe.')
    parser.add_argument('--show-geoip-map', dest='show_geoip_map', action='store_true',
                        help='Build a map using Geocoded IP addresses of all the downloaded files.')
    parser.add_argument('--check-missing-files', dest='check_missing', action='store_true',
                        help='Check to see if any files that have been requested responded with 404 not found.')
    args = parser.parse_args()

    log_dir_exists = os.path.exists(LOG_DIR)
    if args.download_logs or not log_dir_exists:
        # A missing log directory forces a download, and with it a rebuild of
        # the dataframe and geoip cache further down.
        args.download_logs = True
        if not log_dir_exists:
            os.makedirs(LOG_DIR)
        for key, s3_obj in download_logs():
            file_name = os.path.basename(key)
            if not file_name:
                # A key ending in '/' has no file name to write to.
                continue
            # Decode once so the replace below works on text under both
            # Python 2 and Python 3.
            log_contents = s3_obj['Body'].read().decode('utf-8', 'replace')
            # This is to replace the timestamp brackets with quotes for
            # the panda's csv reader to understand
            log_contents = log_contents.replace('[', '"').replace(']', '"')
            # Always write into LOG_DIR (where create_dataframe reads from),
            # regardless of the current working directory.
            with io.open(os.path.join(LOG_DIR, file_name), 'w', encoding='utf-8') as log_file:
                log_file.write(log_contents)

    dataframe_exists = os.path.exists(PODCAST_DF)
    if not dataframe_exists or args.download_logs:
        df = create_dataframe()
        # Persist the dataframe so later runs can skip re-parsing the logs.
        df.to_csv(PODCAST_DF, index=False)
    else:
        df = pd.read_csv(PODCAST_DF)

    # Limit the dataframe to download operations only
    ddf = df[df.Operation == 'WEBSITE.GET.OBJECT']
    # Include only our actual podcast files (rows without a key never match)
    ddf = ddf[ddf.Key.str.contains('m4a', na=False)]

    if args.check_missing:
        print('======= MISSING Podcast Download Files =======\n')
        print(ddf[ddf.HTTPStatus == 404]['Key'].value_counts())

    # Exclude any missing audio file links
    ddf = ddf[ddf.HTTPStatus != 404]

    # Display download totals for the files
    total_download_series = ddf['Key'].value_counts()
    print('======= Podcast Download Totals =======\n')
    print(total_download_series)

    # Come up with simple UserAgent to give a basic idea of different agents:
    # keep only the part before the first '/'. assign() returns a new frame,
    # so the filtered slice is never modified in place.
    ddf = ddf.assign(SplitUA=ddf['UserAgent'].str.split('/').str.get(0))
    splitua_series = ddf['SplitUA'].value_counts()

    # Display basic UserAgent breakdown
    print('======= Downloads by UserAgent =======\n')
    print(splitua_series)

    if args.show_geoip_map:
        print("Generating geoip map based on downloads...")
        build_geoip_map(ddf, rebuild_df=args.download_logs)
If you want to generate the geoip map, you'll need matplotlib and basemap installed.

