
@deybhayden
Last active March 23, 2016 16:39
You can use this Python script (plus pandas) to analyze file download counts.
#!/usr/bin/env python
from __future__ import print_function
import os
import xml.etree.ElementTree as ET
from multiprocessing.dummy import Pool
import boto3
import requests
import pandas as pd
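# Pipeline: fetch S3 access logs in parallel, normalize them into a pandas
# dataframe, then report download totals, user agents, missing files, and
# (optionally) a geocoded map of downloads.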
BUCKET = 'mah-bukkit'
PREFIX = 'podcast_logs'
PWD = os.path.dirname(__file__)
LOG_DIR = os.path.abspath(os.path.join(PWD, 'podcast_logs'))
PODCAST_DF = os.path.abspath(os.path.join(PWD, 'podcast_logs.csv'))
GEOIP_DF = os.path.abspath(os.path.join(PWD, 'geoip.csv'))
GEOIP_LOOKUP = 'http://api.geoiplookup.net/?query='
POOL_SIZE = 20 # 20 threads in the pool. Tweak this as you see fit.
def get_object(work_tuple):
    (s3, key) = work_tuple
    obj = s3.get_object(Bucket=BUCKET, Key=key)
    return (key, obj)

def download_logs():
    """
    Downloads logs from S3 using boto3, fetching each object on a thread pool.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects')
    pool = Pool(POOL_SIZE)
    results = pool.imap_unordered(get_object,
                                  [(s3, obj['Key'])
                                   for chunk in paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)
                                   for obj in chunk['Contents']])
    for result in results:
        yield result

def create_dataframe():
    cols = ['BucketOwner', 'Bucket', 'Time', 'RemoteIP', 'Requester', 'RequestID',
            'Operation', 'Key', 'RequestURI', 'HTTPStatus', 'ErrorCode', 'BytesSent',
            'ObjectSize', 'TotalTime', 'TurnAroundTime', 'Referrer', 'UserAgent', 'VersionID']
    filelist = os.listdir(LOG_DIR)
    df_list = [pd.read_table(os.path.join(LOG_DIR, log_file), sep=' ', header=None,
                             index_col=False, names=cols)
               for log_file in filelist]
    df = pd.concat(df_list)
    df.to_csv(PODCAST_DF)
    return df

def extract_lat_long(ip):
    response = requests.get(GEOIP_LOOKUP + ip)
    if response.status_code == 200:
        # Escape bare ampersands so the response parses as valid XML
        result_xml = ET.fromstring(response.content.replace("&", "&amp;"))
        lat = float(result_xml.find('./result/latitude').text)
        lng = float(result_xml.find('./result/longitude').text)
        return (lat, lng)

def geocode_ip_addresses(ips):
    pool = Pool(POOL_SIZE)
    results = pool.imap_unordered(extract_lat_long, list(ips))
    for result in results:
        yield result

def build_geoip_map(df, rebuild_df=False):
    import matplotlib.pyplot as plt
    from mpl_toolkits.basemap import Basemap
    if not os.path.exists(GEOIP_DF) or rebuild_df:
        raw_data = {'latitude': [], 'longitude': []}
        for result in geocode_ip_addresses(df.RemoteIP):
            # Skip IPs whose lookup failed (extract_lat_long returns None)
            if result is None:
                continue
            lat, lng = result
            raw_data['latitude'].append(lat)
            raw_data['longitude'].append(lng)
        geodf = pd.DataFrame(raw_data)
        geodf.to_csv(GEOIP_DF)
    else:
        geodf = pd.read_csv(GEOIP_DF)
    plt.figure(figsize=(20, 10))
    # Create a map using the Gall stereographic cylindrical projection
    bmap = Basemap(projection='gall',
                   # with low resolution,
                   resolution='l',
                   # an area threshold of 100,000 km^2,
                   area_thresh=100000.0,
                   # and centered at 0,0 (i.e. Null Island)
                   lat_0=0, lon_0=0)
    # Draw the coastlines on the map
    bmap.drawcoastlines()
    # Draw country borders on the map
    bmap.drawcountries()
    # Fill the land with grey
    bmap.fillcontinents(color='#888888')
    # Draw the map boundaries
    bmap.drawmapboundary(fill_color='#f4f4f4')
    # Define our longitude and latitude points
    # We have to use .values because of a weird bug when passing pandas data
    # to basemap.
    x, y = bmap(geodf['longitude'].values, geodf['latitude'].values)
    # Plot them using round red markers of size 6
    bmap.plot(x, y, 'ro', markersize=6)
    # Show the map
    plt.show()

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--download-logs', '-d', dest='download_logs', action='store_true',
                        help='Download S3 access logs and create/update the podcast dataframe.')
    parser.add_argument('--show-geoip-map', dest='show_geoip_map', action='store_true',
                        help='Build a map using geocoded IP addresses of all the downloaded files.')
    parser.add_argument('--check-missing-files', dest='check_missing', action='store_true',
                        help='Check whether any requested files responded with 404 Not Found.')
    args = parser.parse_args()
    log_dir_exists = os.path.exists(LOG_DIR)
    if args.download_logs or not log_dir_exists:
        args.download_logs = True
        if not log_dir_exists:
            os.mkdir(LOG_DIR)
        for key, s3_obj in download_logs():
            # Write under the script's directory so the path matches LOG_DIR
            # regardless of the current working directory
            with open(os.path.join(PWD, key), 'w') as log_file:
                log_contents = s3_obj['Body'].read()
                # Replace the timestamp brackets with quotes so that
                # pandas' csv reader can parse the field
                log_contents = log_contents.replace('[', '"').replace(']', '"')
                log_file.write(log_contents)
    dataframe_exists = os.path.exists(PODCAST_DF)
    if not dataframe_exists or args.download_logs:
        df = create_dataframe()
    else:
        df = pd.read_csv(PODCAST_DF)
    # Limit the dataframe to download operations only
    ddf = df[df.Operation == 'WEBSITE.GET.OBJECT']
    # Include only our actual podcast files
    ddf = ddf[ddf.Key.str.contains('m4a')]
    if args.check_missing:
        print('======= MISSING Podcast Download Files =======\n')
        print(ddf[ddf.HTTPStatus == 404]['Key'].value_counts())
        print()
    # Exclude any missing audio file links
    ddf = ddf[ddf.HTTPStatus != 404]
    # Display download totals for the files
    total_download_series = ddf['Key'].value_counts()
    print('======= Podcast Download Totals =======\n')
    print(total_download_series)
    print()
    # Reduce each UserAgent to its leading product token to give a basic
    # idea of the different agents in play
    ddf['SplitUA'] = ddf['UserAgent'].str.split('/').str.get(0)
    splitua_series = ddf['SplitUA'].value_counts()
    # Display the basic UserAgent breakdown
    print('======= Downloads by UserAgent =======\n')
    print(splitua_series)
    if args.show_geoip_map:
        print("Generating geoip map based on downloads...")
        build_geoip_map(ddf, rebuild_df=args.download_logs)
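
Once the script has run with -d, the CSV it writes can be reused for ad-hoc questions without touching S3 again. A minimal follow-up sketch (not part of the gist; the Time parsing assumes the standard S3 access-log timestamp format, e.g. "06/Feb/2016:00:00:38 +0000"):

import pandas as pd

# Load the saved dataframe and keep only successful podcast downloads
df = pd.read_csv('podcast_logs.csv')
df = df[(df.Operation == 'WEBSITE.GET.OBJECT') & (df.HTTPStatus != 404)]
df = df[df.Key.str.contains('m4a')]
# Parse just the date portion of each quoted timestamp
df['Date'] = pd.to_datetime(df['Time'].str[:11], format='%d/%b/%Y')
# Download counts per episode for a single month (February here)
print(df[df['Date'].dt.month == 2]['Key'].value_counts())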
@deybhayden

You would use this script in conjunction with a podcast site statically hosted on S3, with S3 access logs enabled on the same bucket that hosts your files (see the sketch after the links below):

Enable S3 Static Hosting
Enable S3 Access Logs from AWS Console
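
Both steps can also be scripted with boto3. A minimal sketch, assuming the bucket name from the script and logs delivered back into the same bucket under the script's PREFIX (the target bucket must also grant the S3 log-delivery group write access, which the console setup handles for you):

import boto3

s3 = boto3.client('s3')
# Turn on static website hosting for the bucket
s3.put_bucket_website(
    Bucket='mah-bukkit',
    WebsiteConfiguration={'IndexDocument': {'Suffix': 'index.html'}})
# Deliver access logs into the same bucket under the 'podcast_logs' prefix
s3.put_bucket_logging(
    Bucket='mah-bukkit',
    BucketLoggingStatus={'LoggingEnabled': {
        'TargetBucket': 'mah-bukkit',
        'TargetPrefix': 'podcast_logs/'}})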

@deybhayden

If you want to generate the geoip map, you'll need matplotlib and basemap installed.
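
If you want a quick sanity check that both are importable before passing --show-geoip-map, something like this works (just a convenience snippet, not part of the script):

try:
    import matplotlib.pyplot
    from mpl_toolkits.basemap import Basemap
except ImportError as err:
    print('Missing plotting dependency: {}'.format(err))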
