You can use this Python script (plus pandas) to analyze file download counts.
#!/usr/bin/env python
from __future__ import print_function
import os
import xml.etree.ElementTree as ET
from multiprocessing.dummy import Pool

import boto3
import requests
import pandas as pd

BUCKET = 'mah-bukkit'
PREFIX = 'podcast_logs'
PWD = os.path.dirname(__file__)
LOG_DIR = os.path.abspath(os.path.join(PWD, 'podcast_logs'))
PODCAST_DF = os.path.abspath(os.path.join(PWD, 'podcast_logs.csv'))
GEOIP_DF = os.path.abspath(os.path.join(PWD, 'geoip.csv'))
GEOIP_LOOKUP = 'http://api.geoiplookup.net/?query='
POOL_SIZE = 20  # 20 threads in the pool. Tweak this as you see fit.

def get_object(work_tuple):
    """Fetch a single log object from S3."""
    (s3, key) = work_tuple
    obj = s3.get_object(Bucket=BUCKET, Key=key)
    return (key, obj)

def download_logs():
    """
    List every log object under PREFIX and fetch them in parallel,
    yielding (key, object) tuples as they arrive.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects')
    pool = Pool(POOL_SIZE)
    results = pool.imap_unordered(get_object,
                                  [(s3, obj['Key'])
                                   for chunk in paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)
                                   for obj in chunk['Contents']])
    for result in results:
        yield result

def create_dataframe():
    """Parse the downloaded log files into one dataframe and cache it as CSV."""
    # Field names of the S3 server access log format.
    cols = ['BucketOwner', 'Bucket', 'Time', 'RemoteIP', 'Requester', 'RequestID',
            'Operation', 'Key', 'RequestURI', 'HTTPStatus', 'ErrorCode', 'BytesSent',
            'ObjectSize', 'TotalTime', 'TurnAroundTime', 'Referrer', 'UserAgent', 'VersionID']
    filelist = os.listdir(LOG_DIR)
    df_list = [pd.read_table(os.path.join(LOG_DIR, log_file), sep=' ', header=None,
                             index_col=False, names=cols)
               for log_file in filelist]
    df = pd.concat(df_list)
    df.to_csv(PODCAST_DF)
    return df

def extract_lat_long(ip):
    """Geocode one IP address; returns a (lat, lng) tuple, or None on failure."""
    response = requests.get(GEOIP_LOOKUP + ip)
    if response.status_code == 200:
        # The lookup API returns XML with unescaped ampersands, so escape
        # them before parsing.
        result_xml = ET.fromstring(response.text.replace('&', '&amp;'))
        lat = float(result_xml.find('./result/latitude').text)
        lng = float(result_xml.find('./result/longitude').text)
        return (lat, lng)

def geocode_ip_addresses(ips):
    """Geocode IP addresses in parallel, yielding results as they arrive."""
    pool = Pool(POOL_SIZE)
    results = pool.imap_unordered(extract_lat_long, ips)
    for result in results:
        yield result

def build_geoip_map(df, rebuild_df=False):
    import matplotlib.pyplot as plt
    from mpl_toolkits.basemap import Basemap

    if not os.path.exists(GEOIP_DF) or rebuild_df:
        raw_data = {'latitude': [], 'longitude': []}
        for result in geocode_ip_addresses(df.RemoteIP):
            # Skip IPs the lookup service couldn't geocode.
            if result is None:
                continue
            lat, lng = result
            raw_data['latitude'].append(lat)
            raw_data['longitude'].append(lng)
        geodf = pd.DataFrame(raw_data)
        geodf.to_csv(GEOIP_DF)
    else:
        geodf = pd.read_csv(GEOIP_DF)

    plt.figure(figsize=(20, 10))
    # Create a map using the Gall stereographic projection
    bmap = Basemap(projection='gall',
                   # with low resolution
                   resolution='l',
                   # skipping coastline features smaller than 100,000 km^2
                   area_thresh=100000.0,
                   # centered at 0,0 (i.e. Null Island)
                   lat_0=0, lon_0=0)
    # Draw the coastlines on the map
    bmap.drawcoastlines()
    # Draw country borders on the map
    bmap.drawcountries()
    # Fill the land with grey
    bmap.fillcontinents(color='#888888')
    # Draw the map boundaries
    bmap.drawmapboundary(fill_color='#f4f4f4')
    # Define our longitude and latitude points. We have to use .values
    # because of a weird bug when passing pandas data to basemap.
    x, y = bmap(geodf['longitude'].values, geodf['latitude'].values)
    # Plot them using round red markers of size 6
    bmap.plot(x, y, 'ro', markersize=6)
    # Show the map
    plt.show()

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--download-logs', '-d', dest='download_logs', action='store_true',
                        help='Download S3 access logs and create/update the podcast dataframe.')
    parser.add_argument('--show-geoip-map', dest='show_geoip_map', action='store_true',
                        help='Build a map using geocoded IP addresses of all the downloaded files.')
    parser.add_argument('--check-missing-files', dest='check_missing', action='store_true',
                        help='Check whether any requested files responded with 404 Not Found.')
    args = parser.parse_args()

    log_dir_exists = os.path.exists(LOG_DIR)
    if args.download_logs or not log_dir_exists:
        args.download_logs = True
        if not log_dir_exists:
            os.mkdir(LOG_DIR)
        for key, s3_obj in download_logs():
            # Keys carry the PREFIX, so this writes into LOG_DIR.
            with open(os.path.join(PWD, key), 'w') as log_file:
                log_contents = s3_obj['Body'].read().decode('utf-8')
                # Replace the timestamp brackets with quotes so that the
                # pandas CSV reader can parse the field.
                log_contents = log_contents.replace('[', '"').replace(']', '"')
                log_file.write(log_contents)

    dataframe_exists = os.path.exists(PODCAST_DF)
    if not dataframe_exists or args.download_logs:
        df = create_dataframe()
    else:
        df = pd.read_csv(PODCAST_DF)

    # Limit the dataframe to download operations only
    ddf = df[df.Operation == 'WEBSITE.GET.OBJECT']
    # Include only our actual podcast files
    ddf = ddf[ddf.Key.str.contains('m4a')]

    if args.check_missing:
        print('======= MISSING Podcast Download Files =======\n')
        print(ddf[ddf.HTTPStatus == 404]['Key'].value_counts())
        print()

    # Exclude any missing audio file links
    ddf = ddf[ddf.HTTPStatus != 404]

    # Display download totals for the files
    total_download_series = ddf['Key'].value_counts()
    print('======= Podcast Download Totals =======\n')
    print(total_download_series)
    print()

    # Reduce each UserAgent to the part before the first slash to give a
    # basic idea of the different agents
    ddf['SplitUA'] = ddf['UserAgent'].str.split('/').str.get(0)
    splitua_series = ddf['SplitUA'].value_counts()
    # Display the basic UserAgent breakdown
    print('======= Downloads by UserAgent =======\n')
    print(splitua_series)

    if args.show_geoip_map:
        print("Generating geoip map based on downloads...")
        build_geoip_map(ddf, rebuild_df=args.download_logs)
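
Typical invocations, assuming your AWS credentials are somewhere boto3 can find them and you have edited BUCKET and PREFIX to match your setup (the filename podcast_stats.py is just a placeholder; name the script whatever you like):

    python podcast_stats.py --download-logs        # fetch logs, rebuild the dataframe, print totals
    python podcast_stats.py --check-missing-files  # also report requests that returned 404
    python podcast_stats.py --show-geoip-map       # geocode the IPs and plot them with basemap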
You would use this script in conjunction with a podcast site statically hosted on S3, with S3 access logging enabled on the same bucket that hosts your files. These guides cover the console setup (a scripted alternative follows below):
Enable S3 Static Hosting
Enable S3 Access Logs from AWS Console
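
If you would rather script that setup than click through the console, a minimal boto3 sketch along these lines should work. The bucket name and prefix mirror the constants in the script above; logging to the same bucket you serve from, as described here, requires granting the S3 log delivery group write access first:

import boto3

s3 = boto3.client('s3')

# Serve the bucket as a static website.
s3.put_bucket_website(
    Bucket='mah-bukkit',
    WebsiteConfiguration={'IndexDocument': {'Suffix': 'index.html'}})

# Let the S3 log delivery group write into the bucket, then enable
# access logging under the prefix the analysis script expects.
s3.put_bucket_acl(Bucket='mah-bukkit', ACL='log-delivery-write')
s3.put_bucket_logging(
    Bucket='mah-bukkit',
    BucketLoggingStatus={
        'LoggingEnabled': {
            'TargetBucket': 'mah-bukkit',
            'TargetPrefix': 'podcast_logs/',
        }
    })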