You can use this Python script (plus pandas) to analyze file download counts.
#!/usr/bin/env python
from __future__ import print_function
import os
import xml.etree.ElementTree as ET
from multiprocessing.dummy import Pool

import boto3
import requests
import pandas as pd

BUCKET = 'mah-bukkit'
PREFIX = 'podcast_logs'
PWD = os.path.dirname(__file__)
LOG_DIR = os.path.abspath(os.path.join(PWD, 'podcast_logs'))
PODCAST_DF = os.path.abspath(os.path.join(PWD, 'podcast_logs.csv'))
GEOIP_DF = os.path.abspath(os.path.join(PWD, 'geoip.csv'))
GEOIP_LOOKUP = 'http://api.geoiplookup.net/?query='
POOL_SIZE = 20  # 20 threads in the pool. Tweak this as you see fit.

def get_object(work_tuple):
    """Fetch a single log object from S3."""
    (s3, key) = work_tuple
    obj = s3.get_object(Bucket=BUCKET, Key=key)
    return (key, obj)

def download_logs():
    """
    List every log object under PREFIX and fetch them in parallel,
    yielding (key, object) tuples as they arrive.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects')
    pool = Pool(POOL_SIZE)
    results = pool.imap_unordered(get_object,
                                  [(s3, obj['Key'])
                                   for chunk in paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)
                                   for obj in chunk['Contents']])
    for result in results:
        yield result

def create_dataframe():
    """Parse the downloaded log files into one dataframe and cache it as CSV."""
    # Field names of the S3 server access log format.
    cols = ['BucketOwner', 'Bucket', 'Time', 'RemoteIP', 'Requester', 'RequestID',
            'Operation', 'Key', 'RequestURI', 'HTTPStatus', 'ErrorCode', 'BytesSent',
            'ObjectSize', 'TotalTime', 'TurnAroundTime', 'Referrer', 'UserAgent', 'VersionID']
    filelist = os.listdir(LOG_DIR)
    df_list = [pd.read_table(os.path.join(LOG_DIR, log_file), sep=' ', header=None,
                             index_col=False, names=cols)
               for log_file in filelist]
    df = pd.concat(df_list)
    df.to_csv(PODCAST_DF)
    return df

def extract_lat_long(ip):
    """Geocode one IP address; returns a (lat, lng) tuple, or None on failure."""
    response = requests.get(GEOIP_LOOKUP + ip)
    if response.status_code == 200:
        # The lookup API returns XML with unescaped ampersands, so escape
        # them before parsing.
        result_xml = ET.fromstring(response.text.replace('&', '&amp;'))
        lat = float(result_xml.find('./result/latitude').text)
        lng = float(result_xml.find('./result/longitude').text)
        return (lat, lng)

def geocode_ip_addresses(ips):
    """Geocode IP addresses in parallel, yielding results as they arrive."""
    pool = Pool(POOL_SIZE)
    results = pool.imap_unordered(extract_lat_long, ips)
    for result in results:
        yield result

def build_geoip_map(df, rebuild_df=False):
    import matplotlib.pyplot as plt
    from mpl_toolkits.basemap import Basemap

    if not os.path.exists(GEOIP_DF) or rebuild_df:
        raw_data = {'latitude': [], 'longitude': []}
        for result in geocode_ip_addresses(df.RemoteIP):
            # Skip IPs the lookup service couldn't geocode.
            if result is None:
                continue
            lat, lng = result
            raw_data['latitude'].append(lat)
            raw_data['longitude'].append(lng)
        geodf = pd.DataFrame(raw_data)
        geodf.to_csv(GEOIP_DF)
    else:
        geodf = pd.read_csv(GEOIP_DF)

    plt.figure(figsize=(20, 10))
    # Create a map using the Gall stereographic projection
    bmap = Basemap(projection='gall',
                   # with low resolution
                   resolution='l',
                   # skipping coastline features smaller than 100,000 km^2
                   area_thresh=100000.0,
                   # centered at 0,0 (i.e. Null Island)
                   lat_0=0, lon_0=0)
    # Draw the coastlines on the map
    bmap.drawcoastlines()
    # Draw country borders on the map
    bmap.drawcountries()
    # Fill the land with grey
    bmap.fillcontinents(color='#888888')
    # Draw the map boundaries
    bmap.drawmapboundary(fill_color='#f4f4f4')
    # Define our longitude and latitude points. We have to use .values
    # because of a weird bug when passing pandas data to basemap.
    x, y = bmap(geodf['longitude'].values, geodf['latitude'].values)
    # Plot them using round red markers of size 6
    bmap.plot(x, y, 'ro', markersize=6)
    # Show the map
    plt.show()

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--download-logs', '-d', dest='download_logs', action='store_true',
                        help='Download S3 access logs and create/update the podcast dataframe.')
    parser.add_argument('--show-geoip-map', dest='show_geoip_map', action='store_true',
                        help='Build a map using geocoded IP addresses of all the downloaded files.')
    parser.add_argument('--check-missing-files', dest='check_missing', action='store_true',
                        help='Check whether any requested files responded with 404 Not Found.')
    args = parser.parse_args()

    log_dir_exists = os.path.exists(LOG_DIR)
    if args.download_logs or not log_dir_exists:
        args.download_logs = True
        if not log_dir_exists:
            os.mkdir(LOG_DIR)
        for key, s3_obj in download_logs():
            # Keys carry the PREFIX, so this writes into LOG_DIR.
            with open(os.path.join(PWD, key), 'w') as log_file:
                log_contents = s3_obj['Body'].read().decode('utf-8')
                # Replace the timestamp brackets with quotes so that the
                # pandas CSV reader can parse the field.
                log_contents = log_contents.replace('[', '"').replace(']', '"')
                log_file.write(log_contents)

    dataframe_exists = os.path.exists(PODCAST_DF)
    if not dataframe_exists or args.download_logs:
        df = create_dataframe()
    else:
        df = pd.read_csv(PODCAST_DF)

    # Limit the dataframe to download operations only
    ddf = df[df.Operation == 'WEBSITE.GET.OBJECT']
    # Include only our actual podcast files
    ddf = ddf[ddf.Key.str.contains('m4a')]

    if args.check_missing:
        print('======= MISSING Podcast Download Files =======\n')
        print(ddf[ddf.HTTPStatus == 404]['Key'].value_counts())
        print()

    # Exclude any missing audio file links
    ddf = ddf[ddf.HTTPStatus != 404]

    # Display download totals for the files
    total_download_series = ddf['Key'].value_counts()
    print('======= Podcast Download Totals =======\n')
    print(total_download_series)
    print()

    # Reduce each UserAgent to the part before the first slash to give a
    # basic idea of the different agents
    ddf['SplitUA'] = ddf['UserAgent'].str.split('/').str.get(0)
    splitua_series = ddf['SplitUA'].value_counts()
    # Display the basic UserAgent breakdown
    print('======= Downloads by UserAgent =======\n')
    print(splitua_series)

    if args.show_geoip_map:
        print("Generating geoip map based on downloads...")
        build_geoip_map(ddf, rebuild_df=args.download_logs)
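
Typical invocations, assuming your AWS credentials are somewhere boto3 can find them and you have edited BUCKET and PREFIX to match your setup (the filename podcast_stats.py is just a placeholder; name the script whatever you like):

    python podcast_stats.py --download-logs        # fetch logs, rebuild the dataframe, print totals
    python podcast_stats.py --check-missing-files  # also report requests that returned 404
    python podcast_stats.py --show-geoip-map       # geocode the IPs and plot them with basemap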
You would use this script in conjunction with a podcast site statically hosted on S3, with S3 access logging enabled on the same bucket that hosts your files. These guides cover the console setup (a scripted alternative follows below):
Enable S3 Static Hosting
Enable S3 Access Logs from AWS Console
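
If you would rather script that setup than click through the console, a minimal boto3 sketch along these lines should work. The bucket name and prefix mirror the constants in the script above; logging to the same bucket you serve from, as described here, requires granting the S3 log delivery group write access first:

import boto3

s3 = boto3.client('s3')

# Serve the bucket as a static website.
s3.put_bucket_website(
    Bucket='mah-bukkit',
    WebsiteConfiguration={'IndexDocument': {'Suffix': 'index.html'}})

# Let the S3 log delivery group write into the bucket, then enable
# access logging under the prefix the analysis script expects.
s3.put_bucket_acl(Bucket='mah-bukkit', ACL='log-delivery-write')
s3.put_bucket_logging(
    Bucket='mah-bukkit',
    BucketLoggingStatus={
        'LoggingEnabled': {
            'TargetBucket': 'mah-bukkit',
            'TargetPrefix': 'podcast_logs/',
        }
    })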