@clemesha
Created September 24, 2011 20:33
import sys
import time
import json
from glob import glob
from urlparse import urlparse
from itertools import groupby
from collections import defaultdict
DAY = 15
fileName = "usagov_bitly"

def parse_dayslice(dataslice_file_name):
    """Parse one slice file of usa.gov bitly click data (one JSON object per line)."""
    all_data = []
    datapoints_dayslice = open(dataslice_file_name).read().split("\n")
    win, fail = 0, 0
    domain_count = defaultdict(int)
    urls_for_domain = defaultdict(list)
    for datapoint in datapoints_dayslice:
        try:
            entry = json.loads(datapoint)
            url = entry["u"]
            brows = entry["a"]
            timestamp = entry["t"]
            try:
                lat, lon = entry["ll"]
                win += 1
            except (KeyError, TypeError, ValueError):
                lat, lon = 9999, 9999  # sentinel value for entries with no lat, lon
                fail += 1
            # crude numeric fingerprint of the user-agent string
            browsval = 0
            for i in brows:
                browsval += ord(i)
            parsed_result = urlparse(url)
            root_domain = ".".join(parsed_result.netloc.split(".")[-2:])
            domain_count[root_domain] += 1
            urls_for_domain[root_domain + "__urls"].append(url)
            all_data.append([float(timestamp), root_domain, url, lat, lon, browsval])
        except ValueError:
            pass  # skip lines that are not valid JSON (e.g. the blank trailing line)
        except KeyError:
            pass  # skip entries missing "u", "a", or "t" (e.g. heartbeat records)
    #print "win, fail: ", win, fail
    return all_data, domain_count, urls_for_domain

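# For reference, a single line of the usa.gov bitly click stream looks roughly like
# the abbreviated, hypothetical record below -- parse_dayslice() only uses the
# "u" (long URL), "a" (user agent), "t" (timestamp), and "ll" (lat/lon) fields:
#
#   {"a": "Mozilla/5.0 ...", "u": "http://www.nasa.gov/some/page.html",
#    "t": 1311499200, "ll": [34.05, -118.24], ...}
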
def _ordered_urls_for_root_domain(root_domain, data):
    """Return only the entries for `root_domain`, sorted by timestamp."""
    result = []
    for (timestamp, root, url, lat, lon, browsval) in sorted(data, key=lambda x: x[0]):
        if root == root_domain:
            result.append([timestamp, root, url, lat, lon, browsval])
    return result

def full_day_ordered_urls_for_root_domain(root_domain, full_day_datafile_regex):
    """ Ordered by 'timestamp'. """
    result = []
    all_dataslice_files = sorted(glob(full_day_datafile_regex))
    for filename in all_dataslice_files:
        print "PROCESSING: ", filename, "..."
        slice_data, domain_count, urls_for_domain = parse_dayslice(filename)
        slice_data_ordered = _ordered_urls_for_root_domain(root_domain, slice_data)
        result.extend(slice_data_ordered)
    return result

def top_urls_for_day_and_root_domain(data, topn=10):
    """Return the `topn` most-clicked URLs as (url, count) pairs, most clicks first."""
    only_urls = [e[2] for e in data]
    unique_urls = set(only_urls)
    counts = [(e, only_urls.count(e)) for e in unique_urls]
    result = sorted(counts, key=lambda x: x[1], reverse=True)
    return result[:topn]

def bin_data(full_day_data_topn_only, topn_urls, bins):
    """Count clicks per URL in fixed-width time bins across the day."""
    result = defaultdict(list)
    delta = (24 * 60 * 60) / float(bins)  # ~21 seconds per bin when bins=4096
    start_timestamp = full_day_data_topn_only[0][0]
    current_bin_max_timestamp = start_timestamp + delta
    url_count_for_bin = dict([(url, 0) for url in topn_urls])  # reset
    for current_data_element in full_day_data_topn_only:
        current_data_timestamp = current_data_element[0]
        # flush every completed bin, emitting zeros for bins with no clicks
        while current_data_timestamp > current_bin_max_timestamp:
            for (url, count) in url_count_for_bin.iteritems():
                result[url].append(count)
            url_count_for_bin = dict([(url, 0) for url in topn_urls])  # reset
            current_bin_max_timestamp += delta
        url = current_data_element[2]
        url_count_for_bin[url] += 1
    # flush the final (possibly partial) bin
    for (url, count) in url_count_for_bin.iteritems():
        result[url].append(count)
    return result

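# bin_data() returns a mapping from each top URL to its list of per-bin click
# counts, e.g. (illustrative values only):
#   {"http://www.nasa.gov/...": [3, 0, 1, 2, ...], ...}
# With bins=4096 the day is sliced into 4096 windows of roughly 21 seconds each
# (86400 / 4096 = 21.09 s).
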
def write_csound_files(data, root_domain, date_str, bins):
    """Write one Csound score file per URL: an f-statement of normalized click counts."""
    for (url, clicks) in data.iteritems():
        fname = date_str + "-" + root_domain + "-" + url.replace("/", "_") + ".cso"
        fhandle = open(fname, "w")
        max_clicks = max(clicks) or 1  # avoid division by zero for all-zero bins
        normalized_clicks = [str(float(click) / float(max_clicks)) for click in clicks]
        clicks_str = " ".join(normalized_clicks)
        # function table 1, start time 0, size 4096, GEN routine 2, then the data values
        fhandle.write("f 1 0 4096 2 " + clicks_str)
        fhandle.close()

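# Each generated .cso file holds a single Csound function-table statement; for a
# run with the defaults it would look something like (values illustrative):
#
#   f 1 0 4096 2 0.0 0.25 1.0 0.5 ...
#
# i.e. table number 1, start time 0, table size 4096, GEN routine 2 (which fills
# the table from the listed values), followed by the normalized per-bin counts.
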
if __name__ == "__main__":
    print sys.argv
    if len(sys.argv) == 5:
        root_domain, topn, date_str, bins = sys.argv[1:]
        print "Using root_domain='%s', topn='%s', date_str='%s' bins='%s'" % (root_domain, topn, date_str, bins)
    else:
        root_domain, topn, date_str, bins = "nasa.gov", "5", "2011-07-25", "4096"
        print "[DEFAULT] Using root_domain='%s', topn='%s', date_str='%s' bins='%s'" % (root_domain, topn, date_str, bins)
    topn, bins = int(topn), int(bins)
    time.sleep(2)

    glob_str = "usagov_bitly_data%s-*" % date_str
    _full_day_data = full_day_ordered_urls_for_root_domain(root_domain, glob_str)
    _top_urls_count = top_urls_for_day_and_root_domain(_full_day_data, topn=topn)
    _top_urls = [e[0] for e in _top_urls_count]
    _full_day_data_topn_only = []
    print "top_urls: ", _top_urls, "\n"
    for e in _full_day_data:
        if e[2] in _top_urls:
            _full_day_data_topn_only.append(e)
    print "full_day_data length: ", len(_full_day_data)
    print "full_day_data_topn_only length: ", len(_full_day_data_topn_only)
    time.sleep(1)

    final_data = bin_data(_full_day_data_topn_only, _top_urls, bins)
    write_csound_files(final_data, root_domain, date_str, bins)
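
# Example invocation (assuming the script is saved as e.g. usagov_csound.py and
# hourly slice files named "usagov_bitly_data2011-07-25-*" sit in the working
# directory; Python 2 is required for the print statements and urlparse import):
#
#   python usagov_csound.py nasa.gov 5 2011-07-25 4096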