Skip to content

Instantly share code, notes, and snippets.

@greeness
Last active August 29, 2015 14:02
Show Gist options
  • Save greeness/de77fb847c1317fc256e to your computer and use it in GitHub Desktop.
Save greeness/de77fb847c1317fc256e to your computer and use it in GitHub Desktop.
step 1.
from datetime import datetime
import numpy as np
# Canonical age buckets; regularized_age_range() maps every input onto one of these.
age_ranges = ['13-24', '25-35', '36-52', '53-64']
# Device labels accepted as-is by regularize_device().
device_options = ['Phone', 'Tablet']
# Platform codes accepted as-is by regularized_platform()
# (presumably 'a' = Android and 'i' = iOS -- confirm against the data source).
platform_options = ['a', 'i']
def weighted_choice(weights):
    """Pick a random index with probability proportional to its weight.

    Args:
        weights: Sequence of non-negative numbers, one per candidate.

    Returns:
        An index into ``weights`` (a NumPy integer). Index ``i`` is drawn
        with probability ``weights[i] / sum(weights)``.

    Raises:
        ValueError: If ``weights`` is empty or its total is not positive,
            since no distribution can be formed from it.
    """
    totals = np.cumsum(weights)
    if totals.size == 0 or totals[-1] <= 0:
        raise ValueError("weights must be non-empty with a positive total")
    norm = totals[-1]
    # rand() is uniform on [0, 1), so throw lies in [0, norm).
    throw = np.random.rand() * norm
    # side='right' gives bucket i the half-open interval
    # [totals[i-1], totals[i]); a draw that lands exactly on a boundary
    # (including 0.0) therefore never selects a zero-weight entry.
    return np.searchsorted(totals, throw, side='right')
def regularize_device(device):
    """Normalise a raw device field to one of ``device_options``.

    Surrounding whitespace is removed first. A value that is already a
    known option is returned unchanged; anything else is replaced by a
    randomly drawn option using fixed weights (presumably the observed
    Phone/Tablet counts -- confirm against the data source).
    """
    cleaned = device.strip()
    if cleaned not in device_options:
        # Unknown or blank label: impute one at random.
        picked = weighted_choice([64489, 17647])
        return device_options[picked]
    return cleaned
def regularized_age_range(age):
    """Normalise a raw age-range field to one of ``age_ranges``.

    Canonical buckets pass through untouched. The two finer-grained
    buckets '13-18' and '19-24' are folded into '13-24'. Every other value
    is replaced by a randomly drawn canonical bucket using fixed weights
    (presumably observed per-bucket counts -- confirm against the data
    source).
    """
    if age not in age_ranges:
        if age in ('13-18', '19-24'):
            return '13-24'
        # Unknown or blank age: impute a bucket at random.
        bucket = weighted_choice([7567, 16106, 28178, 24051])
        return age_ranges[bucket]
    return age
def regularized_platform(platform):
    """Normalise a raw platform field to one of ``platform_options``.

    A value that is already a known platform code is returned unchanged.
    Anything else is replaced by a randomly drawn code using fixed weights
    (presumably the observed per-platform counts -- confirm against the
    data source).
    """
    if platform in platform_options:
        return platform
    # Index the list of valid codes, not the input string: indexing
    # ``platform`` itself returned a stray character or raised IndexError
    # for short/empty values.
    return platform_options[weighted_choice([51215, 32184])]
# user_id -> normalised demographic record, built from demo_uniq.csv.
users = {}
# Install dates are stored as whole-day offsets from this reference date;
# it is parsed once here instead of on every row.
install_epoch = datetime.strptime("1/1/14", "%m/%d/%y")
with open('demo_uniq.csv') as demo_file:
    for line in demo_file:
        columns = line.strip().split(',')
        if len(columns) != 7:
            # Wrong column count: echo the raw row and move on.
            print(line)
            continue
        user_id, install_date, platform, provider, gender, age_range, device = columns
        try:
            # Key order and call order match the original so that seeded
            # random imputation draws stay in the same sequence.
            user = {
                "install": (datetime.strptime(install_date, "%m/%d/%y") - install_epoch).days,
                "platform": regularized_platform(platform),
                "gender": 'UNKNOWN' if gender == '' else gender,
                "device": regularize_device(device),
                "age": regularized_age_range(age_range),
            }
        except (ValueError, IndexError):
            # ValueError: install date does not match %m/%d/%y (e.g. a
            # header row). IndexError: a field normaliser could not index
            # a short/empty value. Skip the row, but say so rather than
            # dropping it silently.
            print("skipping row with unparseable fields: %s" % line.strip())
            continue
        users[user_id] = user
# Attach the payer flag from payer_uniq.csv to users already loaded above.
with open('payer_uniq.csv') as payer_file:
    for line in payer_file:
        try:
            user_id, is_payer = line.strip().split(',')
            if user_id in users:
                users[user_id]["is_payer"] = int(is_payer)
                print(users[user_id])
        except ValueError:
            # Raised when the row does not have exactly two fields or the
            # flag is not an integer (e.g. a header row); such rows are
            # skipped on purpose.
            continue
import json
# Write the merged user records; the with-block guarantees the file is
# flushed and closed even if serialisation fails part-way.
with open('users.json', 'w') as out_file:
    json.dump(users, out_file, indent=2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment