Created
April 2, 2014 22:46
-
-
Save hyonschu/9944874 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pickle | |
import seaborn as sns | |
import pandas as pd | |
import numpy as np | |
from collections import Counter | |
import matplotlib.pyplot as plt | |
import pymc as pm | |
import math | |
import pdb | |
# Load the review and business tables from local pickle files.
# NOTE: unpickling executes arbitrary code; only use pickles from a trusted
# source. Files are opened in binary mode (pickles are binary data) and inside
# `with` so the handles are closed as soon as loading finishes.
with open('../pickles/review.pkl', 'rb') as review_file:
    review = pickle.load(review_file)
# Parse the review dates so they can be sorted and bucketed by week later on.
review['date'] = pd.to_datetime(review['date'])

with open('../pickles/business.pkl', 'rb') as business_file:
    biz = pickle.load(business_file)
# Most-reviewed businesses first, so slicing biz picks the busiest ones.
biz = biz.sort(columns="review_count", ascending=False)
print(biz[:3])
########### SET VARIABLES HERE #############
n = 500  # restaurant samples
start = 0  # index of restaurant to start at
finish = n + start
users = 20  # how many do you want around inflection point?
# Divide by a float: under Python 2, `users*2/3` is integer division, which
# truncates before ceil/floor run and makes user0 + user1 fall short of
# `users` (13 + 6 = 19 for users = 20 instead of 14 + 6 = 20).
user0 = int(math.ceil(users * 2 / 3.0))  # users to collect BEFORE inflection
user1 = int(math.floor(users * 1 / 3.0))  # users to collect AFTER inflection
mcsamples = 10000  # MCMC iterations per business
burn = 5000  # initial MCMC iterations to discard
skip = 1  # thinning interval passed to the sampler
loop = 0  # running count of businesses processed
########### END VARIABLES
############ INITIALIZE DATA COLLECTING LISTS
tempidxes, businesses = [], []
# One entry per processed business: either its list of user ids or a
# single-item "no inflection found" message.
influencers = []
# Inflection points, one row per business; the first row is the column header.
inflectionbiz = [
    (
        'business_id',
        'inflection_id',
        'total_weeks_sampled',
        'year-week',
        'when samples taken',
    ),
]
# count: how many businesses yielded a usable inflection point.
# tsk: number of taus to skip.
count, tsk = 0, 0
########### start looping for pymcmc

# A change point within the first MIN_TAU weeks is rejected as unusable.
MIN_TAU = 3
# Stop searching after this many of the most frequently sampled taus.
MAX_TAU_CANDIDATES = 9


def _pick_inflection_index(tau_samples, n_weeks):
    """Return the most frequently sampled usable change point, or 0 if none.

    tau_samples -- sampled tau values from the MCMC trace
    n_weeks     -- number of weekly buckets for the business

    A tau is usable when MIN_TAU <= tau < n_weeks; tau == n_weeks is rejected
    because it cannot be used as a position in the weekly index. Candidates
    are tried from most to least common, at most MAX_TAU_CANDIDATES of them.
    A trace that only ever visited a single tau is treated as "no inflection".
    The search is bounded, so it can neither loop forever nor index past the
    end of the candidate list.
    """
    tally = Counter(tau_samples)
    if len(tally) == 1:
        return 0
    for tau_value, _ in tally.most_common(MAX_TAU_CANDIDATES):
        if MIN_TAU <= tau_value < n_weeks:
            return tau_value
    return 0


# biz is already sliced to [start:finish] here; do not slice again in the loop,
# otherwise a non-zero `start` would be applied twice.
businessids = list(biz['business_id'][start:finish])
print('no. of businesses = %s \nuser count = %s \ntrials = %s \nburn = %s'
      % (n, users, mcsamples, burn))

for idx, bizid in enumerate(businessids):
    # All reviews of this business, oldest first, tagged with "year-week".
    temp = review[review['business_id'] == bizid]
    temp = temp.sort(columns='date', ascending=True)
    temp['wy'] = temp['date'].apply(lambda x: x.strftime("%Y-%W"))
    asdf = temp.groupby(by='wy').count()  # per-week counts of every column
    timetemp = pd.Series(asdf.stars.values, index=asdf.index)

    ########### start pymcmc code here, thanks to @cmrn_dp !
    # count_data holds the number of reviews per week (counted through the
    # `totalnotes` column); n_count_data is the number of weeks observed.
    count_data = asdf.totalnotes.values
    n_count_data = len(asdf)
    # Prior rate for both lambdas: the inverse of the mean weekly count.
    alpha = 1.0 / count_data.mean()
    lambda_1 = pm.Exponential("lambda_1", alpha)
    lambda_2 = pm.Exponential("lambda_2", alpha)
    # tau is the week at which the review rate switches from lambda_1 to
    # lambda_2; tau == n_count_data is filtered out after sampling.
    tau = pm.DiscreteUniform("tau", lower=0, upper=n_count_data)

    @pm.deterministic
    def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
        out = np.zeros(n_count_data)
        out[:tau] = lambda_1  # lambda before tau is lambda1
        out[tau:] = lambda_2  # lambda after (and including) tau is lambda2
        return out

    observation = pm.Poisson("obs", lambda_, value=count_data, observed=True)
    model = pm.Model([observation, lambda_1, lambda_2, tau])
    mcmc = pm.MCMC(model)
    # Arguments are (iterations, burn-in, thinning); a thinning interval of 1
    # keeps every post-burn-in sample.
    mcmc.sample(mcsamples, burn, skip)
    lambda_1_samples = mcmc.trace('lambda_1')[:]
    lambda_2_samples = mcmc.trace('lambda_2')[:]
    tau_samples = mcmc.trace('tau')[15:]  # drop the first 15 retained taus
    ########### end pymcmc

    ########### start ending dataset:
    tempidx = _pick_inflection_index(tau_samples, len(asdf))

    if tempidx == 0:
        # NOTE: this message ends up alongside user ids in the flattened list
        # `c` built after the loop.
        influencers.append(['no inflection found for business %s' % bizid])
    else:
        inflection_week = asdf.index[tempidx]
        # a: reviews up to and including the inflection week, ascending by
        # week, so the users just before the inflection are the LAST rows.
        a = temp[temp['wy'] <= inflection_week]
        a = a.sort(columns='wy', ascending=True)
        # b: reviews after the inflection week, ascending by week, so the
        # users just after the inflection are the FIRST rows.
        b = temp[temp['wy'] > inflection_week]
        b = b.sort(columns='wy', ascending=True)
        businesses.append(a['business_id'][:1].values[0])
        influencers.append(list(a[-user0:]['user_id'].values)
                           + list(b[:user1]['user_id'].values))
        inflectionbiz.append([a['business_id'][:1].values[0], tempidx,
                              len(asdf), inflection_week])
        count += 1

    loop += 1
    if loop % 100 == 0:
        print("\n%s done" % loop)

# Flatten the per-business lists into a single list.
c = [item for sublist in influencers for item in sublist]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment