Skip to content

Instantly share code, notes, and snippets.

@hyonschu
Created April 2, 2014 00:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hyonschu/9925745 to your computer and use it in GitHub Desktop.
Save hyonschu/9925745 to your computer and use it in GitHub Desktop.
import pickle
import seaborn as sns
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import pymc as pm
import math
import pdb
review = pickle.load(open('../pickles/review.pkl'))
review['date'] = pd.to_datetime(review['date'])
biz = pickle.load(open('../pickles/business.pkl'))
biz = biz.sort(columns="review_count", ascending=False)
########### SET VARIABLES HERE #############
n=500 # restaurant samples
start = 0 # index of restaurant to start at
finish = n+start
users = 20 #how many do you want around inflection point?
user0 = int(math.ceil(users*2/3)) # users to collect BEFORE inflection
user1 = int(math.floor(users*1/3)) # users to collevt AFTER inflection
mcsamples = 15000
burn = 5000
skip = 3
########### END VARIABLES
############ INITIALIZE DATA COLLECTING LISTS
tempidxes=[]
influencers=[] # collects all influencers
inflectionbiz=[('business_id', 'inflection_id', 'total_weeks_sampled', 'year-week', 'when samples taken')] # collects inflection points
count = 0 #count how many samples pass
tsk = 0 # number of taus to skip
########### start looping for pymcmc
businessids = [ i for i in biz['business_id'][start:finish]]
print 'no. of businesses =', n, '\nuser count =', users, '\ntrials =', mcsamples, '\nburn =', burn
for idx, bizid in enumerate(businessids[start:finish]):
temp = review[review['business_id'] == bizid]
temp = temp.sort(columns='date', ascending=True)
temp['wy'] = temp['date'].apply(lambda x: x.strftime("%Y-%W"))
asdf = temp.groupby(by='wy').count()
timetemp = pd.Series(asdf.stars.values, index=asdf.index)
########### start pymcmc code here, thanks to @cmrn_dp !
count_data = asdf.totalnotes.values # count data keeps track of the number of reviews per day
n_count_data = len(asdf) # n_count_date is just a limit of how many days to keep track of
alpha = 1.0 / count_data.mean() # Recall count_data is the
# variable that holds our txt counts
lambda_1 = pm.Exponential("lambda_1", alpha)
lambda_2 = pm.Exponential("lambda_2", alpha)
tau = pm.DiscreteUniform("tau", lower=0, upper=n_count_data)
@pm.deterministic
def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
out = np.zeros(n_count_data)
out[:tau] = lambda_1 # lambda before tau is lambda1
out[tau:] = lambda_2 # lambda after (and including) tau is lambda2
return out
observation = pm.Poisson("obs", lambda_, value=count_data, observed=True)
model = pm.Model([observation, lambda_1, lambda_2, tau])
mcmc = pm.MCMC(model);
mcmc.sample(mcsamples, burn, skip); # set samples, burn, and how many to skip - ie 1 is "collect every other tau"
lambda_1_samples = mcmc.trace('lambda_1')[:];
lambda_2_samples = mcmc.trace('lambda_2')[:];
tau_samples = mcmc.trace('tau')[:]; # cut off point for tau
########### end pymcmc
########### start ending dataset:
tempidx = Counter(tau_samples).most_common(1)[0][0] # what is the most common tau?
# if tempidx == len(asdf):
# tempidx=tempidx-1
#print tempidx, len(asdf)
i=1
while tempidx > 2 and tempidx <= len(asdf):
i += 1
tempidx = Counter(tau_samples).most_common(i)[i-1][0]
print 'adhksfv adjksh fjk'
if tempidx == len(asdf):
inflectionbiz.append([asdf['business_id'].values[0], "tau is at the end of reviews"])
elif whiletemp
else:
a = temp[temp['wy'] <= asdf.index[tempidx]]
a = a.sort(columns='wy', ascending=True)#[-user0:] #dates are descending. you want the last 10
b = temp[temp['wy'] > asdf.index[tempidx]]
b = b.sort(columns='wy', ascending=True)#[:user1] #dates are descending. you want the first 5
influencers.append([ i for i in a[-user0:]['user_id']])
influencers.append([ i for i in a[:user1]['user_id']])
count += 1
inflectionbiz.append(((asdf['business_id'].values[0]), "tau is at the beginning of reviews"))
else:
i = 1
while tempidx < 2 and tempidx >= len(asdf):
i += 1
tempidx = Counter(tau_samples).most_common(i)[i-1][0]
a = temp[temp['wy'] <= asdf.index[tempidx]]
a = a.sort(columns='wy', ascending=True)#[-user0:] #dates are descending. you want the last 10
b = temp[temp['wy'] > asdf.index[tempidx]]
b = b.sort(columns='wy', ascending=True)#[:user1] #dates are descending. you want the first 5
influencers.append([ i for i in a[-user0:]['user_id']])
influencers.append([ asdf.index[tempidx], temp['wy']
])
influencers.append([ i for i in a[:user1]['user_id']])
count += 1
inflectionbiz.append([ a['business_id'].values[0], tempidx, len(asdf), asdf.index[tempidx], "before inflection" ])
# if tempidx == len(asdf):
# elif tempidx == 0 or tempidx == 1:
# tempidx = Counter(tau_samples).most_common(2)[1][0]
# if tempidx == len(asdf):
# inflectionbiz.append([asdf['business_id'].values[0], "tau is at the end of reviews"])
# else:
# else:
# a = temp[temp['wy'] <= asdf.index[tempidx]]
# a = a.sort(columns='wy', ascending=True)#[-user0:] #dates are descending. you want the last 10
# b = temp[temp['wy'] > asdf.index[tempidx]]
# b = b.sort(columns='wy', ascending=True)#[:user1] #dates are descending. you want the first 5
# influencers.append([ i for i in a[-user0:]['user_id']])
# influencers.append([ i for i in a[:user1]['user_id']])
# count += 1
# try:
# # collect samples before inflection
# for idx in range(len(a)):
# influencers.append([a.iloc[idx]['user_id'], pd.to_datetime(a.iloc[idx]['date']).strftime("%Y-%W")])
# inflectionbiz.append(\
# [ a['business_id'].values[0], tempidx, len(asdf), asdf.index[tempidx], "before inflection" ])
# except:
# inflectionbiz.append([ a['business_id'].values[0], "no samples before tau"])
# try:
# #collect samples after inflection
# for i in range(len(b)):
# influencers.append([b.iloc[i]['user_id'], pd.to_datetime(b.iloc[i]['date']).strftime("%Y-%W")])
# inflectionbiz.append([ b['business_id'].values[0], tempidx, len(asdf), asdf.index[tempidx], "after inflection" ])
# except:
# inflectionbiz.append([a['business_id'].values[0], "no samples after tau"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment