Skip to content

Instantly share code, notes, and snippets.

@hyonschu
Created March 28, 2014 01:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hyonschu/9823465 to your computer and use it in GitHub Desktop.
Save hyonschu/9823465 to your computer and use it in GitHub Desktop.
import pickle
import seaborn as sns
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import pymc as pm
def _load_pickle(path):
    """Return the object unpickled from the file at *path*.

    The file is opened in binary mode (what pickle expects) and is closed
    as soon as loading finishes.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Review table; the 'date' column is converted to datetimes so it can be
# sorted and compared chronologically later on.
review = _load_pickle('pickles/review.pkl')
review['date'] = pd.to_datetime(review['date'])

# Business table ordered by review_count, largest first.
# DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
biz = _load_pickle('pickles/business.pkl')
biz = biz.sort_values(by="review_count", ascending=False)

# Ids of the first 500 businesses in that ordering.
businessids = list(biz['business_id'][:500])

# Filled in by the per-business loop that follows.
influencers = []
# Not used in the visible part of the script.
allreviews = []

# Window of `businessids` to process: businessids[start:finish].
start = 0
n = 50
finish = start + n
# For each selected business, fit a single-switch-point Poisson model (PyMC 2)
# to its reviews-per-date counts, take the most frequently sampled switch
# position, and record reviewers around it in `influencers`:
#   * up to the 10 earliest reviewers dated on or before the switch date
#   * up to the 5 latest reviewers dated after it
# When no switch date can be determined, one placeholder string is recorded
# for the business instead.
for bizid in businessids[start:finish]:
    # All reviews of this business, oldest first.
    temp = review[review['business_id'] == bizid]
    temp = temp.sort_values(by='date')

    # Non-null 'stars' entries per distinct review date, indexed by date
    # (i.e. reviews per day; dates without any review do not appear).
    timetemp = temp.groupby('date')['stars'].count()
    # timetemp.plot()

    count_data = timetemp.values      # reviews per date
    n_count_data = len(count_data)    # number of distinct review dates

    if n_count_data == 0:
        # No reviews for this business: the mean below would be undefined,
        # so record the placeholder and move on.
        influencers.append("not enough samples for business % s" % (bizid))
        continue

    # Exponential priors on both rates, parameterised by the inverse of the
    # observed mean count.
    alpha = 1.0 / count_data.mean()
    lambda_1 = pm.Exponential("lambda_1", alpha)
    lambda_2 = pm.Exponential("lambda_2", alpha)
    # Switch position, expressed as an index into the per-date counts.
    tau = pm.DiscreteUniform("tau", lower=0, upper=n_count_data)

    @pm.deterministic
    def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
        """Per-date Poisson rate: lambda_1 before tau, lambda_2 from tau on."""
        out = np.zeros(n_count_data)
        out[:tau] = lambda_1
        out[tau:] = lambda_2
        return out

    observation = pm.Poisson("obs", lambda_, value=count_data, observed=True)
    model = pm.Model([observation, lambda_1, lambda_2, tau])
    mcmc = pm.MCMC(model)
    # 40000 iterations, 10000 burn-in, thinning of 3.
    mcmc.sample(40000, 10000, 3)

    # Rate traces are not used further in this section.
    lambda_1_samples = mcmc.trace('lambda_1')[:]
    lambda_2_samples = mcmc.trace('lambda_2')[:]
    # Drop the first 15 retained tau samples, then take the most common value.
    tau_samples = mcmc.trace('tau')[15:]
    tempidx = int(Counter(tau_samples).most_common(1)[0][0])

    if tempidx >= n_count_data:
        # tau may land on n_count_data (the upper bound passed above), which
        # is past the last date, so there is no switch date to split on.
        influencers.append("not enough samples for business % s" % (bizid))
        continue

    # tempidx is a position in the per-date series, so the switch date must
    # be read from timetemp, not from the per-review frame `temp`.
    switch_date = timetemp.index[tempidx]
    print("  %s %s" % (switch_date, tempidx))

    a = temp[temp['date'] <= switch_date]
    influencers.extend(a['user_id'].iloc[:10])

    b = temp[temp['date'] > switch_date]
    influencers.extend(b['user_id'].iloc[-5:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment