hyonschu/gist:9900461

## gistfile1.txt
import pickle
import seaborn as sns
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import pymc as pm
import math
import pdb

# Load the pickled review and business tables.
# NOTE(review): unpickling can execute arbitrary code; only point these paths
# at pickle files you produced yourself.
with open('../pickles/review.pkl', 'rb') as review_file:
    review = pickle.load(review_file)
review['date'] = pd.to_datetime(review['date'])

with open('../pickles/business.pkl', 'rb') as biz_file:
    biz = pickle.load(biz_file)
# Most-reviewed businesses first.  DataFrame.sort(columns=...) no longer
# exists in pandas; sort_values(by=...) is its replacement.
biz = biz.sort_values(by="review_count", ascending=False)

########### SET VARIABLES HERE #############

n = 100  # number of restaurants to sample
start = 0  # index of the first restaurant to process
finish = n + start
users = 20  # how many reviewers to collect around the inflection point
# Split roughly 2/3 before and 1/3 after the inflection.  The 3.0 forces true
# division: under integer division ceil() receives an already truncated value
# and the two parts come out one short of `users` whenever `users` is not a
# multiple of 3.
user0 = int(math.ceil(users * 2 / 3.0))  # users to collect BEFORE inflection
user1 = users - user0  # users to collect AFTER inflection
mcsamples = 2000  # first argument of mcmc.sample: total MCMC iterations
burn = 500  # second argument of mcmc.sample: iterations discarded as burn-in
skip = 1  # third argument of mcmc.sample (thinning interval; 1 keeps every sample)

########### END VARIABLES

############ INITIALIZE DATA COLLECTING LISTS
tempidxes = list()
# One [user_id, "year-week"] entry per collected reviewer, across all businesses.
influencers = list()
# Outcome rows for every business; the first entry is the header describing
# the columns of a full row.
inflectionbiz = [
    (
        'business_id',
        'inflection_id',
        'total_weeks_sampled',
        'year-week',
        'when samples taken',
    ),
]

########### start looping for pymcmc

# Window of businesses to process.  It is sliced exactly once here; slicing
# the result by [start:finish] a second time dropped entries when start > 0.
businessids = list(biz['business_id'][start:finish])

for idx, bizid in enumerate(businessids):
    # All reviews of this business, oldest first, tagged with a zero-padded
    # "year-week" label so that labels compare correctly as strings.
    temp = review[review['business_id'] == bizid]
    temp = temp.sort_values(by='date', ascending=True)
    temp['wy'] = temp['date'].apply(lambda x: x.strftime("%Y-%W"))
    asdf = temp.groupby(by='wy').count()  # per-week non-null counts of every column
    timetemp = pd.Series(asdf.stars.values, index=asdf.index)

    if len(asdf) == 0:
        # No reviews at all: there is nothing to fit a change point to.
        inflectionbiz.append([bizid, "no reviews found"])
        continue

    ########### start pymcmc code here, thanks to @cmrn_dp !

    # count_data[k] is the number of reviews in the k-th week that has reviews
    # (count() tallies the non-null 'totalnotes' entries of that week).
    count_data = asdf.totalnotes.values
    n_count_data = len(asdf)  # number of weeks being modelled

    # Both Poisson rates get an Exponential prior with rate 1 / mean count.
    alpha = 1.0 / count_data.mean()
    lambda_1 = pm.Exponential("lambda_1", alpha)
    lambda_2 = pm.Exponential("lambda_2", alpha)

    # tau is the week index at which the rate switches from lambda_1 to
    # lambda_2; upper is inclusive, so tau == n_count_data means "never".
    tau = pm.DiscreteUniform("tau", lower=0, upper=n_count_data)

    # Per-week Poisson rate implied by the current tau / lambda values.
    @pm.deterministic
    def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
        out = np.zeros(n_count_data)
        out[:tau] = lambda_1  # lambda before tau is lambda1
        out[tau:] = lambda_2  # lambda after (and including) tau is lambda2
        return out

    observation = pm.Poisson("obs", lambda_, value=count_data, observed=True)
    model = pm.Model([observation, lambda_1, lambda_2, tau])

    mcmc = pm.MCMC(model)
    # Positional arguments are (iter, burn, thin); thin=1 keeps every sample.
    mcmc.sample(mcsamples, burn, skip)
    lambda_1_samples = mcmc.trace('lambda_1')[:]
    lambda_2_samples = mcmc.trace('lambda_2')[:]
    tau_samples = mcmc.trace('tau')[20:]  # drop the first 20 retained tau samples

    ########### end pymcmc
    ########### start ending dataset:

    tempidx = Counter(tau_samples).most_common(1)[0][0]  # most frequently sampled tau

    # A tau at either edge gives no usable inflection.  Record it under the
    # real business id (after count(), asdf['business_id'] holds weekly
    # counts, not ids) and skip to the next business, so that `a`/`b` are
    # never undefined or left over from the previous business.
    if tempidx == len(asdf):
        inflectionbiz.append([bizid, "tau is at the end of reviews"])
        continue
    if tempidx in (0, 1):
        inflectionbiz.append((bizid, "tau is at the beginning of reviews"))
        continue

    inflection_week = asdf.index[tempidx]
    # temp is in ascending date order, so the tail of the "up to the
    # inflection week" slice and the head of the "after it" slice are the
    # reviews closest in time to the inflection.
    a = temp[temp['wy'] <= inflection_week].tail(user0)
    b = temp[temp['wy'] > inflection_week].head(user1)

    # collect samples before inflection
    if len(a) == 0:
        inflectionbiz.append([bizid, "no samples before tau"])
    else:
        # 'wy' was built from 'date' with the same "%Y-%W" format.
        influencers.extend([user_id, week] for user_id, week in zip(a['user_id'], a['wy']))
        inflectionbiz.append(
            [bizid, tempidx, len(asdf), inflection_week, "before inflection"])

    # collect samples after inflection
    if len(b) == 0:
        inflectionbiz.append([bizid, "no samples after tau"])
    else:
        influencers.extend([user_id, week] for user_id, week in zip(b['user_id'], b['wy']))
        inflectionbiz.append(
            [bizid, tempidx, len(asdf), inflection_week, "after inflection"])
	import pickle
	import seaborn as sns
	import pandas as pd
	import numpy as np
	from collections import Counter
	import matplotlib.pyplot as plt
	import pymc as pm
	import math
	import pdb

	# NOTE(review): this tab-prefixed part of the file repeats the script above
	# it and looks like a paste/export artifact -- confirm whether it should
	# be removed.
	# Load the pickled review and business tables.
	# NOTE(review): unpickling can execute arbitrary code; only point these
	# paths at pickle files you produced yourself.
	with open('../pickles/review.pkl', 'rb') as review_file:
	    review = pickle.load(review_file)
	review['date'] = pd.to_datetime(review['date'])

	with open('../pickles/business.pkl', 'rb') as biz_file:
	    biz = pickle.load(biz_file)
	# Most-reviewed businesses first.  DataFrame.sort(columns=...) no longer
	# exists in pandas; sort_values(by=...) is its replacement.
	biz = biz.sort_values(by="review_count", ascending=False)

	########### SET VARIABLES HERE #############

	n = 100  # number of restaurants to sample
	start = 0  # index of the first restaurant to process
	finish = n + start
	users = 20  # how many reviewers to collect around the inflection point
	# Split roughly 2/3 before and 1/3 after the inflection.  The 3.0 forces
	# true division: under integer division ceil() receives an already
	# truncated value and the two parts come out one short of `users`
	# whenever `users` is not a multiple of 3.
	user0 = int(math.ceil(users * 2 / 3.0))  # users to collect BEFORE inflection
	user1 = users - user0  # users to collect AFTER inflection
	mcsamples = 2000  # first argument of mcmc.sample: total MCMC iterations
	burn = 500  # second argument of mcmc.sample: iterations discarded as burn-in
	skip = 1  # third argument of mcmc.sample (thinning interval; 1 keeps every sample)

	########### END VARIABLES

	############ INITIALIZE DATA COLLECTING LISTS
	tempidxes = list()
	# One [user_id, "year-week"] entry per collected reviewer, across all businesses.
	influencers = list()
	# Outcome rows for every business; the first entry is the header describing
	# the columns of a full row.
	inflectionbiz = [
	    (
	        'business_id',
	        'inflection_id',
	        'total_weeks_sampled',
	        'year-week',
	        'when samples taken',
	    ),
	]

	########### start looping for pymcmc

	# NOTE(review): the statements of this loop had lost their indentation;
	# the nesting below follows the intact copy earlier in the file.
	# Window of businesses to process.  It is sliced exactly once here; slicing
	# the result by [start:finish] a second time dropped entries when start > 0.
	businessids = list(biz['business_id'][start:finish])

	for idx, bizid in enumerate(businessids):
	    # All reviews of this business, oldest first, tagged with a zero-padded
	    # "year-week" label so that labels compare correctly as strings.
	    temp = review[review['business_id'] == bizid]
	    temp = temp.sort_values(by='date', ascending=True)
	    temp['wy'] = temp['date'].apply(lambda x: x.strftime("%Y-%W"))
	    asdf = temp.groupby(by='wy').count()  # per-week non-null counts of every column
	    timetemp = pd.Series(asdf.stars.values, index=asdf.index)

	    if len(asdf) == 0:
	        # No reviews at all: there is nothing to fit a change point to.
	        inflectionbiz.append([bizid, "no reviews found"])
	        continue

	    ########### start pymcmc code here, thanks to @cmrn_dp !

	    # count_data[k] is the number of reviews in the k-th week that has
	    # reviews (count() tallies the non-null 'totalnotes' entries of that week).
	    count_data = asdf.totalnotes.values
	    n_count_data = len(asdf)  # number of weeks being modelled

	    # Both Poisson rates get an Exponential prior with rate 1 / mean count.
	    alpha = 1.0 / count_data.mean()
	    lambda_1 = pm.Exponential("lambda_1", alpha)
	    lambda_2 = pm.Exponential("lambda_2", alpha)

	    # tau is the week index at which the rate switches from lambda_1 to
	    # lambda_2; upper is inclusive, so tau == n_count_data means "never".
	    tau = pm.DiscreteUniform("tau", lower=0, upper=n_count_data)

	    # Per-week Poisson rate implied by the current tau / lambda values.
	    @pm.deterministic
	    def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
	        out = np.zeros(n_count_data)
	        out[:tau] = lambda_1  # lambda before tau is lambda1
	        out[tau:] = lambda_2  # lambda after (and including) tau is lambda2
	        return out

	    observation = pm.Poisson("obs", lambda_, value=count_data, observed=True)
	    model = pm.Model([observation, lambda_1, lambda_2, tau])

	    mcmc = pm.MCMC(model)
	    # Positional arguments are (iter, burn, thin); thin=1 keeps every sample.
	    mcmc.sample(mcsamples, burn, skip)
	    lambda_1_samples = mcmc.trace('lambda_1')[:]
	    lambda_2_samples = mcmc.trace('lambda_2')[:]
	    tau_samples = mcmc.trace('tau')[20:]  # drop the first 20 retained tau samples

	    ########### end pymcmc
	    ########### start ending dataset:

	    tempidx = Counter(tau_samples).most_common(1)[0][0]  # most frequently sampled tau

	    # A tau at either edge gives no usable inflection.  Record it under the
	    # real business id (after count(), asdf['business_id'] holds weekly
	    # counts, not ids) and skip to the next business, so that `a`/`b` are
	    # never undefined or left over from the previous business.
	    if tempidx == len(asdf):
	        inflectionbiz.append([bizid, "tau is at the end of reviews"])
	        continue
	    if tempidx in (0, 1):
	        inflectionbiz.append((bizid, "tau is at the beginning of reviews"))
	        continue

	    inflection_week = asdf.index[tempidx]
	    # temp is in ascending date order, so the tail of the "up to the
	    # inflection week" slice and the head of the "after it" slice are the
	    # reviews closest in time to the inflection.
	    a = temp[temp['wy'] <= inflection_week].tail(user0)
	    b = temp[temp['wy'] > inflection_week].head(user1)

	    # collect samples before inflection
	    if len(a) == 0:
	        inflectionbiz.append([bizid, "no samples before tau"])
	    else:
	        # 'wy' was built from 'date' with the same "%Y-%W" format.
	        influencers.extend([user_id, week] for user_id, week in zip(a['user_id'], a['wy']))
	        inflectionbiz.append(
	            [bizid, tempidx, len(asdf), inflection_week, "before inflection"])

	    # collect samples after inflection
	    if len(b) == 0:
	        inflectionbiz.append([bizid, "no samples after tau"])
	    else:
	        influencers.extend([user_id, week] for user_id, week in zip(b['user_id'], b['wy']))
	        inflectionbiz.append(
	            [bizid, tempidx, len(asdf), inflection_week, "after inflection"])