Create a gist now

Instantly share code, notes, and snippets.

To compare EM algorithm and MCMC on GMM training.
import numpy as np
from sklearn import cross_validation, mixture
import pickle
import os
import pystan
import time
import matplotlib.pyplot as plt
def dump_stan_model(stan_model, compiled_file_name):
"""
Dump compiled Stan model.
:param stan_model: compiled stan model instance
:param compiled_file_name: pickled file name (output)
"""
f = open(compiled_file_name, 'wb')
with f:
pickle.dump(stan_model, f)
def load_stan_model(compiled_file_name):
"""
Load compiled Stan model.
:param compiled_file_name: pickled stan model file
:return: loaded stan model instance
"""
f = open(compiled_file_name, 'rb')
with f:
stan_model = pickle.load(f)
return stan_model
def convert_model(stan_gmm_model):
"""
Convert a GMM model from Stan format to scikit-learn format.
:param stan_gmm_model: Stan's optimized result
:return: mixture.GMM instance from scikit-learn
"""
num_mixture_components = stan_gmm_model.get('weights').size
gmm = mixture.GMM(n_components=num_mixture_components, covariance_type='diag')
gmm.weights_ = stan_gmm_model.get('weights')
gmm.means_ = stan_gmm_model.get('mu')
gmm.covars_ = np.square(stan_gmm_model.get('sigma'))
return gmm
def draw_result_graph(likelihoods_em, likelihoods_mcmc):
"""
Draw a graph that shows likelihood of EM vs. MCMC
:param likelihoods_em:
:param likelihoods_mcmc:
:return:
"""
fix, ax = plt.subplots()
ax.scatter(likelihoods_em, likelihoods_mcmc, marker='o')
ax.plot([-7.5, -6.0], [-7.5, -6.0], color='gray', alpha=0.5)
plt.xlabel("Average Log Likelihood (EM)")
plt.ylabel("Average Log Likelihood (MCMC)")
plt.show()
def main():
# prepare data
data_file_name = 'winequality-white.csv'
raw_data_set = np.loadtxt(data_file_name, delimiter=";", skiprows=1)
data_set = raw_data_set[:, :11] # remove "quority" column
# load stan model
stan_code_file_name = 'multi_dimensional_gmm_diagonal.stan'
stan_compiled_file_name = 'multi_dimensional_gmm_diagonal.pkl'
if os.path.isfile(stan_compiled_file_name):
stan_model = load_stan_model(stan_compiled_file_name)
else:
stan_model = pystan.StanModel(file=stan_code_file_name)
dump_stan_model(stan_model, stan_compiled_file_name)
# cross validation
num_validations = 500
num_mixture_components = 4
cnt = 0
time_sec_em = 0.0
time_sec_mcmc = 0.0
likelihoods_em = []
likelihoods_mcmc = []
ss = cross_validation.ShuffleSplit(n=len(data_set.data), n_iter=num_validations, test_size=0.5)
for training_indexes, evaluation_indexes in ss:
cnt += 1
print("--------------------------------")
print("ITERATION {0}".format(cnt))
print("--------------------------------")
# separate data by ShuffleSplit results
tr_data_set = data_set[training_indexes]
ev_data_set = data_set[evaluation_indexes]
# run EM algorithm by scikit-learn
gmm_em = mixture.GMM(n_components=num_mixture_components, covariance_type='diag')
t = time.time()
gmm_em.fit(tr_data_set)
time_sec_em += time.time() - t
likelihoods_em.append(gmm_em.score(ev_data_set).mean())
# run MCMC by PyStan
data_dic = dict(D=tr_data_set.shape[1], N=tr_data_set.shape[0], M=num_mixture_components, X=tr_data_set)
t = time.time()
optimizing_result = stan_model.optimizing(data=data_dic, iter=20000)
time_sec_mcmc += time.time() - t
gmm_mcmc = convert_model(optimizing_result)
likelihoods_mcmc.append(gmm_mcmc.score(ev_data_set).mean())
print("--------------------------------")
print("COMPLETED")
print("--------------------------------")
print("likelihoods_em:", likelihoods_em)
print("likelihoods_mcmc:", likelihoods_mcmc)
print("avg time em: {0:.3f} sec".format(time_sec_em / num_validations))
print("avg time mcmc: {0:.3f} sec".format(time_sec_mcmc / num_validations))
draw_result_graph(likelihoods_em, likelihoods_mcmc)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment