Skip to content

Instantly share code, notes, and snippets.

def _m_step(self, X, gamma):
"""Performs M-step of the GMM
We need to update our priors, our means
and our covariance matrix.
Parameters:
-----------
X: (N x d), data
gamma: (N x C), posterior distribution of lower bound
@DFoly
DFoly / e_step.py
Last active April 13, 2021 08:53
def _e_step(self, X, pi, mu, sigma):
"""Performs E-step on GMM model
Parameters:
------------
X: (N x d), data points, d: no of features
pi: (C), weights of mixture components
mu: (C x d), mixture component means
sigma: (C x d x d), mixture component covariance matrices
def _initialise_parameters(self, X):
"""Implement k-means to find starting
parameter values.
Parameters:
------------
X: numpy array of data points
Returns:
----------
class GMM:
""" Gaussian Mixture Model
Parameters
-----------
k: int , number of gaussian distributions
seed: int, will be randomly set if None
max_iter: int, number of iterations to run algorithm, default: 200
# Fit the custom GMM on Y_sklearn and predict a component for every row.
model = GMM(3, n_runs=30)
fitted_values = model.fit(Y_sklearn)
predicted_values = model.predict(Y_sklearn)

# Compute centers as the point of highest density of each distribution:
# for every component, evaluate its Gaussian log-density at all rows of
# Y_sklearn and keep the row where that log-density is largest.
# The array is sized from the model and the data (one row per component,
# one column per data dimension) so it always agrees with the loop below.
centers = np.zeros((model.C, Y_sklearn.shape[1]))
for i in range(model.C):
    density = mvn(cov=model.sigma[i], mean=model.mu[i]).logpdf(Y_sklearn)
    centers[i, :] = Y_sklearn[np.argmax(density)]
def calculate_mean_covariance(X, prediction, C):
d = X.shape[1]
labels = np.unique(prediction)
initial_means = np.zeros((C, d))
initial_cov = np.zeros((C, d, d))
initial_pi = np.zeros(C)
counter=0
for label in sorted(labels):
ids = np.where(prediction == label) # returns indices
plt.figure(figsize = (10,8))
from scipy.spatial.distance import cdist
def plot_kmeans(kmeans, X, n_clusters=3, rseed=0, ax=None):
labels = kmeans.fit_predict(X)
# plot the input data
ax = ax or plt.gca()
ax.axis('equal')
ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
from sklearn.mixture import GaussianMixture
sklearn_pca = PCA(n_components = 2)
Y_sklearn = sklearn_pca.fit_transform(tf_idf_array)
gmm = GaussianMixture(n_components=3, covariance_type='full').fit(Y_sklearn)
prediction_gmm = gmm.predict(Y_sklearn)
probs = gmm.predict_proba(Y_sklearn)
centers = np.zeros((3,2))
for i in range(3):
density = mvn(cov=gmm.covariances_[i], mean=gmm.means_[i]).logpdf(Y_sklearn)
class GMM:
""" Gaussian Mixture Model
Parameters
-----------
k: int , number of gaussian distributions
seed: int, will be randomly set if None
max_iter: int, number of iterations to run algorithm, default: 200
def get_top_features_cluster(tf_idf_array, prediction, n_feats):
labels = np.unique(prediction)
dfs = []
for label in labels:
id_temp = np.where(prediction==label) # indices for each cluster
x_means = np.mean(tf_idf_array[id_temp], axis = 0) # returns average score across cluster
sorted_means = np.argsort(x_means)[::-1][:n_feats] # indices with top 20 scores
features = tf_idf_vectorizor.get_feature_names()
best_features = [(features[i], x_means[i]) for i in sorted_means]
df = pd.DataFrame(best_features, columns = ['features', 'score'])