This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _m_step(self, X, gamma): | |
"""Performs M-step of the GMM | |
We need to update our priors, our means | |
and our covariance matrix. | |
Parameters: | |
----------- | |
X: (N x d), data | |
gamma: (N x C), posterior distribution of lower bound |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _e_step(self, X, pi, mu, sigma): | |
"""Performs E-step on GMM model | |
Parameters: | |
------------ | |
X: (N x d), data points, d: no of features | |
pi: (C), weights of mixture components | |
mu: (C x d), mixture component means | |
sigma: (C x d x d), mixture component covariance matrices |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _initialise_parameters(self, X): | |
"""Implement k-means to find starting | |
parameter values. | |
Parameters: | |
------------ | |
X: numpy array of data points | |
Returns: | |
---------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class GMM: | |
""" Gaussian Mixture Model | |
Parameters | |
----------- | |
k: int , number of gaussian distributions | |
seed: int, will be randomly set if None | |
max_iter: int, number of iterations to run algorithm, default: 200 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit the hand-written GMM with 3 components (n_runs=30) on Y_sklearn and
# get the predicted cluster for every point.
model = GMM(3, n_runs=30)
fitted_values = model.fit(Y_sklearn)
predicted_values = model.predict(Y_sklearn)

# Compute each component's "center" as the observed data point with the
# highest (log-)density under that component's fitted Gaussian.
# The array is sized from the model and the data rather than hard-coded to
# (3, 2), so it always agrees with the loop bound below.
# NOTE(review): `mvn` is presumably scipy.stats.multivariate_normal -- confirm
# against the file's imports.
centers = np.zeros((model.C, Y_sklearn.shape[1]))
for i in range(model.C):
    density = mvn(cov=model.sigma[i], mean=model.mu[i]).logpdf(Y_sklearn)
    # argmax over the log-density picks the same point as argmax over density.
    centers[i, :] = Y_sklearn[np.argmax(density)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def calculate_mean_covariance(X, prediction, C): | |
d = X.shape[1] | |
labels = np.unique(prediction) | |
initial_means = np.zeros((C, d)) | |
initial_cov = np.zeros((C, d, d)) | |
initial_pi = np.zeros(C) | |
counter=0 | |
for label in sorted(labels): | |
ids = np.where(prediction == label) # returns indices |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
plt.figure(figsize = (10,8)) | |
from scipy.spatial.distance import cdist | |
def plot_kmeans(kmeans, X, n_clusters=3, rseed=0, ax=None): | |
labels = kmeans.fit_predict(X) | |
# plot the input data | |
ax = ax or plt.gca() | |
ax.axis('equal') | |
ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.mixture import GaussianMixture | |
sklearn_pca = PCA(n_components = 2) | |
Y_sklearn = sklearn_pca.fit_transform(tf_idf_array) | |
gmm = GaussianMixture(n_components=3, covariance_type='full').fit(Y_sklearn) | |
prediction_gmm = gmm.predict(Y_sklearn) | |
probs = gmm.predict_proba(Y_sklearn) | |
centers = np.zeros((3,2)) | |
for i in range(3): | |
density = mvn(cov=gmm.covariances_[i], mean=gmm.means_[i]).logpdf(Y_sklearn) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class GMM: | |
""" Gaussian Mixture Model | |
Parameters | |
----------- | |
k: int , number of gaussian distributions | |
seed: int, will be randomly set if None | |
max_iter: int, number of iterations to run algorithm, default: 200 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_top_features_cluster(tf_idf_array, prediction, n_feats): | |
labels = np.unique(prediction) | |
dfs = [] | |
for label in labels: | |
id_temp = np.where(prediction==label) # indices for each cluster | |
x_means = np.mean(tf_idf_array[id_temp], axis = 0) # returns average score across cluster | |
sorted_means = np.argsort(x_means)[::-1][:n_feats] # indices with top 20 scores | |
features = tf_idf_vectorizor.get_feature_names() | |
best_features = [(features[i], x_means[i]) for i in sorted_means] | |
df = pd.DataFrame(best_features, columns = ['features', 'score']) |