DFoly

## m_step.py
      def _m_step(self, X, gamma):
        """Performs M-step of the GMM
        We need to update our priors, our means
        and our covariance matrix.

        Parameters:
        -----------
        X: (N x d), data
        gamma: (N x C), posterior distribution of lower bound

## e_step.py
      def _e_step(self, X, pi, mu, sigma):
        """Performs E-step on GMM model

        Parameters:
        ------------
        X: (N x d), data points, d: no of features
        pi: (C), weights of mixture components
        mu: (C x d), mixture component means
        sigma: (C x d x d), mixture component covariance matrices

## params.py
   def _initialise_parameters(self, X):
        """Implement k-means to find starting
            parameter values.

        Parameters:
        ------------
        X: numpy array of data points

        Returns:
        ----------

## initial.py
class GMM:
    """ Gaussian Mixture Model

    Parameters
    -----------
        k: int , number of gaussian distributions

        seed: int, will be randomly set if None

        max_iter: int, number of iterations to run algorithm, default: 200

## GMM_scratch.py
# Fit the from-scratch GMM with 3 components on Y_sklearn.
model = GMM(3, n_runs=30)

fitted_values = model.fit(Y_sklearn)
predicted_values = model.predict(Y_sklearn)

# Compute each component's center as the sample with the highest density
# under that component. The array is sized from the fitted model and the data
# instead of a hard-coded (3, 2), so it always agrees with the loop bound.
centers = np.zeros((model.C, Y_sklearn.shape[1]))
for i in range(model.C):
    # The log is monotonic, so the argmax of logpdf is the argmax of the pdf.
    density = mvn(cov=model.sigma[i], mean=model.mu[i]).logpdf(Y_sklearn)
    centers[i, :] = Y_sklearn[np.argmax(density)]

## mean_covariance.py
def calculate_mean_covariance(X, prediction, C):
    """Set up per-cluster mean, covariance and weight arrays for a labelling.

    NOTE(review): only the opening of this function is visible; the code
    that fills the arrays, and whatever it returns, is not shown here.

    Parameters:
    ------------
    X: 2-D array of data points; X.shape[1] is taken as the dimension d
    prediction: array of cluster labels (presumably one per row of X -
        confirm against the caller)
    C: number of clusters the arrays are allocated for
    """
    d = X.shape[1]
    labels = np.unique(prediction)
    # Zero-filled placeholders: (C, d) means, (C, d, d) covariances, (C,) weights.
    initial_means = np.zeros((C, d))
    initial_cov = np.zeros((C, d, d))
    initial_pi = np.zeros(C)

    counter=0
    for label in sorted(labels):
        ids = np.where(prediction == label) # indices of the entries carrying this label

## plot_kmeans.py
# Create a new figure with figsize (10, 8).
plt.figure(figsize = (10,8))
from scipy.spatial.distance import cdist
def plot_kmeans(kmeans, X, n_clusters=3, rseed=0, ax=None):
    """Fit `kmeans` on X and scatter-plot the points coloured by their label.

    NOTE(review): only the opening of this function is visible; `n_clusters`,
    `rseed` and the imported `cdist` are not used in the lines shown.

    Parameters:
    ------------
    kmeans: estimator exposing fit_predict(X)
    X: 2-D array; columns 0 and 1 are plotted as x and y
    n_clusters: not referenced in the visible lines
    rseed: not referenced in the visible lines
    ax: axes to draw on; plt.gca() is used when this is falsy
    """
    labels = kmeans.fit_predict(X)

    # plot the input data
    ax = ax or plt.gca()
    ax.axis('equal')
    ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)

## sklearn_gmm.py
from sklearn.mixture import GaussianMixture
# Reduce tf_idf_array to two principal components.
sklearn_pca = PCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(tf_idf_array)

# Fit scikit-learn's full-covariance GMM with 3 components on the projection,
# then take hard labels and per-component membership probabilities.
gmm = GaussianMixture(n_components=3, covariance_type='full').fit(Y_sklearn)
prediction_gmm = gmm.predict(Y_sklearn)
probs = gmm.predict_proba(Y_sklearn)

# Size the centers array and the loop from the fitted model and the data so
# the component count is stated once, in the GaussianMixture call above.
centers = np.zeros((gmm.n_components, Y_sklearn.shape[1]))
for i in range(gmm.n_components):
    # Log-density of every sample under component i.
    # NOTE(review): the visible lines stop here; `density` is not yet used.
    density = mvn(cov=gmm.covariances_[i], mean=gmm.means_[i]).logpdf(Y_sklearn)

## gmm_class.py
class GMM:
    """ Gaussian Mixture Model

    Parameters
    -----------
        k: int , number of gaussian distributions

        seed: int, will be randomly set if None

        max_iter: int, number of iterations to run algorithm, default: 200

## top_features.py
def get_top_features_cluster(tf_idf_array, prediction, n_feats):
    """Rank features by their mean score inside each predicted cluster.

    For every distinct label in `prediction`, averages the matching rows of
    `tf_idf_array` and builds a DataFrame of the `n_feats` highest-scoring
    features with columns 'features' and 'score'.

    NOTE(review): only part of this function is visible; `dfs` is never
    appended to and nothing is returned in the lines shown.

    Parameters:
    ------------
    tf_idf_array: array of scores whose rows are selected by label
    prediction: array of cluster labels (presumably one per row - confirm)
    n_feats: number of top features to keep per cluster
    """
    labels = np.unique(prediction)
    dfs = []
    for label in labels:
        id_temp = np.where(prediction==label) # indices for each cluster
        x_means = np.mean(tf_idf_array[id_temp], axis = 0) # returns average score across cluster
        sorted_means = np.argsort(x_means)[::-1][:n_feats] # indices of the n_feats highest means
        # Reads the module-level `tf_idf_vectorizor`; it is not a parameter.
        # NOTE(review): if this is a scikit-learn vectorizer, get_feature_names()
        # was removed in scikit-learn 1.2 in favour of get_feature_names_out() - confirm.
        features = tf_idf_vectorizor.get_feature_names()
        best_features = [(features[i], x_means[i]) for i in sorted_means]
        df = pd.DataFrame(best_features, columns = ['features', 'score'])
	def _m_step(self, X, gamma):
	"""Performs M-step of the GMM
	We need to update our priors, our means
	and our covariance matrix.

	Parameters:
	-----------
	X: (N x d), data
	gamma: (N x C), posterior distribution of lower bound
	def _e_step(self, X, pi, mu, sigma):
	"""Performs E-step on GMM model

	Parameters:
	------------
	X: (N x d), data points, d: no of features
	pi: (C), weights of mixture components
	mu: (C x d), mixture component means
	sigma: (C x d x d), mixture component covariance matrices
	def _initialise_parameters(self, X):
	"""Implement k-means to find starting
	parameter values.

	Parameters:
	------------
	X: numpy array of data points

	Returns:
	----------
	class GMM:
	""" Gaussian Mixture Model

	Parameters
	-----------
	k: int , number of gaussian distributions

	seed: int, will be randomly set if None

	max_iter: int, number of iterations to run algorithm, default: 200
	# Fit the from-scratch GMM with 3 components on Y_sklearn.
	model = GMM(3, n_runs=30)

	fitted_values = model.fit(Y_sklearn)
	predicted_values = model.predict(Y_sklearn)

	# Compute each component's center as the sample with the highest density
	# under that component. The array is sized from the fitted model and the
	# data instead of a hard-coded (3, 2), so it agrees with the loop bound.
	centers = np.zeros((model.C, Y_sklearn.shape[1]))
	for i in range(model.C):
	    # The log is monotonic, so the argmax of logpdf is the argmax of the pdf.
	    density = mvn(cov=model.sigma[i], mean=model.mu[i]).logpdf(Y_sklearn)
	    centers[i, :] = Y_sklearn[np.argmax(density)]
	def calculate_mean_covariance(X, prediction, C):
	    """Set up per-cluster mean, covariance and weight arrays for a labelling.

	    NOTE(review): only the opening of this function is visible; the code
	    that fills the arrays, and whatever it returns, is not shown here.

	    Parameters:
	    ------------
	    X: 2-D array of data points; X.shape[1] is taken as the dimension d
	    prediction: array of cluster labels (presumably one per row of X -
	        confirm against the caller)
	    C: number of clusters the arrays are allocated for
	    """
	    d = X.shape[1]
	    labels = np.unique(prediction)
	    # Zero-filled placeholders: (C, d) means, (C, d, d) covariances, (C,) weights.
	    initial_means = np.zeros((C, d))
	    initial_cov = np.zeros((C, d, d))
	    initial_pi = np.zeros(C)

	    counter = 0
	    for label in sorted(labels):
	        # Indices of the entries carrying this label.
	        ids = np.where(prediction == label)
	# Create a new figure with figsize (10, 8).
	plt.figure(figsize=(10, 8))
	from scipy.spatial.distance import cdist
	def plot_kmeans(kmeans, X, n_clusters=3, rseed=0, ax=None):
	    """Fit `kmeans` on X and scatter-plot the points coloured by their label.

	    NOTE(review): only the opening of this function is visible; `n_clusters`,
	    `rseed` and the imported `cdist` are not used in the lines shown.

	    Parameters:
	    ------------
	    kmeans: estimator exposing fit_predict(X)
	    X: 2-D array; columns 0 and 1 are plotted as x and y
	    n_clusters: not referenced in the visible lines
	    rseed: not referenced in the visible lines
	    ax: axes to draw on; plt.gca() is used when this is falsy
	    """
	    labels = kmeans.fit_predict(X)

	    # plot the input data
	    ax = ax or plt.gca()
	    ax.axis('equal')
	    ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
	from sklearn.mixture import GaussianMixture
	# Reduce tf_idf_array to two principal components.
	sklearn_pca = PCA(n_components=2)
	Y_sklearn = sklearn_pca.fit_transform(tf_idf_array)

	# Fit scikit-learn's full-covariance GMM with 3 components on the projection,
	# then take hard labels and per-component membership probabilities.
	gmm = GaussianMixture(n_components=3, covariance_type='full').fit(Y_sklearn)
	prediction_gmm = gmm.predict(Y_sklearn)
	probs = gmm.predict_proba(Y_sklearn)

	# Size the centers array and the loop from the fitted model and the data so
	# the component count is stated once, in the GaussianMixture call above.
	centers = np.zeros((gmm.n_components, Y_sklearn.shape[1]))
	for i in range(gmm.n_components):
	    # Log-density of every sample under component i.
	    # NOTE(review): the visible lines stop here; `density` is not yet used.
	    density = mvn(cov=gmm.covariances_[i], mean=gmm.means_[i]).logpdf(Y_sklearn)
	def get_top_features_cluster(tf_idf_array, prediction, n_feats):
	    """Rank features by their mean score inside each predicted cluster.

	    For every distinct label in `prediction`, averages the matching rows of
	    `tf_idf_array` and builds a DataFrame of the `n_feats` highest-scoring
	    features with columns 'features' and 'score'.

	    NOTE(review): only part of this function is visible; `dfs` is never
	    appended to and nothing is returned in the lines shown.

	    Parameters:
	    ------------
	    tf_idf_array: array of scores whose rows are selected by label
	    prediction: array of cluster labels (presumably one per row - confirm)
	    n_feats: number of top features to keep per cluster
	    """
	    labels = np.unique(prediction)
	    dfs = []
	    for label in labels:
	        # Indices of the entries assigned to this cluster.
	        id_temp = np.where(prediction == label)
	        # Average score of every feature across the cluster.
	        x_means = np.mean(tf_idf_array[id_temp], axis=0)
	        # Indices of the n_feats highest means, best first.
	        sorted_means = np.argsort(x_means)[::-1][:n_feats]
	        # Reads the module-level `tf_idf_vectorizor`; it is not a parameter.
	        # NOTE(review): if this is a scikit-learn vectorizer, get_feature_names()
	        # was removed in scikit-learn 1.2 in favour of get_feature_names_out() - confirm.
	        features = tf_idf_vectorizor.get_feature_names()
	        best_features = [(features[i], x_means[i]) for i in sorted_means]
	        df = pd.DataFrame(best_features, columns=['features', 'score'])