def readFile(fileName):
    """
    Read the text file passed in and return its lines as a list.
    """
    with open(fileName, "r") as fileObj:  # open the file in read mode
        words = fileObj.read().splitlines()  # split the file into a list of lines
    return words
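# Hedged usage sketch ("wordlist.txt" is a hypothetical file name, not from this gist):
# words = readFile("wordlist.txt")
# print(words[:5])  # inspect the first few lines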
# Input pre-processed text from Objective 1 of the action learning plan
# Read the cleaned data
import pandas as pd

DF = pd.read_csv('Cleaned_Data_With_StopWords.csv')
DF.head()
DF['Content_nGrams'] = DF['Processed_Content']  # copy the processed text into a new column for n-gram work
Processed_Content = DF['Content_nGrams']
DF.head()
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.barplot(x='score', y='features', orient='h', data=dfs[0][:25])  # top 25 rows of the 1st cluster's dataframe
import numpy as np

def get_top_features_cluster(X_std, prediction, n_feats):
    # Get unique cluster labels, in this case {0, 1}
    labels = np.unique(prediction)
    dfs = []
    for label in labels:
        id_temp = np.where(prediction == label)  # indices of the observations assigned to this cluster
        x_means = np.mean(X_std[id_temp], axis=0)  # average score of each feature across the cluster
        sorted_means = np.argsort(x_means)[::-1][:n_feats]  # indices of the top n_feats scores
        features = n_grams_to_use
        best_features = [(features[i], x_means[i]) for i in sorted_means]  # pair each top score with its feature
        df = pd.DataFrame(best_features, columns=['features', 'score'])
        dfs.append(df)
    return dfs
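# Hedged usage sketch (X_std and prediction are built elsewhere in this gist;
# keeping 20 features per cluster is an assumption, not from the original):
# dfs = get_top_features_cluster(X_std, prediction, 20)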
def kmeans_clustering(Y_sklearn, fitted):
    """
    Predict clusters on the training set and plot them.
    """
    prediction = fitted.predict(Y_sklearn)  # cluster label for each observation
    plt.scatter(Y_sklearn[:, 0], Y_sklearn[:, 1], c=prediction, s=50, cmap='viridis')  # scatter plot coloured by cluster
    centers2 = fitted.cluster_centers_  # coordinates of the cluster centres found after fitting k-means
    plt.scatter(centers2[:, 0], centers2[:, 1], c='black', s=300, alpha=0.6)

# As can be seen from the figure, there is an outlier as well.
kmeans_clustering(Y_sklearn, fitted)
from sklearn.cluster import KMeans

def elbow_method(Y_sklearn):
    """
    Plot the elbow curve used to pick the optimal number of clusters for k-means.
    """
    number_clusters = range(1, 7)  # range of cluster counts to try
    kmeans = [KMeans(n_clusters=i, max_iter=600) for i in number_clusters]  # one model per cluster count
    score = [kmeans[i].fit(Y_sklearn).score(Y_sklearn) for i in range(len(kmeans))]  # score() returns the negative inertia
    score = [i * -1 for i in score]  # flip the sign to get positive inertia values
    plt.plot(number_clusters, score)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score (inertia)')
    plt.show()
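# Hedged usage sketch (Y_sklearn, the PCA-reduced array, is built just below):
# elbow_method(Y_sklearn)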
from sklearn.decomposition import PCA

sklearn_pca = PCA(n_components=2)  # project onto the 2 principal components that capture the most variance
Y_sklearn = sklearn_pca.fit_transform(X_std)  # fit_transform() learns the projection parameters (component
# directions and means) from the training data and applies them; transform() reuses those parameters on test data.
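# Hedged sketch of the fit/transform split described above (X_train and X_test
# are hypothetical arrays, not from this gist):
# pca = PCA(n_components=2)
# X_train_2d = pca.fit_transform(X_train)  # learn the components from the training set
# X_test_2d = pca.transform(X_test)        # reuse those components on the test set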
# As concluded using the Elbow Method above.
n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, max_iter=400)  # partition the observations into n_clusters clusters
fitted = kmeans.fit(Y_sklearn)  # fit the k-means model to the reduced feature array
prediction = kmeans.predict(Y_sklearn)  # predicted cluster label, '0' or '1', for each observation
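# Hedged follow-up sketch: inspect how many observations fell into each cluster
# (the variable names labels_ and counts_ are illustrative, not from the gist).
labels_, counts_ = np.unique(prediction, return_counts=True)
print(dict(zip(labels_, counts_)))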
from scipy.stats import norm

def df_normal_dist():
    x = np.arange(-10, 10, 0.001)
    dp = norm.pdf(x, df_mean, df_std)  # normal approximation of the dataframe's distribution
    dq = norm.pdf(x, 0, 1)  # standard normal distribution
    # KL divergence between the dataframe's distribution and the standard normal.
    plt.title('KL(P||Q) = %1.3f' % kl_divergence(dp, dq))
    plt.plot(x, dp)
    plt.plot(x, dq, c='red')
    plt.show()

df_normal_dist()
df_variance = df.var()  # variance of each feature (column) of the dataframe
df_mean = df.stack().mean()  # mean over all values of the dataframe
trans_mean = Transposed_Dataset.stack().mean()  # mean over all values of the transpose
trans_variance = Transposed_Dataset.var()  # variance of each feature of the transpose
df_std = df.stack().std()  # standard deviation over all values of the dataframe
trans_std = Transposed_Dataset.stack().std()  # standard deviation over all values of the transpose
def kl_divergence(p, q):
    # Discrete KL divergence: sum p * log(p/q) over the points where p is non-zero.
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))
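# Hedged worked example with two small hand-made discrete distributions
# (the numbers are illustrative, not from the gist):
p = np.array([0.1, 0.4, 0.5])
q = np.array([0.3, 0.3, 0.4])
print(kl_divergence(p, q))  # ~0.117 nats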
Transposed_Dataset = df.T
Transposed_Dataset.head()