def readFile(fileName):
    """
    Read the text file passed in and return its lines as a list.
    """
    with open(fileName, "r") as fileObj:  # open the file in read mode
        words = fileObj.read().splitlines()  # split the file into a list of lines
    return words
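# Hedged usage sketch ("wordlist.txt" is a hypothetical file name, not from this gist):
# words = readFile("wordlist.txt")
# print(words[:5])  # inspect the first few lines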
# Input pre-processed text from Objective 1 of the action learning plan
# Read the cleaned data
import pandas as pd

DF = pd.read_csv('Cleaned_Data_With_StopWords.csv')
DF.head()
DF['Content_nGrams'] = DF['Processed_Content']  # copy the processed text into a new column for n-gram work
Processed_Content = DF['Content_nGrams']
DF.head()
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.barplot(x='score', y='features', orient='h', data=dfs[0][:25])  # top 25 rows of the 1st cluster's dataframe
import numpy as np

def get_top_features_cluster(X_std, prediction, n_feats):
    # Get unique cluster labels, in this case {0, 1}
    labels = np.unique(prediction)
    dfs = []
    for label in labels:
        id_temp = np.where(prediction == label)  # indices of the observations assigned to this cluster
        x_means = np.mean(X_std[id_temp], axis=0)  # average score of each feature across the cluster
        sorted_means = np.argsort(x_means)[::-1][:n_feats]  # indices of the top n_feats scores
        features = n_grams_to_use
        best_features = [(features[i], x_means[i]) for i in sorted_means]  # pair each top score with its feature
        df = pd.DataFrame(best_features, columns=['features', 'score'])
        dfs.append(df)
    return dfs
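# Hedged usage sketch (X_std and prediction are built elsewhere in this gist;
# keeping 20 features per cluster is an assumption, not from the original):
# dfs = get_top_features_cluster(X_std, prediction, 20)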
def kmeans_clustering(Y_sklearn, fitted):
    """
    Predict clusters on the training set and plot them.
    """
    prediction = fitted.predict(Y_sklearn)  # cluster label for each observation
    plt.scatter(Y_sklearn[:, 0], Y_sklearn[:, 1], c=prediction, s=50, cmap='viridis')  # scatter plot coloured by cluster
    centers2 = fitted.cluster_centers_  # coordinates of the cluster centres found after fitting k-means
    plt.scatter(centers2[:, 0], centers2[:, 1], c='black', s=300, alpha=0.6)

# As can be seen from the figure, there is an outlier as well.
kmeans_clustering(Y_sklearn, fitted)
from sklearn.cluster import KMeans

def elbow_method(Y_sklearn):
    """
    Plot the elbow curve used to pick the optimal number of clusters for k-means.
    """
    number_clusters = range(1, 7)  # range of cluster counts to try
    kmeans = [KMeans(n_clusters=i, max_iter=600) for i in number_clusters]  # one model per cluster count
    score = [kmeans[i].fit(Y_sklearn).score(Y_sklearn) for i in range(len(kmeans))]  # score() returns the negative inertia
    score = [i * -1 for i in score]  # flip the sign to get positive inertia values
    plt.plot(number_clusters, score)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score (inertia)')
    plt.show()
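# Hedged usage sketch (Y_sklearn, the PCA-reduced array, is built just below):
# elbow_method(Y_sklearn)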
from sklearn.decomposition import PCA

sklearn_pca = PCA(n_components=2)  # project onto the 2 principal components that capture the most variance
Y_sklearn = sklearn_pca.fit_transform(X_std)  # fit_transform() learns the projection parameters (component
# directions and means) from the training data and applies them; transform() reuses those parameters on test data.
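# Hedged sketch of the fit/transform split described above (X_train and X_test
# are hypothetical arrays, not from this gist):
# pca = PCA(n_components=2)
# X_train_2d = pca.fit_transform(X_train)  # learn the components from the training set
# X_test_2d = pca.transform(X_test)        # reuse those components on the test set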
# As concluded using the Elbow Method above.
n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, max_iter=400)  # partition the observations into n_clusters clusters
fitted = kmeans.fit(Y_sklearn)  # fit the k-means model to the reduced feature array
prediction = kmeans.predict(Y_sklearn)  # predicted cluster label, '0' or '1', for each observation
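# Hedged follow-up sketch: inspect how many observations fell into each cluster
# (the variable names labels_ and counts_ are illustrative, not from the gist).
labels_, counts_ = np.unique(prediction, return_counts=True)
print(dict(zip(labels_, counts_)))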
from scipy.stats import norm

def df_normal_dist():
    x = np.arange(-10, 10, 0.001)
    dp = norm.pdf(x, df_mean, df_std)  # normal approximation of the dataframe's distribution
    dq = norm.pdf(x, 0, 1)  # standard normal distribution
    # KL divergence between the dataframe's distribution and the standard normal.
    plt.title('KL(P||Q) = %1.3f' % kl_divergence(dp, dq))
    plt.plot(x, dp)
    plt.plot(x, dq, c='red')
    plt.show()

df_normal_dist()
df_variance = df.var()  # variance of each feature (column) of the dataframe
df_mean = df.stack().mean()  # mean over all values of the dataframe
trans_mean = Transposed_Dataset.stack().mean()  # mean over all values of the transpose
trans_variance = Transposed_Dataset.var()  # variance of each feature of the transpose
df_std = df.stack().std()  # standard deviation over all values of the dataframe
trans_std = Transposed_Dataset.stack().std()  # standard deviation over all values of the transpose
def kl_divergence(p, q):
    # Discrete KL divergence: sum p * log(p/q) over the points where p is non-zero.
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))
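# Hedged worked example with two small hand-made discrete distributions
# (the numbers are illustrative, not from the gist):
p = np.array([0.1, 0.4, 0.5])
q = np.array([0.3, 0.3, 0.4])
print(kl_divergence(p, q))  # ~0.117 nats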
Transposed_Dataset = df.T
Transposed_Dataset.head()