Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lppier/845ca0a0bb493ec891b3662a190501f6 to your computer and use it in GitHub Desktop.
Save lppier/845ca0a0bb493ec891b3662a190501f6 to your computer and use it in GitHub Desktop.
Using PCA to represent word vectors in 2D
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
# Embed every corpus entry with a pretrained sentence-transformer model.
# Alternative model (disabled): SentenceTransformer('all-MiniLM-L6-v2')
embedder = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
corpus = tfidf_sum.index.to_list()
corpus_embeddings = embedder.encode(corpus)

# Partition the embeddings into a fixed number of k-means clusters; the
# random_state is fixed so the seeding of the centroids is repeatable.
num_clusters = 8
clustering_model = KMeans(
    n_clusters=num_clusters,
    init="k-means++",
    random_state=42,
)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Bucket the corpus entries by the cluster label each one received.
clustered_sentences = [[] for _ in range(num_clusters)]
for entry, label in zip(corpus, cluster_assignment):
    clustered_sentences[label].append(entry)

# Report the members of each cluster, numbered from 1.
print("Development Areas Clusters")
for number, members in enumerate(clustered_sentences, start=1):
    print("-- Cluster ", number)
    print(members)
    print("")
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Project the embeddings onto their first two principal components so each
# corpus entry can be drawn as a point on a flat scatter plot.
pca = PCA(n_components=2)
scatter_plot_points = pca.fit_transform(corpus_embeddings)

# One colour per cluster id. The last entry was previously "w" (white), which
# made every point of that cluster invisible on the default white axes.
colors = ["r", "b", "c", "y", "m", "g", "k", "tab:orange"]

# Column 0 / column 1 of the PCA output are the x / y plot coordinates.
x_axis = scatter_plot_points[:, 0]
y_axis = scatter_plot_points[:, 1]

fig, ax = plt.subplots(figsize=(20, 20))
# Index the palette modulo its length so that raising the cluster count above
# the number of listed colours reuses colours instead of raising IndexError.
point_colors = [colors[d % len(colors)] for d in cluster_assignment]
ax.scatter(x_axis, y_axis, c=point_colors)

# Label every point with the corpus text it represents.
for txt, x, y in zip(corpus, x_axis, y_axis):
    ax.annotate(txt, (x, y))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment