Skip to content

Instantly share code, notes, and snippets.

@Garfounkel
Last active July 8, 2020 23:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Garfounkel/3074c6a8b113e8db527740bcc1e86b2c to your computer and use it in GitHub Desktop.
Save Garfounkel/3074c6a8b113e8db527740bcc1e86b2c to your computer and use it in GitHub Desktop.
Nvidia NLP blog clustering
import numpy as np

# Print, for every k-means cluster, up to five high-weight terms that do not
# appear among the top-100 terms of any other cluster.

# Per centroid, feature indices ordered by descending centroid value
# (argsort is ascending, the [:, ::-1] slice reverses each row).
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): presumably the vocabulary, indexable by feature id - confirm.
terms = cv.get_feature_names()
# Keep the 100 best-ranked feature indices per cluster.
# NOTE(review): .get() presumably copies a device array to host - confirm.
clusters_terms = sorted_centroids[:, :100].get()

# Build each cluster's candidate set once instead of once per cluster pair.
candidate_sets = [set(row) for row in clusters_terms]

for i, c1 in enumerate(clusters_terms):
    # Feature ids that only this cluster has among its top candidates.
    cluster = candidate_sets[i].difference(
        *(other for j, other in enumerate(candidate_sets) if j != i)
    )
    # Filter c1 itself (not the unordered set) so the descending ranking is
    # preserved, then keep the first five survivors.
    cluster = c1[np.isin(c1, list(cluster))][:5]
    print(f'Cluster {i}:', ' | '.join(terms[cluster].tolist()))
from cuml.cluster import KMeans

# Fit k-means on the first `sample_size` rows of the TF-IDF matrix and keep
# the matching tweets alongside the results.
num_clusters = 10
sample_size = 100_000

# Same leading slice for both inputs so row k of the data matches tweet k;
# the tweets' index is reset to start from 0 again.
sample_tweets = tweets[:sample_size].reset_index(drop=True)
sample_data = tfidf_matrix[:sample_size].todense()

kmeans_model = KMeans(
    n_clusters=num_clusters,
    n_init=1,
    max_iter=1000,
)
kmeans = kmeans_model.fit(sample_data)

# Cluster label per sampled row.
kmeans_clusters = kmeans.predict(sample_data)
# NOTE(review): presumably one distance per centroid for each row - confirm
# against the cuML KMeans.transform documentation.
kmeans_distances = kmeans.transform(sample_data)
import bokeh.plotting as bp
from bokeh.palettes import Turbo256
from bokeh.models import HoverTool

# Scatter-plot the 2-D embedding, one colour per k-means cluster, with the
# tweet text and cluster id shown on hover.

# setup data
# Spread the cluster ids evenly across the palette.
step = len(Turbo256) / num_clusters
kmeans_df = DataFrame(tsne_kmeans, columns=['x', 'y'])
kmeans_df['cluster'] = kmeans_clusters
kmeans_df['tweets'] = sample_tweets
kmeans_df['color'] = [
    Turbo256[int(label * step)] for label in kmeans_clusters.tolist()
]
# The frame is converted with to_pandas() before being handed to Bokeh.
kmeans_df = kmeans_df.to_pandas()

# setup plot
plot_kmeans = bp.figure(
    plot_width=700,
    plot_height=600,
    title="KMeans clustering of item description",
    tools="pan,wheel_zoom,box_zoom,reset,hover",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)

# display plot and tooltips
plot_kmeans.scatter(x='x', y='y', color='color', source=kmeans_df)
# Configure the hover tool that the "hover" entry in `tools` requested.
hover = plot_kmeans.select({"type": HoverTool})
hover.tooltips = {"tweets": "@tweets", "cluster": "@cluster"}
bp.show(plot_kmeans)
# Inspect a randomly drawn tweet that was assigned to cluster 4.
in_cluster_4 = kmeans_clusters == 4
sample_tweets[in_cluster_4].to_pandas().sample()
from cuml.manifold import TSNE

# Project the k-means transform output down to two components for plotting.
tsne_model = TSNE(
    n_components=2,
    verbose=1,
    random_state=42,  # fixed seed
    n_iter=1000,
)
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
# Turn the tweets into a TF-IDF matrix in two steps: term counts first
# (capped at 18000 features, English stop words removed), then re-weighting.
cv = CountVectorizer(
    max_features=18000,
    stop_words='english',
)
count = cv.fit_transform(tweets)

tf = TfidfTransformer()
tfidf_matrix = tf.fit_transform(count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment