Skip to content

Instantly share code, notes, and snippets.

@cydal
Created March 18, 2021 11:54
Show Gist options
  • Save cydal/36be997c608cc59b8c761f16fbd36aea to your computer and use it in GitHub Desktop.
Save cydal/36be997c608cc59b8c761f16fbd36aea to your computer and use it in GitHub Desktop.
## Get Article Title
def get_title(topic_num):
    """Return the titles of the articles assigned to cluster ``topic_num``.

    Reads the module-level ``km`` (its ``labels_`` array) and ``x_train``
    (its ``"Title"`` column). Positions in ``km.labels_`` are used as row
    positions in ``x_train``.

    Args:
        topic_num: Cluster label to look for in ``km.labels_``.

    Returns:
        list: The ``"Title"`` values of the matching rows, in row order.
        Empty when no row carries that label.
    """
    idxs = np.where(km.labels_ == topic_num)[0]
    # One positional lookup on the column instead of materialising a full
    # row Series for every matching index.
    return x_train["Title"].iloc[idxs].tolist()
# Print Top 10 words (and the article titles) for each cluster

# The vocabulary does not change between clusters, so fetch it once.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when the installed version provides it.
_get_feature_names = getattr(tfidf_vectorizer, "get_feature_names_out", None)
if _get_feature_names is None:
    _get_feature_names = tfidf_vectorizer.get_feature_names
feature_names = _get_feature_names()

# Pass the cluster centers back through lsa.inverse_transform (presumably
# into the vectorizer's term space -- confirm against where `lsa` is fit),
# then keep the column indices of the 10 largest values in each row.
centroid_weights = lsa.inverse_transform(km.cluster_centers_)
top_term_idxs = centroid_weights.argsort()[:, ::-1][:, :10]

for i, term_idxs in enumerate(top_term_idxs):
    words = [feature_names[n] for n in term_idxs]
    print("Topic Words -- ", i)
    print(', '.join(words))
    print("Topic Title -- ", i)
    print(', '.join(get_title(i)))
# Ids marked for removal (presumably cluster/topic numbers -- how the list is
# consumed is not shown here).
# NOTE(review): 36 appears twice; contents are kept exactly as they were.
# Confirm whether the second 36 was meant to be a different id.
toremove_list = [
    7, 9, 13, 15, 35, 36, 5, 6, 17, 18,
    19, 21, 24, 25, 26, 27, 30, 31, 32, 33,
    34, 36, 38, 40, 41, 42, 44, 46, 47, 49,
    50, 51, 52, 54, 55, 56, 57, 58, 59,
]
## Titles within a particular cluster
# Row positions whose cluster label is 5, applied positionally to df.
# NOTE(review): this indexes `df` while get_title() indexes `x_train` --
# confirm both line up row-for-row with km.labels_.
df["Title"].iloc[np.flatnonzero(km.labels_ == 5)]
## Get Article Text within cluster
# Row positions whose cluster label is 66, applied positionally to df.
cluster_rows = np.flatnonzero(km.labels_ == 66)
cluster_texts = df["cleaned_text"].iloc[cluster_rows]
# NOTE(review): the trailing [15] is a plain Series lookup. With an integer
# index pandas resolves it as the index *label* 15, not the 16th row of the
# cluster -- confirm which one was intended.
cluster_texts[15]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment