# cydal/remove_clusters.py

## remove_clusters.py
## Get Article Title
def get_title(topic_num, labels=None, articles=None):
  """Return the titles of the articles assigned to one cluster.

  Args:
    topic_num: Cluster label to look up.
    labels: Optional NumPy array with one cluster label per row of
      `articles`. Defaults to the module-level `km.labels_`.
    articles: Optional DataFrame with a "Title" column whose row order
      matches `labels`. Defaults to the module-level `x_train`.

  Returns:
    A list with the "Title" value of every row whose label equals
    `topic_num`, in row order; an empty list when no row matches.
  """
  if labels is None:
    labels = km.labels_
  if articles is None:
    articles = x_train

  # Row positions (not index labels) of the articles in this cluster.
  positions = np.where(labels == topic_num)[0]

  # Select the column once and fetch all matching rows in a single
  # positional lookup instead of materialising one row Series per article.
  return articles["Title"].iloc[positions].tolist()

# Print the top 10 words and the article titles for each cluster.
# Fetch the vocabulary once instead of once per printed word; the call is
# loop-invariant. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and only fall back on
# releases that predate it (assumes tfidf_vectorizer is a scikit-learn
# vectorizer -- confirm).
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
  feature_names = tfidf_vectorizer.get_feature_names_out()
else:
  feature_names = tfidf_vectorizer.get_feature_names()

# argsort() is ascending, so reverse each row and keep the first 10 columns:
# the indices of the 10 largest values per cluster centre, largest first.
top_word_idxs = lsa.inverse_transform(km.cluster_centers_).argsort()[:, ::-1][:, :10]

for i, x in enumerate(top_word_idxs):
  words = [feature_names[n] for n in x]
  print("Topic Words -- ", i)
  print(', '.join(words))

  print("Topic Title -- ", i)
  print(', '.join(get_title(i)))


# Cluster ids marked for removal (presumably matched against km.labels_ --
# confirm against the code that consumes this list).
# NOTE(review): 36 is listed twice; harmless for membership tests, but check
# whether a different id was intended.
toremove_list = [
  7, 9, 13, 15, 35, 36, 5, 6, 17, 18,
  19, 21, 24, 25, 26, 27, 30, 31, 32, 33,
  34, 36, 38, 40, 41, 42, 44, 46, 47, 49,
  50, 51, 52, 54, 55, 56, 57, 58, 59,
]

## Titles of the articles within one particular cluster (label 5).
## This is a bare expression: the value is only displayed in an interactive
## session and is discarded when the file runs as a script.
df["Title"].iloc[np.where(km.labels_ == 5)[0]]

## Text of one article within a cluster (label 66).
## NOTE(review): the trailing [15] indexes the resulting Series, which pandas
## resolves by index label when the index is integer-valued -- confirm that
## is intended rather than the positional .iloc[15].
df["cleaned_text"].iloc[np.where(km.labels_ == 66)[0]][15]
	## Get Article Title
# NOTE(review): everything from here down repeats the function, loop, list and
# lookups that appear earlier in this file; it looks like a paste artifact --
# confirm and delete one copy. The statements are re-indented here so the file
# parses (every line previously carried one stray tab, leaving the def without
# an indented body).
def get_title(topic_num, labels=None, articles=None):
  """Return the titles of the articles assigned to one cluster.

  Args:
    topic_num: Cluster label to look up.
    labels: Optional NumPy array with one cluster label per row of
      `articles`. Defaults to the module-level `km.labels_`.
    articles: Optional DataFrame with a "Title" column whose row order
      matches `labels`. Defaults to the module-level `x_train`.

  Returns:
    A list with the "Title" value of every row whose label equals
    `topic_num`, in row order; an empty list when no row matches.
  """
  if labels is None:
    labels = km.labels_
  if articles is None:
    articles = x_train

  # Row positions (not index labels) of the articles in this cluster.
  positions = np.where(labels == topic_num)[0]

  # Select the column once and fetch all matching rows in a single
  # positional lookup instead of materialising one row Series per article.
  return articles["Title"].iloc[positions].tolist()


# Print the top 10 words and the article titles for each cluster.
# Fetch the vocabulary once instead of once per printed word; the call is
# loop-invariant. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and only fall back on
# releases that predate it (assumes tfidf_vectorizer is a scikit-learn
# vectorizer -- confirm).
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
  feature_names = tfidf_vectorizer.get_feature_names_out()
else:
  feature_names = tfidf_vectorizer.get_feature_names()

# argsort() is ascending, so reverse each row and keep the first 10 columns:
# the indices of the 10 largest values per cluster centre, largest first.
top_word_idxs = lsa.inverse_transform(km.cluster_centers_).argsort()[:, ::-1][:, :10]

for i, x in enumerate(top_word_idxs):
  words = [feature_names[n] for n in x]
  print("Topic Words -- ", i)
  print(', '.join(words))

  print("Topic Title -- ", i)
  print(', '.join(get_title(i)))


# Cluster ids marked for removal (presumably matched against km.labels_ --
# confirm against the code that consumes this list).
# NOTE(review): 36 is listed twice; harmless for membership tests, but check
# whether a different id was intended.
toremove_list = [7, 9, 13, 15, 35, 36, 5, 6, 17, 18, 19, 21, 24, 25, 26, 27, 30, 31,
                 32, 33, 34, 36, 38, 40, 41, 42, 44, 46, 47, 49, 50, 51, 52, 54, 55, 56,
                 57, 58, 59]

## Titles within one particular cluster (label 5); bare expression, so the
## value is only displayed in an interactive session.
df.iloc[np.where(km.labels_ == 5)[0], :]["Title"]

## Get Article Text within cluster (label 66).
## NOTE(review): [15] indexes the resulting Series, which pandas resolves by
## index label when the index is integer-valued -- confirm that is intended
## rather than the positional .iloc[15].
df.iloc[np.where(km.labels_ == 66)[0], :]["cleaned_text"][15]