Skip to content

Instantly share code, notes, and snippets.

@baali
Last active May 28, 2016 05:16
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save baali/7983261 to your computer and use it in GitHub Desktop.
Save baali/7983261 to your computer and use it in GitHub Desktop.
MajorClust algorithm implementation using scikit-learn, based on a Stack Overflow discussion about clustering text in Python (http://stackoverflow.com/questions/1789254/clustering-text-in-python).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from itertools import combinations
from random import shuffle
def majorclust_sklearn(texts=None):
    """Cluster short texts with the MajorClust algorithm and print the result.

    Every text starts in its own cluster. The texts are then visited
    repeatedly in random order, and each one adopts the cluster label whose
    members have the largest summed TF-IDF similarity to it. The sweeps stop
    once a full pass changes no label.

    Args:
        texts: Optional iterable of strings to cluster. When omitted, a
            small built-in sample corpus is used.

    Returns:
        A dict mapping each cluster label (a NumPy integer) to the list of
        texts assigned to that cluster. An empty ``texts`` yields ``{}``.
    """
    if texts is None:
        texts = [
            "foo blub baz",
            "foo bar baz",
            "asdf bsdf csdf",
            "foo bab blub",
            "csdf hddf kjtz",
            "123 456 890",
            "321 890 456 foo",
            "123 890 uiop",
        ]
    else:
        # Materialize so the corpus can be indexed and iterated twice.
        texts = list(texts)
    if not texts:
        return {}

    corpus_mat = TfidfVectorizer().fit_transform(texts)
    num_of_samples = corpus_mat.shape[0]

    # Pairwise dot products of the TF-IDF rows act as edge weights. With
    # TfidfVectorizer's default L2 row normalization these are cosine
    # similarities (higher means more alike), not distances.
    similarities = linear_kernel(corpus_mat, corpus_mat)
    # A sample must not vote for its own current cluster.
    np.fill_diagonal(similarities, 0)

    labels = np.arange(num_of_samples)
    converged = False
    while not converged:
        converged = True
        visit_order = list(range(num_of_samples))
        shuffle(visit_order)
        for node in visit_order:
            # attraction[c] is the summed similarity between `node` and all
            # samples currently labelled c.
            attraction = np.bincount(labels, weights=similarities[node])
            # argmax over the bincount is itself a cluster label, not a
            # sample position, so it is assigned directly.
            best_label = np.argmax(attraction)
            # Move only on a strict improvement: isolated samples (all-zero
            # similarities) keep their own cluster, and ties do not cause
            # labels to flip back and forth between sweeps.
            if attraction[best_label] > attraction[labels[node]]:
                labels[node] = best_label
                converged = False

    clusters = {}
    for index, target in enumerate(labels):
        clusters.setdefault(target, []).append(texts[index])
    for cluster in clusters:
        print(80*"=")
        print("\n".join(clusters[cluster]))
    return clusters
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment