Skip to content

Instantly share code, notes, and snippets.

Last active May 28, 2016
What would you like to do?
MajorClust algorithm implementation using sklearn, based on a Stack Overflow conversation about text clustering using Python.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from itertools import combinations
from random import shuffle
def majorclust_sklearn(texts=None):
    """Cluster short texts with the MajorClust algorithm.

    Each text is turned into a TF-IDF vector and every text starts in its
    own cluster.  Texts are then visited repeatedly, in random order, and
    each one adopts the cluster label whose members have the largest summed
    similarity to it.  The process stops after a full pass in which no text
    changed its label.

    Args:
        texts: Optional iterable of strings to cluster.  When omitted, a
            small built-in sample corpus is used.

    Returns:
        A dict mapping each surviving cluster label (a NumPy integer) to the
        list of texts assigned to it, in input order.  An empty input yields
        an empty dict.
    """
    if texts is None:
        texts = [
            "foo blub baz",
            "foo bar baz",
            "asdf bsdf csdf",
            "foo bab blub",
            "csdf hddf kjtz",
            "123 456 890",
            "321 890 456 foo",
            "123 890 uiop",
        ]
    else:
        # Materialise so that arbitrary iterables can be indexed below.
        texts = list(texts)
    if not texts:
        # Nothing to vectorise; avoid handing an empty corpus to sklearn.
        return {}

    corpus_mat = TfidfVectorizer().fit_transform(texts)
    num_of_samples = corpus_mat.shape[0]

    # TfidfVectorizer L2-normalises rows by default, so the plain dot
    # product computed by linear_kernel is the cosine similarity.
    similarities = linear_kernel(corpus_mat)
    # A text must not vote for its own cluster.
    np.fill_diagonal(similarities, 0)

    labels = np.arange(num_of_samples)
    changed = True
    while changed:
        changed = False
        visit_order = list(range(num_of_samples))
        shuffle(visit_order)
        for node in visit_order:
            # Aggregate edge weights per cluster label: attraction[k] is the
            # summed similarity between this text and all texts labelled k.
            attraction = np.bincount(labels, weights=similarities[node])
            best_label = np.argmax(attraction)
            current_label = labels[node]
            # best_label is a cluster label, not a node position, so it is
            # assigned directly.  Switching only on a strict improvement
            # keeps texts with no similar neighbour (all-zero attraction)
            # in their own cluster and avoids flip-flopping on ties.
            if attraction[best_label] > attraction[current_label]:
                labels[node] = best_label
                changed = True

    clusters = {}
    for position, label in enumerate(labels):
        clusters.setdefault(label, []).append(texts[position])
    return clusters
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment