"""MajorClust algorithm implementation using sklearn, based on a Stack Overflow conversation about text clustering in Python."""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from itertools import combinations
from random import shuffle
def majorclust_sklearn(texts=None):
    """Cluster short texts with the MajorClust label-propagation scheme.

    Every text starts in its own cluster. Texts are then visited repeatedly
    in random order, and each one adopts the cluster label whose current
    members have the largest summed TF-IDF cosine similarity to it. The
    loop stops after a full pass in which no text changed its label.

    Args:
        texts: Optional iterable of strings to cluster. When omitted, a
            small built-in demo corpus is used.

    Returns:
        A dict mapping an integer cluster label to the list of texts that
        ended up in that cluster. An empty input yields an empty dict.
    """
    if texts is None:
        texts = [
            "foo blub baz",
            "foo bar baz",
            "asdf bsdf csdf",
            "foo bab blub",
            "csdf hddf kjtz",
            "123 456 890",
            "321 890 456 foo",
            "123 890 uiop",
        ]
    else:
        texts = list(texts)

    # Nothing to vectorize; fitting an empty corpus would raise.
    if not texts:
        return {}

    corpus_mat = TfidfVectorizer().fit_transform(texts)
    num_of_samples = corpus_mat.shape[0]

    # TfidfVectorizer L2-normalises rows by default, so the plain dot
    # product computed by linear_kernel is the cosine similarity.
    similarities = linear_kernel(corpus_mat, corpus_mat)
    # A text must not vote for its own current cluster.
    np.fill_diagonal(similarities, 0)

    labels = np.arange(num_of_samples)
    visit_order = list(range(num_of_samples))
    converged = False
    while not converged:
        converged = True
        # Visit nodes in a fresh random order on every pass.
        shuffle(visit_order)
        for node in visit_order:
            # Aggregate edge weights per cluster label: entry k is the total
            # similarity between this node and the members of cluster k.
            weights = np.bincount(labels, weights=similarities[node])
            best_label = np.argmax(weights)
            if weights[best_label] <= 0:
                # No similar neighbour at all: keep the node in its own
                # cluster instead of letting argmax default to label 0.
                continue
            # best_label is already a cluster label (an index into the
            # bincount result), so it is assigned directly rather than
            # being used to look up another node's label.
            if best_label != labels[node]:
                labels[node] = best_label
                converged = False

    clusters = {}
    for position, label in enumerate(labels):
        clusters.setdefault(int(label), []).append(texts[position])

    # NOTE(review): the body of this loop was missing from the reviewed
    # source; printing each cluster's members is an assumed reconstruction.
    for members in clusters.values():
        print("=" * 80)
        print("\n".join(members))

    return clusters
