Skip to content

Instantly share code, notes, and snippets.

@urigoren
Last active November 20, 2019 12:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save urigoren/7a35f42a863d049c7555bd9729e699e4 to your computer and use it in GitHub Desktop.
Save urigoren/7a35f42a863d049c7555bd9729e699e4 to your computer and use it in GitHub Desktop.
import numpy as np
import collections, itertools, string
from scipy.cluster import hierarchy
from scipy.spatial import distance
from sklearn.feature_extraction import text
from editdistance import distance as editdistance
def edit_pdist(toks, normalize=False):
    """Return the pairwise edit-distance matrix in condensed (squareform) format.

    Parameters
    ----------
    toks : sequence of sequences
        Items to compare (e.g. strings or lists of token ids).
    normalize : bool
        If True, divide each distance by the length of the longer item.

    Returns
    -------
    numpy.ndarray
        1-D condensed distance matrix, as accepted by scipy's linkage.
    """
    n = len(toks)
    ret = np.zeros((n, n))
    # The matrix is symmetric with a zero diagonal, so compute each
    # unordered pair exactly once instead of skipping half a full n*n loop.
    for i, j in itertools.combinations(range(n), 2):
        x, y = toks[i], toks[j]
        v = editdistance(x, y)
        if normalize:
            # Guard with 1 so two empty items don't raise ZeroDivisionError.
            v /= max(len(x), len(y), 1)
        ret[i, j] = v
        ret[j, i] = v
    return distance.squareform(ret)
def hier_cluster(X, cutoff=0.1, metric='jaccard', bound='average'):
    """Apply hierarchical clustering and return a flat cluster index per sample.

    Parameters
    ----------
    X : array-like or scipy sparse matrix
        Observation matrix (n_samples x n_features), or — when `metric` is
        'precomputed' or falsy — an already-computed (condensed) distance matrix.
    cutoff : float
        Distance threshold passed to fcluster (criterion="distance").
    metric : str
        Distance metric for linkage; 'precomputed' (or a falsy value) means
        X is already a distance matrix.
    bound : str
        Linkage method ('average', 'single', ...).

    Returns
    -------
    numpy.ndarray of cluster indices, one per sample.
    """
    # Densify sparse input (e.g. CountVectorizer output). Prefer toarray():
    # todense() returns an np.matrix, while linkage wants a plain ndarray.
    if hasattr(X, "toarray"):
        X = X.toarray()
    elif hasattr(X, "todense"):
        X = np.asarray(X.todense())
    if metric and metric != 'precomputed':
        Z = hierarchy.linkage(X, bound, metric=metric)
    else:
        # X is already a distance matrix; no metric is applied.
        Z = hierarchy.linkage(X, bound)
    return hierarchy.fcluster(Z, cutoff, criterion="distance")
def collect_clusters(Y, sentences):
    """Group samples by cluster index.

    Parameters
    ----------
    Y : iterable of cluster labels, parallel to `sentences`.
    sentences : iterable of samples.

    Returns
    -------
    list of (cluster_size, sorted_members) tuples, largest cluster first.
    """
    buckets = collections.defaultdict(list)
    for label, sentence in zip(Y, sentences):
        buckets[label].append(sentence)
    # Python's sort is stable even with reverse=True, so equal-sized
    # clusters keep their first-seen order.
    ordered = sorted(buckets.values(), key=len, reverse=True)
    return [(len(members), sorted(members)) for members in ordered]
def bow_cluster(sentences, cutoff=0.9, min_df=10, max_df=0.05, ngram_range=(1,4), metric='jaccard'):
    """Cluster sentences by their bag-of-words (n-gram) representation.

    Vectorizes the sentences into a sparse document-term matrix, clusters it
    hierarchically, and returns (cluster_size, sorted_sentences) tuples,
    largest cluster first.
    """
    cv = text.CountVectorizer(
        min_df=min_df,
        max_df=max_df,
        lowercase=True,
        ngram_range=ngram_range,
    )
    doc_term = cv.fit_transform(sentences)
    labels = hier_cluster(doc_term, cutoff, metric)
    return collect_clusters(labels, sentences)
def edit_distance_cluster(sentences, cutoff=3, min_df=1, max_df=0.01):
    """Cluster sentences by token-level edit distance.

    Each word is replaced by its vocabulary id (words filtered out by
    min_df/max_df are absent from the vocabulary); every out-of-vocabulary
    word maps to -1, so such words compare as equal to one another.
    Returns (cluster_size, sorted_sentences) tuples, largest cluster first.
    """
    cv = text.CountVectorizer(min_df=min_df, max_df=max_df, lowercase=True)
    cv.fit(sentences)
    vocab = cv.vocabulary_
    encoded = [[vocab.get(word, -1) for word in sent.split()] for sent in sentences]
    pairwise = edit_pdist(encoded)
    labels = hier_cluster(pairwise, cutoff, 'precomputed')
    return collect_clusters(labels, sentences)
def edit_distance_cluster_simple(sentences, cutoff=3):
    """Cluster sentences by token-level edit distance, without word removal.

    Punctuation is replaced by spaces and ASCII uppercase letters are
    lowercased before tokenizing; single-linkage clustering is used.
    Returns (cluster_size, sorted_sentences) tuples, largest cluster first.
    """
    # Translation table: punctuation -> space, ASCII uppercase -> lowercase.
    table = {ord(ch): ' ' for ch in string.punctuation}
    table.update({ord(ch): ch.lower() for ch in string.ascii_uppercase})
    tokenized = [
        [tok for tok in sent.translate(table).split() if tok]
        for sent in sentences
    ]
    pairwise = edit_pdist(tokenized)
    labels = hier_cluster(pairwise, cutoff, 'precomputed', 'single')
    return collect_clusters(labels, sentences)
@urigoren
Copy link
Author

A shortcut for the lazy — note that `exec` on code fetched over the network runs whatever the URL happens to serve, so only do this with a source you fully trust:

from urllib.request import urlopen
gist = "https://gist.githubusercontent.com/urigoren/7a35f42a863d049c7555bd9729e699e4/raw/78d287042c6f0cb240757b2416b273007457662a/AgglomerativeClustering_text.py"
exec(urlopen(gist).read().decode("utf8"))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment