Last active
November 20, 2019 12:25
-
-
Save urigoren/7a35f42a863d049c7555bd9729e699e4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import collections, itertools, string | |
from scipy.cluster import hierarchy | |
from scipy.spatial import distance | |
from sklearn.feature_extraction import text | |
from editdistance import distance as editdistance | |
def edit_pdist(toks, normalize=False):
    """Return the pairwise edit-distance matrix in scipy condensed form.

    Parameters
    ----------
    toks : sequence of sequences
        Tokenized samples; edit distance is computed between every pair.
    normalize : bool
        If True, divide each distance by the length of the longer of the
        two sequences, yielding values in [0, 1].

    Returns
    -------
    1-D numpy array (condensed distance vector), directly usable by
    scipy.cluster.hierarchy.linkage.
    """
    n = len(toks)
    ret = np.zeros((n, n))
    # Each unordered pair is computed once and mirrored into both halves.
    for i, j in itertools.combinations(range(n), 2):
        x, y = toks[i], toks[j]
        v = editdistance(x, y)
        if normalize:
            longest = max(len(x), len(y))
            # Two empty sequences have distance 0; avoid dividing 0 by 0.
            v = v / longest if longest else 0.0
        ret[j, i] = v
        ret[i, j] = v
    return distance.squareform(ret)
def hier_cluster(X, cutoff=0.1, metric='jaccard', bound='average'):
    """Apply hierarchical clustering and return a flat cluster index per sample.

    Parameters
    ----------
    X : array-like, scipy sparse matrix, or condensed distance vector
        Observations (when `metric` names a distance) or precomputed
        condensed distances (when `metric` is 'precomputed' or falsy).
    cutoff : float
        Distance threshold handed to fcluster.
    metric : str
        Pairwise distance metric, or 'precomputed' to treat X as distances.
    bound : str
        Linkage method ('average', 'single', ...).

    Returns
    -------
    numpy array of integer cluster labels, one per sample.
    """
    if hasattr(X, "todense"):
        # .todense() yields np.matrix, which newer scipy linkage rejects;
        # convert to a plain ndarray instead.
        X = np.asarray(X.todense())
    if metric and metric != 'precomputed':
        Z = hierarchy.linkage(X, bound, metric=metric)
    else:
        # X is already a (condensed) distance matrix.
        Z = hierarchy.linkage(X, bound)
    C = hierarchy.fcluster(Z, cutoff, criterion="distance")
    return C
def collect_clusters(Y, sentences):
    """Group samples by cluster label; return (size, sorted members) pairs,
    largest cluster first."""
    buckets = collections.defaultdict(list)
    for label, sentence in zip(Y, sentences):
        buckets[label].append(sentence)
    # Stable sort keeps insertion order among equally-sized clusters.
    ordered = sorted(buckets.values(), key=len, reverse=True)
    return [(len(members), sorted(members)) for members in ordered]
def bow_cluster(sentences, cutoff=0.9, min_df=10, max_df=0.05, ngram_range=(1,4), metric='jaccard'):
    """Cluster sentences via their bag-of-words (n-gram count) vectors."""
    counter = text.CountVectorizer(min_df=min_df, max_df=max_df, lowercase=True, ngram_range=ngram_range)
    counts = counter.fit_transform(sentences)
    labels = hier_cluster(counts, cutoff, metric)
    return collect_clusters(labels, sentences)
def edit_distance_cluster(sentences, cutoff=3, min_df=1, max_df=0.01):
    """Cluster sentences by edit distance over vocabulary token ids."""
    # Fit a vocabulary only to map words to integer ids; -1 marks OOV words.
    fitter = text.CountVectorizer(min_df=min_df, max_df=max_df, lowercase=True)
    fitter.fit(sentences)
    vocab = fitter.vocabulary_
    token_ids = []
    for sentence in sentences:
        token_ids.append([vocab.get(word, -1) for word in sentence.split()])
    pairwise = edit_pdist(token_ids)
    labels = hier_cluster(pairwise, cutoff, 'precomputed')
    return collect_clusters(labels, sentences)
def edit_distance_cluster_simple(sentences, cutoff=3):
    """Cluster sentences by word-level edit distance, without word removal."""
    # One translate() pass: punctuation becomes spaces, uppercase folds to lowercase.
    table = {ord(ch): ' ' for ch in string.punctuation}
    table.update({ord(ch): ch.lower() for ch in string.ascii_uppercase})
    tokenized = [[word for word in sentence.translate(table).split() if word]
                 for sentence in sentences]
    pairwise = edit_pdist(tokenized)
    labels = hier_cluster(pairwise, cutoff, 'precomputed', 'single')
    return collect_clusters(labels, sentences)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A shortcut for the lazy: