Last active
November 20, 2019 12:25
-
-
Save urigoren/7a35f42a863d049c7555bd9729e699e4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import collections, itertools, string | |
from scipy.cluster import hierarchy | |
from scipy.spatial import distance | |
from sklearn.feature_extraction import text | |
from editdistance import distance as editdistance | |
def edit_pdist(toks, normalize=False):
    """Return the pairwise edit-distance matrix in scipy condensed form.

    Parameters
    ----------
    toks : sequence of sequences
        Tokenized samples; edit distance is computed between every pair.
    normalize : bool
        If True, divide each distance by the length of the longer of the
        two sequences, yielding values in [0, 1].

    Returns
    -------
    1-D numpy array (condensed distance vector), directly usable by
    scipy.cluster.hierarchy.linkage.
    """
    n = len(toks)
    ret = np.zeros((n, n))
    # Each unordered pair is computed once and mirrored into both halves.
    for i, j in itertools.combinations(range(n), 2):
        x, y = toks[i], toks[j]
        v = editdistance(x, y)
        if normalize:
            longest = max(len(x), len(y))
            # Two empty sequences have distance 0; avoid dividing 0 by 0.
            v = v / longest if longest else 0.0
        ret[j, i] = v
        ret[i, j] = v
    return distance.squareform(ret)
def hier_cluster(X, cutoff=0.1, metric='jaccard', bound='average'):
    """Apply hierarchical clustering and return a flat cluster index per sample.

    Parameters
    ----------
    X : array-like, scipy sparse matrix, or condensed distance vector
        Observations (when `metric` names a distance) or precomputed
        condensed distances (when `metric` is 'precomputed' or falsy).
    cutoff : float
        Distance threshold handed to fcluster.
    metric : str
        Pairwise distance metric, or 'precomputed' to treat X as distances.
    bound : str
        Linkage method ('average', 'single', ...).

    Returns
    -------
    numpy array of integer cluster labels, one per sample.
    """
    if hasattr(X, "todense"):
        # .todense() yields np.matrix, which newer scipy linkage rejects;
        # convert to a plain ndarray instead.
        X = np.asarray(X.todense())
    if metric and metric != 'precomputed':
        Z = hierarchy.linkage(X, bound, metric=metric)
    else:
        # X is already a (condensed) distance matrix.
        Z = hierarchy.linkage(X, bound)
    C = hierarchy.fcluster(Z, cutoff, criterion="distance")
    return C
def collect_clusters(Y, sentences):
    """Group samples by cluster label; return (size, sorted members) pairs,
    largest cluster first."""
    buckets = collections.defaultdict(list)
    for label, sentence in zip(Y, sentences):
        buckets[label].append(sentence)
    # Stable sort keeps insertion order among equally-sized clusters.
    ordered = sorted(buckets.values(), key=len, reverse=True)
    return [(len(members), sorted(members)) for members in ordered]
def bow_cluster(sentences, cutoff=0.9, min_df=10, max_df=0.05, ngram_range=(1,4), metric='jaccard'):
    """Cluster sentences via their bag-of-words (n-gram count) vectors."""
    counter = text.CountVectorizer(min_df=min_df, max_df=max_df, lowercase=True, ngram_range=ngram_range)
    counts = counter.fit_transform(sentences)
    labels = hier_cluster(counts, cutoff, metric)
    return collect_clusters(labels, sentences)
def edit_distance_cluster(sentences, cutoff=3, min_df=1, max_df=0.01):
    """Cluster sentences by edit distance over vocabulary token ids."""
    # Fit a vocabulary only to map words to integer ids; -1 marks OOV words.
    fitter = text.CountVectorizer(min_df=min_df, max_df=max_df, lowercase=True)
    fitter.fit(sentences)
    vocab = fitter.vocabulary_
    token_ids = []
    for sentence in sentences:
        token_ids.append([vocab.get(word, -1) for word in sentence.split()])
    pairwise = edit_pdist(token_ids)
    labels = hier_cluster(pairwise, cutoff, 'precomputed')
    return collect_clusters(labels, sentences)
def edit_distance_cluster_simple(sentences, cutoff=3):
    """Cluster sentences by word-level edit distance, without word removal."""
    # One translate() pass: punctuation becomes spaces, uppercase folds to lowercase.
    table = {ord(ch): ' ' for ch in string.punctuation}
    table.update({ord(ch): ch.lower() for ch in string.ascii_uppercase})
    tokenized = [[word for word in sentence.translate(table).split() if word]
                 for sentence in sentences]
    pairwise = edit_pdist(tokenized)
    labels = hier_cluster(pairwise, cutoff, 'precomputed', 'single')
    return collect_clusters(labels, sentences)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A shortcut for the lazy: