Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Brown clustering
# Brown Clusters
# Algorithm 2 taken from (Slide 15): http://aritter.github.io/courses/5525_slides/brown.pdf
import numpy as np
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
# Load the corpus and collect corpus-wide bigram counts.
# remove= strips headers/footers so mailing-list boilerplate doesn't dominate.
newsgroups = fetch_20newsgroups(remove=("headers", "footers"))
vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=1)
X = vectorizer.fit_transform(newsgroups.data)
# Total count of each bigram feature across all documents.
bigram_counts = np.asarray(X.sum(axis=0)).ravel()

# Unigram frequency = sum of the counts of every bigram the word occurs in.
# (The original computed the bigram count `c` but never used it, adding 1 per
# bigram *type* instead — that ranked words by distinct-neighbour count, not
# by frequency, which is what the frequency-sorted id map below expects.)
unigram_counts = defaultdict(int)
# vocabulary_ maps "w1 w2" -> feature index; iterating it directly avoids
# CountVectorizer.get_feature_names(), which was removed in sklearn >= 1.2.
for bigram, idx in vectorizer.vocabulary_.items():
    c = bigram_counts[idx]
    w1, w2 = bigram.split()
    unigram_counts[w1] += c
    unigram_counts[w2] += c

# Assign integer ids by descending frequency: most frequent word gets id 0.
unigram2id = {
    word: i
    for i, (word, _count) in enumerate(
        sorted(unigram_counts.items(), key=lambda kv: kv[1], reverse=True)
    )
}
def get_cooccurence(bigram_counts, unigram2id, vectorizer):
    """Build a sparse word-word co-occurrence matrix from bigram counts.

    Parameters
    ----------
    bigram_counts : 1-D array of corpus-wide counts, indexed by the
        vectorizer's feature index.
    unigram2id : dict mapping each unigram string to its row/column id.
    vectorizer : fitted bigram CountVectorizer; only its ``vocabulary_``
        (a "w1 w2" -> feature-index dict) is used.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (vocab, vocab).  Each unordered word
    pair is stored once, on the upper triangle (the id pair is sorted so
    that row <= col); the lower triangle is left empty.
    """
    pair_counts = defaultdict(int)
    # Iterating vocabulary_.items() replaces get_feature_names(), which was
    # removed in sklearn >= 1.2.  Accumulation into the defaultdict is
    # order-independent, so the result is identical.
    for bigram, idx in vectorizer.vocabulary_.items():
        count = bigram_counts[idx]
        w1, w2 = bigram.split()
        row, col = sorted((unigram2id[w1], unigram2id[w2]))  # canonical key
        pair_counts[(row, col)] += count
    data, rows, cols = zip(*((v, r, c) for (r, c), v in pair_counts.items()))
    vocab_size = len(unigram2id)
    return csr_matrix((data, (rows, cols)), shape=(vocab_size, vocab_size))
# Build the (upper-triangular) word-word co-occurrence matrix.
cooccurence_matrix = get_cooccurence(bigram_counts, unigram2id, vectorizer)
# Bare expression: notebook-style cell output; a no-op when run as a script.
cooccurence_matrix.shape
## TODO: implement rest
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment