# Brown clustering
# Brown Clusters
# Algorithm 2 taken from (Slide 15): http://aritter.github.io/courses/5525_slides/brown.pdf
from collections import defaultdict

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
# Load the corpus and count every bigram over all documents.
newsgroups = fetch_20newsgroups(remove=("headers", "footers"))
vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=1)
X = vectorizer.fit_transform(newsgroups.data)
# Corpus-wide frequency of each bigram (column sums of the doc-term matrix).
bigram_counts = np.asarray(X.sum(axis=0)).ravel()

# Unigram frequencies, accumulated from the bigram counts.
unigram_counts = defaultdict(int)
for k in vectorizer.get_feature_names():
    c = bigram_counts[vectorizer.vocabulary_[k]]
    k1, k2 = k.split()
    # BUG FIX: the original incremented by 1 per bigram *type* and never used
    # the computed count c; weight each word by the bigram's corpus frequency,
    # matching how get_cooccurence() below uses c.
    unigram_counts[k1] += c
    unigram_counts[k2] += c

# Map each unigram to an integer id, most frequent word first
# (Brown clustering seeds clusters from the most frequent words).
unigram2id = {
    word: i
    for i, (word, _count) in enumerate(
        sorted(unigram_counts.items(), key=lambda item: item[1], reverse=True)
    )
}
def get_cooccurence(bigram_counts, unigram2id, vectorizer):
    """Build a sparse word-by-word co-occurrence matrix from bigram counts.

    Each bigram feature contributes its corpus count to a single cell: the
    two word ids are sorted so (i, j) with i <= j is the unique key, i.e. the
    counts live in the upper triangle of the returned matrix.

    Args:
        bigram_counts: per-bigram corpus counts, indexable by the
            vectorizer's feature index.
        unigram2id: mapping from word string to integer row/column id.
        vectorizer: fitted CountVectorizer-like object exposing
            ``get_feature_names()`` and ``vocabulary_``.

    Returns:
        A ``csr_matrix`` of shape (vocab_size, vocab_size).
    """
    pair_totals = defaultdict(int)
    for feature in vectorizer.get_feature_names():
        count = bigram_counts[vectorizer.vocabulary_[feature]]
        left, right = feature.split()
        # Sort the two ids so (row, col) with row <= col is the unique key.
        row, col = sorted((unigram2id[left], unigram2id[right]))
        pair_totals[(row, col)] += count
    # NOTE: like the original, this raises if pair_totals is empty.
    data, rows, cols = zip(
        *((total, rc[0], rc[1]) for rc, total in pair_totals.items())
    )
    size = len(unigram2id)
    return csr_matrix((data, (rows, cols)), shape=(size, size))
# Materialize the co-occurrence matrix for the whole vocabulary.
cooccurence_matrix = get_cooccurence(bigram_counts, unigram2id, vectorizer)
# Notebook-style inspection of the result's dimensions (no effect as a script).
cooccurence_matrix.shape
## TODO: implement rest