florianherrengt/cluster.py

## cluster.py
# BBC Dataset: http://mlg.ucd.ie/datasets/bbc.html

import os
import glob
import sys

import nltk
import numpy as np
import scipy as sp
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split

SEED = 2017
K = 10

# remove morphological affixes from words
# e.g generously -> generous
# http://www.nltk.org/howto/stem.html
english_stemmer = nltk.stem.SnowballStemmer('english')


# extends Tfidf with the stemmer
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer that stems every token produced by the analyzer.

    Stemming is done with the module-level ``english_stemmer``.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its stemmed tokens.

        The parent analyzer performs preprocessing, tokenization, stop-word
        removal and n-gram generation; each resulting token is then stemmed.
        """
        # Zero-argument super() resolves relative to *this* class.  The
        # previous super(TfidfVectorizer, self) skipped TfidfVectorizer in the
        # MRO and only worked because it does not override build_analyzer.
        analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Lazily stem each token; the consumer iterates over the result.
            return (english_stemmer.stem(token) for token in analyzer(doc))

        return stemmed_analyzer


# build workable dataset
# the dataset contains 5 topics in directories
# add the text to X and the topic to y
# X collects the raw article texts, y the topic (directory name) of each text.
X = []
y = []
for topic in ['business', 'entertainment', 'politics', 'sport', 'tech']:
    # sorted(): glob order depends on the filesystem, so sort to keep the
    # document order (and any seeded split built on it) stable.
    for path in sorted(glob.glob(os.path.join('bbc', topic, "*"))):
        try:
            # The context manager closes the handle even when decoding fails.
            with open(path, encoding='utf8') as handle:
                text = handle.read()
        except ValueError as e:
            # some text files can't be decoded
            # ('utf-8' codec can't decode): report and skip the file
            print(path, file=sys.stderr)
            print(e, file=sys.stderr)
            continue
        # Append text and topic together so X and y stay aligned.
        X.append(text)
        y.append(topic)

# Sanity check: there must be exactly one topic per text.
# If the lengths of the two lists differ, an AssertionError is raised.
assert len(X) == len(y), 'len(X) is not len(y)'

# Hold out 20% of the documents as a test set (the other 80% are for training).
# stratify=y keeps the topic distribution the same in both sets and
# random_state=SEED makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)


class Classifier(KMeans):
    """Topic classifier built on k-means clusters of stemmed TF-IDF vectors.

    ``fit`` vectorizes the training texts, clusters them and remembers the
    texts and their topics.  ``predict`` assigns a new text to a cluster and
    returns the topic of the closest training text inside that cluster.
    """

    def __init__(self, n_clusters, init, n_init, verbose):
        super().__init__(
            n_clusters=n_clusters,
            init=init,
            n_init=n_init,
            verbose=verbose
        )
        self.vectorizer = StemmedTfidfVectorizer(min_df=1, max_df=0.5, stop_words='english',
                                                 ngram_range=(1, 2), lowercase=True)

    def fit(self, X, y=None):
        """Vectorize and cluster the training texts.

        :param X: iterable of raw training texts.
        :param y: iterable of topics, one per text; required for ``predict``.
        :return: self
        :raises ValueError: if ``y`` is given and its length differs from ``X``.
        """
        # predict() looks topics up by training index, so keep both around
        # (previously they were read in predict() but never stored).
        self.X = list(X)
        self.y = None if y is None else list(y)
        if self.y is not None and len(self.X) != len(self.y):
            raise ValueError('len(X) is not len(y)')
        self.vectorized = self.vectorizer.fit_transform(self.X)
        super().fit(self.vectorized)
        return self

    def predict(self, X):
        """Predict the topic of one text or of every text in a collection.

        :param X: a single text (``str``) or an iterable of texts.
        :return: one topic for a ``str`` input, otherwise a numpy array with
            one topic per text (the form sklearn scorers expect).
        :raises ValueError: if the classifier was not fitted with topics.
        """
        if getattr(self, 'y', None) is None:
            raise ValueError('Classifier must be fitted with topics (y) before predict')

        single = isinstance(X, str)
        documents = [X] if single else list(X)
        if not documents:
            return np.array([])

        # Vectorize the texts passed in (not the global X_test, as before).
        vectors = self.vectorizer.transform(documents)
        cluster_labels = super().predict(vectors)
        topics = [
            self._closest_topic(vectors[row], label)
            for row, label in enumerate(cluster_labels)
        ]
        return topics[0] if single else np.array(topics)

    def _closest_topic(self, doc_vec, cluster_label):
        """Return the topic of the training text nearest to ``doc_vec`` in its cluster."""
        candidates = (self.labels_ == cluster_label).nonzero()[0]
        if candidates.size == 0:
            # No training text ended up in this cluster: search all of them
            # rather than failing on an empty candidate list.
            candidates = np.arange(self.vectorized.shape[0])

        def distance(index):
            # Euclidean distance between the new text and one training text.
            return np.linalg.norm((doc_vec - self.vectorized[index]).toarray())

        # maybe this should be a majority vote among the similar articles:
        # the closest text may belong to a topic that is rare in the cluster
        return self.y[min(candidates, key=distance)]

    def score(self, X, y):
        """Return the accuracy of ``predict`` on texts ``X`` with true topics ``y``."""
        # sanity check again
        assert len(X) == len(y), 'len(X) is not len(y)'
        return accuracy_score(y, self.predict(list(X)))


# One cluster per BBC topic, a single random initialisation, verbose output.
# (An earlier run on 100 train/test documents reportedly scored 0.86.)
clf = Classifier(n_clusters=5, init='random', n_init=1, verbose=1)

# Shuffled, stratified K-fold cross-validation over the whole dataset,
# seeded so the folds are reproducible; n_jobs=-1 uses every available core.
stratified_cv = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)
predicted = cross_val_score(
    clf,
    X,
    y,
    cv=stratified_cv.split(X, y),
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Report the per-fold accuracies followed by their mean.
print('{}-fold CV Accuracy: {}'.format(K, predicted))
print('Average CV Accuracy: {}'.format(np.mean(predicted)))
	# BBC Dataset: http://mlg.ucd.ie/datasets/bbc.html

	import os
	import glob
	import sys

	import nltk
	import numpy as np
	import scipy as sp
	from sklearn.cluster import KMeans
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics import accuracy_score
	from sklearn.model_selection import cross_val_score, StratifiedKFold
	from sklearn.model_selection import train_test_split

	SEED = 2017
	K = 10

	# remove morphological affixes from words
	# e.g generously -> generous
	# http://www.nltk.org/howto/stem.html
	english_stemmer = nltk.stem.SnowballStemmer('english')


	# extends Tfidf with the stemmer
	class StemmedTfidfVectorizer(TfidfVectorizer):
	    """TF-IDF vectorizer that stems every token produced by the analyzer.

	    Stemming is done with the module-level ``english_stemmer``.
	    """

	    def build_analyzer(self):
	        """Return a callable mapping a document to its stemmed tokens."""
	        # Zero-argument super() resolves relative to *this* class; the
	        # previous super(TfidfVectorizer, self) skipped TfidfVectorizer
	        # in the MRO.
	        analyzer = super().build_analyzer()

	        def stemmed_analyzer(doc):
	            # Lazily stem each token; the consumer iterates over the result.
	            return (english_stemmer.stem(token) for token in analyzer(doc))

	        return stemmed_analyzer


	# build workable dataset
	# the dataset contains 5 topics in directories
	# add the text to X and the topic to y
	# X collects the raw article texts, y the topic (directory name) of each text.
	X = []
	y = []
	for topic in ['business', 'entertainment', 'politics', 'sport', 'tech']:
	    # sorted(): glob order depends on the filesystem, so sort to keep the
	    # document order (and any seeded split built on it) stable.
	    for path in sorted(glob.glob(os.path.join('bbc', topic, "*"))):
	        try:
	            # The context manager closes the handle even when decoding fails.
	            with open(path, encoding='utf8') as handle:
	                text = handle.read()
	        except ValueError as e:
	            # some text files can't be decoded
	            # ('utf-8' codec can't decode): report and skip the file
	            print(path, file=sys.stderr)
	            print(e, file=sys.stderr)
	            continue
	        # Append text and topic together so X and y stay aligned.
	        X.append(text)
	        y.append(topic)

	# Sanity check: there must be exactly one topic per text.
	# If the lengths of the two lists differ, an AssertionError is raised.
	assert len(X) == len(y), 'len(X) is not len(y)'

	# Hold out 20% of the documents as a test set (the other 80% are for training).
	# stratify=y keeps the topic distribution the same in both sets and
	# random_state=SEED makes the split reproducible.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.2,
	    stratify=y,
	    random_state=SEED,
	)


	class Classifier(KMeans):
	    """Topic classifier built on k-means clusters of stemmed TF-IDF vectors.

	    ``fit`` vectorizes the training texts, clusters them and remembers the
	    texts and their topics.  ``predict`` assigns a new text to a cluster and
	    returns the topic of the closest training text inside that cluster.
	    """

	    def __init__(self, n_clusters, init, n_init, verbose):
	        super().__init__(
	            n_clusters=n_clusters,
	            init=init,
	            n_init=n_init,
	            verbose=verbose
	        )
	        self.vectorizer = StemmedTfidfVectorizer(min_df=1, max_df=0.5, stop_words='english',
	                                                 ngram_range=(1, 2), lowercase=True)

	    def fit(self, X, y=None):
	        """Vectorize and cluster the training texts.

	        :param X: iterable of raw training texts.
	        :param y: iterable of topics, one per text; required for ``predict``.
	        :return: self
	        :raises ValueError: if ``y`` is given and its length differs from ``X``.
	        """
	        # predict() looks topics up by training index, so keep both around
	        # (previously they were read in predict() but never stored).
	        self.X = list(X)
	        self.y = None if y is None else list(y)
	        if self.y is not None and len(self.X) != len(self.y):
	            raise ValueError('len(X) is not len(y)')
	        self.vectorized = self.vectorizer.fit_transform(self.X)
	        super().fit(self.vectorized)
	        return self

	    def predict(self, X):
	        """Predict the topic of one text or of every text in a collection.

	        :param X: a single text (``str``) or an iterable of texts.
	        :return: one topic for a ``str`` input, otherwise a numpy array with
	            one topic per text (the form sklearn scorers expect).
	        :raises ValueError: if the classifier was not fitted with topics.
	        """
	        if getattr(self, 'y', None) is None:
	            raise ValueError('Classifier must be fitted with topics (y) before predict')

	        single = isinstance(X, str)
	        documents = [X] if single else list(X)
	        if not documents:
	            return np.array([])

	        # Vectorize the texts passed in (not the global X_test, as before).
	        vectors = self.vectorizer.transform(documents)
	        cluster_labels = super().predict(vectors)
	        topics = [
	            self._closest_topic(vectors[row], label)
	            for row, label in enumerate(cluster_labels)
	        ]
	        return topics[0] if single else np.array(topics)

	    def _closest_topic(self, doc_vec, cluster_label):
	        """Return the topic of the training text nearest to ``doc_vec`` in its cluster."""
	        candidates = (self.labels_ == cluster_label).nonzero()[0]
	        if candidates.size == 0:
	            # No training text ended up in this cluster: search all of them
	            # rather than failing on an empty candidate list.
	            candidates = np.arange(self.vectorized.shape[0])

	        def distance(index):
	            # Euclidean distance between the new text and one training text.
	            return np.linalg.norm((doc_vec - self.vectorized[index]).toarray())

	        # maybe this should be a majority vote among the similar articles:
	        # the closest text may belong to a topic that is rare in the cluster
	        return self.y[min(candidates, key=distance)]

	    def score(self, X, y):
	        """Return the accuracy of ``predict`` on texts ``X`` with true topics ``y``."""
	        # sanity check again
	        assert len(X) == len(y), 'len(X) is not len(y)'
	        return accuracy_score(y, self.predict(list(X)))


	# One cluster per BBC topic, a single random initialisation, verbose output.
	# (An earlier run on 100 train/test documents reportedly scored 0.86.)
	clf = Classifier(n_clusters=5, init='random', n_init=1, verbose=1)

	# Shuffled, stratified K-fold cross-validation over the whole dataset,
	# seeded so the folds are reproducible; n_jobs=-1 uses every available core.
	stratified_cv = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)
	predicted = cross_val_score(
	    clf,
	    X,
	    y,
	    cv=stratified_cv.split(X, y),
	    scoring='accuracy',
	    n_jobs=-1,
	    verbose=1,
	)

	# Report the per-fold accuracies followed by their mean.
	print('{}-fold CV Accuracy: {}'.format(K, predicted))
	print('Average CV Accuracy: {}'.format(np.mean(predicted)))