Skip to content

Instantly share code, notes, and snippets.

@florianherrengt
Last active April 12, 2017 16:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save florianherrengt/a5d773c58fd1edde634eacf41ac0878d to your computer and use it in GitHub Desktop.
Save florianherrengt/a5d773c58fd1edde634eacf41ac0878d to your computer and use it in GitHub Desktop.
Predict the topic of posts from the BBC dataset
# BBC Dataset: http://mlg.ucd.ie/datasets/bbc.html
import os
import glob
import sys
import nltk
import numpy as np
import scipy as sp
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
# Seed passed as random_state to the train/test split and to the
# cross-validation splitter so that both are reproducible.
SEED = 2017
# Number of folds used by the stratified cross-validation.
K = 10
# Stemmer used to remove morphological affixes from words,
# e.g. "generously" -> "generous".
# See http://www.nltk.org/howto/stem.html
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer output is passed through the stemmer.

    Every token produced by the parent analyzer is replaced by
    ``english_stemmer.stem(token)`` before the TF-IDF weights are computed.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to an iterable of stemmed tokens."""
        # Zero-argument super() starts the lookup at this class's parent.
        # The previous super(TfidfVectorizer, self) named the *parent* class,
        # which starts the lookup one level too high and silently skips any
        # build_analyzer defined on TfidfVectorizer itself.
        analyzer = super().build_analyzer()
        # NOTE(review): stemming is applied to whatever the parent analyzer
        # yields; with ngram_range > 1 that presumably includes whole n-gram
        # strings rather than single words -- confirm this is intended.
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
# Build a workable dataset.
# The dataset contains 5 topics, one directory per topic: the text of every
# file is appended to X and the name of its topic directory to y.
X = []
y = []
for topic in ['business', 'entertainment', 'politics', 'sport', 'tech']:
    # glob.glob() returns paths in arbitrary order; sorting keeps the document
    # order (and therefore every seeded split of X/y) reproducible.
    for path in sorted(glob.glob(os.path.join('bbc', topic, "*"))):
        try:
            # The context manager closes the file even when decoding fails.
            with open(path, encoding='utf8') as handle:
                text = handle.read()
        except ValueError as e:
            # Some text files can't be decoded ("'utf-8' codec can't decode");
            # UnicodeDecodeError is a ValueError. Report the file and skip it.
            print(path, file=sys.stderr)
            print(e, file=sys.stderr)
            continue
        # Append text and topic together so X and y can never get out of step.
        X.append(text)
        y.append(topic)
# Sanity check: make sure we have as many texts as topics.
# If the lengths of both lists differ, an AssertionError is raised.
assert len(X) == len(y), 'len(X) is not len(y)'
# Hold out 20% of the documents as a test set; the remaining 80% form the
# training set. Stratifying on y keeps the topic distribution equivalent in
# both sets, and the fixed SEED makes the split reproducible.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)
class Classifier(KMeans):
    """Topic classifier built on K-means clustering of stemmed TF-IDF vectors.

    ``fit`` vectorizes the training documents and clusters them. ``predict``
    assigns a new document to a cluster and returns the topic of the closest
    training document (Euclidean distance) within that cluster.
    """

    def __init__(self, n_clusters, init, n_init, verbose, random_state=None):
        """Configure the underlying KMeans and create the text vectorizer.

        Args:
            n_clusters: forwarded to KMeans.
            init: forwarded to KMeans.
            n_init: forwarded to KMeans.
            verbose: forwarded to KMeans.
            random_state: optional seed forwarded to KMeans so that clustering
                can be made reproducible; defaults to None (unseeded).
        """
        super().__init__(
            n_clusters=n_clusters,
            init=init,
            n_init=n_init,
            verbose=verbose,
            random_state=random_state,
        )
        self.vectorizer = StemmedTfidfVectorizer(min_df=1, max_df=0.5, stop_words='english',
                                                 ngram_range=(1, 2), lowercase=True)

    def fit(self, X, y=None):
        """Vectorize and cluster the training documents.

        Args:
            X: iterable of raw text documents.
            y: iterable of topics, one per document. Optional for clustering
                alone, but required before ``predict`` can return topics.

        Returns:
            self

        Raises:
            ValueError: if y is given and its length differs from X.
        """
        docs = list(X)
        topics = None if y is None else list(y)
        if topics is not None and len(topics) != len(docs):
            raise ValueError('len(X) is not len(y)')
        # predict() looks up the topic of the nearest training document, so
        # the training data has to be remembered on the instance.
        self.X = docs
        self.y = topics
        self.vectorized = self.vectorizer.fit_transform(docs)
        super().fit(self.vectorized)
        return self

    def predict(self, X):
        """Predict the topic of one document or of each document in a batch.

        Args:
            X: a single text (str) or an iterable of texts.

        Returns:
            The predicted topic when X is a str; otherwise a numpy array with
            one predicted topic per document, which is the shape sklearn
            scorers such as ``scoring='accuracy'`` expect from ``predict``.

        Raises:
            ValueError: if the classifier was not fitted with topics.
        """
        if isinstance(X, str):
            return self._predict_one(X)
        return np.array([self._predict_one(doc) for doc in X])

    def _predict_one(self, doc):
        """Return the topic of the training document closest to ``doc``."""
        if getattr(self, 'y', None) is None:
            raise ValueError('Classifier must be fitted with topics (y) before predicting')
        # Create a vector from the text that was passed in (the original code
        # mistakenly vectorized the global X_test here).
        new_post_vec = self.vectorizer.transform([doc])
        new_post_label = super().predict(new_post_vec)[0]
        similar_indices = (self.labels_ == new_post_label).nonzero()[0]
        if similar_indices.size == 0:
            # No training document in that cluster: search all of them rather
            # than failing on an empty candidate list.
            similar_indices = np.arange(len(self.y))

        def distance(i):
            # Euclidean (Frobenius) norm of the difference of the two vectors.
            return np.linalg.norm((new_post_vec - self.vectorized[i]).toarray())

        # min() returns the first index with the smallest distance, the same
        # element a stable sort by distance would put first.
        # TODO: maybe vote among several close articles instead of trusting
        # only the single closest one.
        closest = min(similar_indices, key=distance)
        return self.y[closest]

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the expected topics y."""
        # sanity check again
        assert len(X) == len(y), 'len(X) is not len(y)'
        return accuracy_score(y, self.predict(X))
# Classifier under evaluation: 5 clusters, a single randomly initialised
# K-means run, verbose output enabled.
clf = Classifier(
    n_clusters=5,
    init='random',
    n_init=1,
    verbose=1,
)
# Earlier manual experiments, kept for reference:
#   clf.fit(X_train[:100], y_train[:100])
#   print('Score: ', clf.score(X_test[:100], y_test[:100]))  # Score: 0.86
#   predicted = cross_val_score(clf, X_train + X_test, y_train + y_test, cv=5,
#                               n_jobs=multiprocessing.cpu_count(), verbose=1)

# Stratified, shuffled K-fold cross-validation over the whole dataset,
# scored by accuracy and run on all available cores.
stratified_cv = StratifiedKFold(
    n_splits=K,
    shuffle=True,
    random_state=SEED,
)
predicted = cross_val_score(
    clf,
    X,
    y,
    cv=stratified_cv.split(X, y),
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
# Per-fold accuracies first, then their mean.
print('{}-fold CV Accuracy: {}'.format(K, predicted))
print('Average CV Accuracy: {}'.format(np.mean(predicted)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment