Last active
June 27, 2016 15:32
-
-
Save i5on9i/862fe1dd7ce83fc0bb91c6da367bc6ba to your computer and use it in GitHub Desktop.
how to use out-of-core ver. naiveBayes of scikit-learn library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
======================================================
Out-of-core classification of text documents
======================================================

Simplified version of the scikit-learn example at
http://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html
"""
from __future__ import print_function | |
from glob import glob | |
import itertools | |
import os.path | |
import re | |
import time | |
import numpy as np | |
from sklearn.externals.six.moves import html_parser | |
from sklearn.externals.six.moves import urllib | |
from sklearn.datasets import get_data_home | |
from sklearn.feature_extraction.text import HashingVectorizer | |
from sklearn.naive_bayes import MultinomialNB | |
class PubAnnounceParser(html_parser.HTMLParser):
    """Parse HTML byte chunks and yield one document per ``</html>`` tag.

    All character data seen since the start of the current ``parse`` call
    (or since the previous ``</html>``) is accumulated in ``self.body``.
    When ``</html>`` is reached, the body is whitespace-normalised and
    emitted as ``{'title': '', 'body': <text>}``.

    Tag events are dispatched to optional ``start_<tag>(attrs)`` and
    ``end_<tag>()`` methods; tags without such a method are ignored.

    Parameters
    ----------
    encoding : str
        Codec used to decode every byte chunk passed to ``parse``.
    """

    def __init__(self, encoding='utf-8'):
        html_parser.HTMLParser.__init__(self)
        self.encoding = encoding
        # Initialise here as well so that feed() can be used directly
        # without going through parse() first.
        self.docs = []
        self._reset()

    def handle_starttag(self, tag, attrs):
        # Dispatch to start_<tag>(attrs) if this class defines it.
        handler = getattr(self, 'start_' + tag, None)
        if handler is not None:
            handler(attrs)

    def handle_endtag(self, tag):
        # Dispatch to end_<tag>() if this class defines it.
        handler = getattr(self, 'end_' + tag, None)
        if handler is not None:
            handler()

    def _reset(self):
        """Clear the text accumulated for the document being built."""
        self.body = ''

    def _take_docs(self):
        """Return the finished documents and empty the pending list."""
        finished, self.docs = self.docs, []
        return finished

    def parse(self, fd):
        """Decode and parse ``fd``, yielding documents as they complete.

        Parameters
        ----------
        fd : iterable of bytes
            For example a file opened in binary mode (iterated by line).

        Yields
        ------
        dict
            ``{'title': '', 'body': str}`` for every ``</html>`` encountered.
        """
        # The same parser instance is reused for many inputs.  Start each
        # one from a clean state so that text left over from a previous
        # input (e.g. trailing data after </html>, or an input that never
        # closed its <html> element) cannot leak into this one.
        self.reset()
        self._reset()
        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            for doc in self._take_docs():
                yield doc
        # close() forces processing of any buffered data, which may
        # complete further documents; emit those too.
        self.close()
        for doc in self._take_docs():
            yield doc

    def handle_data(self, data):
        self.body += data

    # ----------------------------------------------------
    # handle_starttag and handle_endtag related methods
    # ----------------------------------------------------

    def start_html(self, attributes):
        pass

    def end_html(self):
        # Collapse every whitespace run to a single space, then emit.
        self.body = re.sub(r'\s+', r' ', self.body)
        self.docs.append({'title': '',
                          'body': self.body})
        self._reset()
def streamPubAnounceDocs(data_path=None):
    """Iterate over documents of the Stock Public announcement dataset.

    Every ``*.htm`` file directly inside ``data_path`` is parsed with
    ``PubAnnounceParser``.  Documents are dictionaries with the keys
    'title' (str) and 'body' (str).

    Parameters
    ----------
    data_path : str or None
        Directory containing the ``*.htm`` files.  Defaults to ``"pubann"``
        (relative to the current working directory) when None.
    """
    if data_path is None:
        data_path = "pubann"

    parser = PubAnnounceParser()
    # Sort so the stream order does not depend on the file system.
    for filename in sorted(glob(os.path.join(data_path, "*.htm"))):
        # Close each file as soon as it has been consumed.
        with open(filename, 'rb') as fd:
            for doc in parser.parse(fd):
                yield doc
positive_class = 'any'


def get_minibatch(doc_iter, size, pos_class=positive_class,
                  neg_prefix=u'Swim'):
    """Extract a minibatch of examples, return a tuple X_text, y.

    Up to ``size`` documents are consumed from ``doc_iter``; none are
    filtered out.

    Parameters
    ----------
    doc_iter : iterator of dict
        Documents with 'title' and 'body' keys.
    size : int
        Maximum number of documents to take.
    pos_class : str
        Accepted for interface compatibility; not used for labelling.
    neg_prefix : str
        A document is labelled negative (0) when its body contains this
        text starting at offset 1; otherwise it is labelled positive (1).

    Returns
    -------
    (X_text, y)
        ``X_text`` is a tuple of ``title + '\\n\\n' + body`` strings and
        ``y`` an integer array of 0/1 labels.  Both are empty integer
        arrays when ``doc_iter`` is exhausted.
    """
    data = []
    for doc in itertools.islice(doc_iter, size):
        # The original compared a 2-character slice with a 4-character
        # marker, which could never match.  startswith() compares exactly
        # len(neg_prefix) characters.
        # NOTE(review): offset 1 presumably skips the leading space left
        # by whitespace normalisation of the body -- confirm with the data.
        is_positive = not doc['body'].startswith(neg_prefix, 1)
        data.append((u'{title}\n\n{body}'.format(**doc), is_positive))
    if not data:
        return np.asarray([], dtype=int), np.asarray([], dtype=int)
    # [(1,2), (3,4), (5,6)] --> [(1,3,5),(2,4,6)]
    X_text, y = zip(*data)
    return X_text, np.asarray(y, dtype=int)
def iter_minibatches_tmp(doc_iter, minibatch_size):
    """Yield ``(X_text, y)`` minibatches until ``doc_iter`` runs dry.

    Each batch is produced by ``get_minibatch`` and holds at most
    ``minibatch_size`` documents.  Iteration stops at the first batch
    whose ``X_text`` is empty.
    """
    print(doc_iter)
    while True:
        batch = get_minibatch(doc_iter, minibatch_size)
        if not len(batch[0]):
            return
        yield batch
def progress(cls_name, stats, test_stats):
    """Build a one-line progress report and return it as a string.

    ``stats`` supplies 't0', 'n_train', 'n_train_pos' and 'accuracy';
    ``test_stats`` supplies 'n_test' and 'n_test_pos'.  Throughput is
    the number of training documents divided by the time elapsed since
    ``stats['t0']``.
    """
    elapsed = time.time() - stats['t0']
    pieces = [
        "%20s classifier : \t" % cls_name,
        "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats,
        "%(n_test)6d test docs (%(n_test_pos)6d positive) " % test_stats,
        "accuracy: %(accuracy).3f " % stats,
        "in %.2fs (%5d docs/s)" % (elapsed, stats['n_train'] / elapsed),
    ]
    return "".join(pieces)
def myMain():
    """Train a MultinomialNB incrementally on the announcement stream.

    Documents are pulled in minibatches, hashed into a sparse feature
    matrix and fed to ``partial_fit``.  A progress line is printed every
    third minibatch.  Finally the class predicted for the text "Swim" is
    printed.  No held-out test set is used, so the reported accuracy and
    test counts stay at zero.
    """
    hasher = HashingVectorizer(decode_error='ignore', n_features=2 ** 20,
                               non_negative=True)

    # TODO
    # 1. store the current partial-fitted data to restore later use
    # 2. everyday do the partial-fit for newly generated data
    #
    batch_size = 1000
    batches = iter_minibatches_tmp(streamPubAnounceDocs(), batch_size)

    labels = np.array([0, 1])

    name = 'NB Multinomial'
    stats = {'n_train': 0, 'n_train_pos': 0,
             'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),
             'runtime_history': [(0, 0)], 'total_fit_time': 0.0}
    cls_stats = {name: stats}

    # There is no test split: both counters remain zero.
    test_stats = {'n_test': 0, 'n_test_pos': 0}

    # fit
    total_vect_time = 0.0
    model = MultinomialNB(alpha=0.01)
    for batch_no, (texts, targets) in enumerate(batches):
        started = time.time()
        features = hasher.transform(texts)
        total_vect_time += time.time() - started

        # update estimator with examples in the current mini-batch
        started = time.time()
        model.partial_fit(features, targets, classes=labels)
        stats['total_fit_time'] += time.time() - started

        stats['n_train'] += features.shape[0]
        stats['n_train_pos'] += sum(targets)

        # Scoring is disabled (no X_test / y_test), so 'accuracy' keeps its
        # initial value and the measured prediction time is near zero.
        started = time.time()
        # stats['accuracy'] = model.score(X_test, y_test)
        stats['prediction_time'] = time.time() - started

        stats['accuracy_history'].append(
            (stats['accuracy'], stats['n_train']))
        stats['runtime_history'].append(
            (stats['accuracy'], total_vect_time + stats['total_fit_time']))

        # NOTE(review): indentation was lost in the source; the blank-line
        # print is assumed to belong to this `if` -- confirm.
        if batch_no % 3 == 0:
            print(progress(name, cls_stats[name], test_stats))
            print('\n')

    # ------------------------------------
    # Predict
    # ------------------------------------
    sample = "Swim"
    sample_features = hasher.transform((sample,))
    print("predict : %s "%(model.predict(sample_features),))
if __name__ == '__main__':
    # Script entry point: run the simplified single-classifier pipeline.
    myMain()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment