@i5on9i · Last active June 27, 2016
How to use the out-of-core (partial_fit) version of the scikit-learn Naive Bayes classifier
"""
======================================================
Out-of-core classification of text documents
======================================================
simplified source of
http://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html
"""
from __future__ import print_function

from glob import glob
import itertools
import os.path
import re
import time

import numpy as np

from sklearn.externals.six.moves import html_parser
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
class PubAnnounceParser(html_parser.HTMLParser):
    """Utility class to parse an HTML file and yield documents one at a time."""

    def __init__(self, encoding='utf-8'):
        html_parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding

    def handle_starttag(self, tag, attrs):
        # dispatch to start_html, start_body, ... when such a handler exists
        method = 'start_' + tag
        getattr(self, method, lambda x: None)(attrs)

    def handle_endtag(self, tag):
        # dispatch to end_html, end_body, ... when such a handler exists
        method = 'end_' + tag
        getattr(self, method, lambda: None)()

    def _reset(self):
        self.body = ''

    def parse(self, fd):
        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            for doc in self.docs:
                yield doc
            self.docs = []
        self.close()

    def handle_data(self, data):
        self.body += data

    """
    ----------------------------------------------------
    handle_starttag and handle_endtag related methods
    ----------------------------------------------------
    """

    def start_html(self, attributes):
        pass

    def end_html(self):
        # collapse runs of whitespace and emit the accumulated document
        self.body = re.sub(r'\s+', r' ', self.body)
        self.docs.append({'title': '',
                          'body': self.body})
        self._reset()
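
# A minimal usage sketch for the parser (the file name is hypothetical, not
# part of the original gist):
#
#   parser = PubAnnounceParser()
#   with open('pubann/example.htm', 'rb') as fd:
#       for doc in parser.parse(fd):
#           print(doc['body'][:80])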
def streamPubAnounceDocs(data_path=None):
    """Iterate over documents of the stock public-announcement dataset.

    Documents are represented as dictionaries with 'title' (str) and
    'body' (str) keys.
    """
    parser = PubAnnounceParser()
    if data_path is None:
        data_path = "pubann"
    for filename in glob(os.path.join(data_path, "*.htm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc
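
# Sketch of what the stream yields (illustrative, assuming *.htm files exist
# under the default 'pubann' directory):
#
#   for doc in streamPubAnounceDocs():
#       # doc is e.g. {'title': '', 'body': u' Swim meet results ...'}
#       print(len(doc['body']))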
positive_class = 'any'


def get_minibatch(doc_iter, size, pos_class=positive_class):
    """Extract a minibatch of examples, return a tuple X_text, y."""
    data = []
    for doc in itertools.islice(doc_iter, size):
        # label: 0 (negative) when the body starts with u'Swim', else 1.
        # NOTE: the original comparison doc['body'][1:3] == u'Swim' could
        # never be true (a 2-character slice against a 4-character string);
        # lstrip() also guards against the leading space the parser may keep.
        cls = True
        if doc['body'].lstrip()[:4] == u'Swim':
            cls = False
        data.append((u'{title}\n\n{body}'.format(**doc), cls))
    if not len(data):
        return np.asarray([], dtype=int), np.asarray([], dtype=int)
    X_text, y = zip(*data)  # [(1, 2), (3, 4), (5, 6)] --> [(1, 3, 5), (2, 4, 6)]
    return X_text, np.asarray(y, dtype=int)
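
# Illustration of the minibatch contract (hypothetical documents): with two
# streamed docs whose bodies are u'Swim meet ...' and u'Earnings report ...',
# get_minibatch(doc_iter, 2) returns
#
#   X_text == (u'\n\nSwim meet ...', u'\n\nEarnings report ...')
#   y      == np.array([0, 1])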
def iter_minibatches_tmp(doc_iter, minibatch_size):
    """Generator of minibatches."""
    X_text, y = get_minibatch(doc_iter, minibatch_size)
    while len(X_text):
        yield X_text, y
        X_text, y = get_minibatch(doc_iter, minibatch_size)
def progress(cls_name, stats, test_stats):
    """Report progress information, return a string."""
    duration = time.time() - stats['t0']
    s = "%20s classifier : \t" % cls_name
    s += "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats
    s += "%(n_test)6d test docs (%(n_test_pos)6d positive) " % test_stats
    s += "accuracy: %(accuracy).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
    return s
def myMain():
    # MultinomialNB cannot handle negative feature values, so the hashing
    # vectorizer is told to produce non-negative features.
    vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 20,
                                   non_negative=True)

    # TODO
    # 1. store the currently partial-fitted model so it can be restored
    #    for later use
    # 2. every day, run partial_fit on the newly generated data
    #
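    # A sketch of TODO item 1 (an assumption, not part of the original
    # example): persist the partially fitted estimator with joblib so a
    # later run can reload it and keep calling partial_fit on new data.
    # The file name 'nb_multinomial.pkl' is hypothetical.
    #
    #   from sklearn.externals import joblib
    #   joblib.dump(cls, 'nb_multinomial.pkl')   # after the training loop
    #   cls = joblib.load('nb_multinomial.pkl')  # in the next run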
    data_stream = streamPubAnounceDocs()
    minibatch_size = 1000
    minibatch_iterators = iter_minibatches_tmp(data_stream, minibatch_size)
    all_classes = np.array([0, 1])

    cls_stats = {}
    cls_name = 'NB Multinomial'
    stats = {'n_train': 0, 'n_train_pos': 0,
             'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),
             'runtime_history': [(0, 0)], 'total_fit_time': 0.0}
    cls_stats[cls_name] = stats
    test_stats = {'n_test': 0, 'n_test_pos': 0}
    test_stats['n_test'] += 0      # would be len(y_test) with a test set
    test_stats['n_test_pos'] += 0  # would be sum(y_test) with a test set

    # fit
    total_vect_time = 0.0
    cls = MultinomialNB(alpha=0.01)
    for i, (X_train_text, y_train) in enumerate(minibatch_iterators):
        tick = time.time()
        X_train = vectorizer.transform(X_train_text)
        total_vect_time += time.time() - tick

        tick = time.time()
        # update estimator with examples in the current mini-batch
        cls.partial_fit(X_train, y_train, classes=all_classes)

        # keep this for debugging
        # accumulate test accuracy stats
        cls_stats[cls_name]['total_fit_time'] += time.time() - tick
        cls_stats[cls_name]['n_train'] += X_train.shape[0]
        cls_stats[cls_name]['n_train_pos'] += sum(y_train)
        tick = time.time()
        # cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)
        cls_stats[cls_name]['prediction_time'] = time.time() - tick
        acc_history = (cls_stats[cls_name]['accuracy'],
                       cls_stats[cls_name]['n_train'])
        cls_stats[cls_name]['accuracy_history'].append(acc_history)
        run_history = (cls_stats[cls_name]['accuracy'],
                       total_vect_time + cls_stats[cls_name]['total_fit_time'])
        cls_stats[cls_name]['runtime_history'].append(run_history)

        if i % 3 == 0:
            print(progress(cls_name, cls_stats[cls_name], test_stats))
            print('\n')
    # ------------------------------------
    # Predict
    # ------------------------------------
    tt = "Swim"
    X_pred = vectorizer.transform((tt,))
    print("predict : %s" % (cls.predict(X_pred),))
if __name__ == '__main__':
    # main()
    myMain()
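
A note on running the gist (the directory layout is an assumption based on the code above, not something the gist states): place the announcement pages as *.htm files in a pubann/ directory next to the script, then run it with a 2016-era scikit-learn (0.17/0.18; sklearn.externals.six and HashingVectorizer's non_negative flag were removed in later releases). The script trains incrementally on mini-batches of 1000 documents, printing progress every third batch, and finally prints the predicted class for the text "Swim".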