# i5on9i/out_of_core_naive_bayes.py

## out_of_core_naive_bayes.py
"""
======================================================
Out-of-core classification of text documents
======================================================

Simplified version of the scikit-learn example at
http://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html
"""


from __future__ import print_function


from glob import glob
import itertools
import os.path
import re
import time

import numpy as np

from sklearn.externals.six.moves import html_parser
from sklearn.externals.six.moves import urllib
from sklearn.datasets import get_data_home
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB


class PubAnnounceParser(html_parser.HTMLParser):
    """Utility class to parse a SGML file and yield documents one at a time.

    Text found between tags is accumulated in ``self.body``.  Each closing
    ``</html>`` tag ends the current document: its text is whitespace-
    normalised and queued in ``self.docs`` as ``{'title': '', 'body': text}``.
    """

    def __init__(self, encoding='utf-8'):
        """Create a parser; *encoding* is used by parse() to decode chunks."""
        html_parser.HTMLParser.__init__(self)
        self.encoding = encoding
        # Created here as well as in parse() so that feeding the parser
        # directly cannot fail with AttributeError inside end_html().
        self.docs = []
        self._reset()

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to ``self.start_<tag>(attrs)`` if defined."""
        handler = getattr(self, 'start_' + tag, None)
        if handler is not None:
            handler(attrs)

    def handle_endtag(self, tag):
        """Dispatch a closing tag to ``self.end_<tag>()`` if defined."""
        handler = getattr(self, 'end_' + tag, None)
        if handler is not None:
            handler()

    def _reset(self):
        """Discard the text accumulated for the current document."""
        self.body = ''

    def parse(self, fd):
        """Decode and feed every chunk of *fd*, yielding finished documents.

        fd -- iterable of byte strings (e.g. a file opened in 'rb' mode).

        Only text terminated by a closing ``</html>`` tag becomes a
        document; trailing text without that tag is not yielded.
        """
        # One parser instance may be used for several streams: start from a
        # clean state so text left over from a previous stream that lacked
        # its closing </html> tag cannot leak into this stream's documents.
        self.reset()
        self._reset()
        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            finished, self.docs = self.docs, []
            for doc in finished:
                yield doc
        # close() forces processing of any buffered input, which may still
        # complete a document; hand those out too instead of dropping them.
        self.close()
        finished, self.docs = self.docs, []
        for doc in finished:
            yield doc

    def handle_data(self, data):
        """Append text found between tags to the current document body."""
        self.body += data

    # ----------------------------------------------------
    # handle_starttag and handle_endtag related methods
    # ----------------------------------------------------
    def start_html(self, attributes):
        """Opening ``<html>`` tag: nothing to do."""
        pass

    def end_html(self):
        """Closing ``</html>`` tag: queue the current document and start anew."""
        # Collapse every run of whitespace into a single space.
        text = re.sub(r'\s+', r' ', self.body)
        self.docs.append({'title': '',
                          'body': text})
        self._reset()


def streamPubAnounceDocs(data_path=None):
    """Iterate over documents of the Stock Public announcement dataset.

    Every ``*.htm`` file directly inside *data_path* is opened in binary
    mode and parsed with PubAnnounceParser; the resulting documents are
    yielded one at a time as dictionaries with 'title' (str) and
    'body' (str) keys.

    data_path -- directory to scan.  When None, the relative directory
                 "pubann" is used.
    """
    if data_path is None:
        data_path = "pubann"
    parser = PubAnnounceParser()
    # sorted() makes the document order independent of the file system.
    for filename in sorted(glob(os.path.join(data_path, "*.htm"))):
        # The context manager closes each file once it is exhausted (or
        # when the generator is closed early).
        with open(filename, 'rb') as fd:
            for doc in parser.parse(fd):
                yield doc

positive_class = 'any'


def get_minibatch(doc_iter, size, pos_class=positive_class):
    """Extract a minibatch of examples, return a tuple X_text, y.

    doc_iter  -- iterator of dicts providing 'title' and 'body' strings;
                 at most *size* items are consumed from it.
    size      -- maximum number of documents in the minibatch.
    pos_class -- kept for interface compatibility; not used.

    X_text is a tuple of u'{title}\\n\\n{body}' strings and y an integer
    numpy array: 0 for a document whose body has u'Swim' starting at
    index 1, 1 for every other document.  When *doc_iter* is exhausted two
    empty integer arrays are returned.
    """
    # NOTE(review): the match starts at index 1, presumably because the
    # whitespace-normalised bodies begin with a single space -- confirm.
    marker = u'Swim'
    data = []
    for doc in itertools.islice(doc_iter, size):
        # The original compared the 2-character slice body[1:3] with the
        # 4-character marker, which could never match.
        is_positive = not doc['body'].startswith(marker, 1)
        data.append((u'{title}\n\n{body}'.format(**doc), is_positive))

    if not data:
        return np.asarray([], dtype=int), np.asarray([], dtype=int)
    # [(1,2), (3,4), (5,6)]  --> [(1,3,5),(2,4,6)]
    X_text, y = zip(*data)
    return X_text, np.asarray(y, dtype=int)


def iter_minibatches_tmp(doc_iter, minibatch_size):
    """Yield (X_text, y) minibatches drawn from *doc_iter* until it is empty.

    Prints *doc_iter* once, when iteration starts.  Each minibatch is built
    by get_minibatch() from at most *minibatch_size* documents; iteration
    stops at the first empty minibatch.
    """
    print(doc_iter)
    while True:
        texts, labels = get_minibatch(doc_iter, minibatch_size)
        if not len(texts):
            return
        yield texts, labels

def progress(cls_name, stats, test_stats):
    """Build and return a one-line progress report.

    cls_name   -- classifier label, right-aligned in a 20 character field.
    stats      -- mapping with 't0' (start time as returned by time.time()),
                  'n_train', 'n_train_pos' and 'accuracy'.
    test_stats -- mapping with 'n_test' and 'n_test_pos'.
    """
    elapsed = time.time() - stats['t0']
    pieces = [
        "%20s classifier : \t" % cls_name,
        "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats,
        "%(n_test)6d test docs (%(n_test_pos)6d positive) " % test_stats,
        "accuracy: %(accuracy).3f " % stats,
        # Throughput: training documents per elapsed second.
        "in %.2fs (%5d docs/s)" % (elapsed, stats['n_train'] / elapsed),
    ]
    return "".join(pieces)

def myMain():
    """Train a MultinomialNB incrementally on the announcement stream.

    Documents from streamPubAnounceDocs() are consumed in minibatches of
    1000, hashed into a 2**20-dimensional feature space and passed to
    ``partial_fit``.  Count and timing statistics are collected along the
    way and a progress line is printed for every third minibatch.  Finally
    the label predicted for the text "Swim" is printed.

    If the stream yields no documents the classifier is never fitted, so
    the prediction step is skipped instead of failing.
    """
    # NOTE(review): `non_negative` only exists in older scikit-learn
    # releases (newer ones expose `alternate_sign` instead) -- confirm
    # against the installed version before upgrading.
    vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 20,
                                   non_negative=True)

    # TODO
    # 1. store the current partial-fitted data to restore later use
    # 2. everyday do the partial-fit for newly generated data
    #
    data_stream = streamPubAnounceDocs()
    minibatch_size = 1000
    minibatch_iterators = iter_minibatches_tmp(data_stream, minibatch_size)

    # partial_fit needs the full label set up front because a single
    # minibatch is not guaranteed to contain both classes.
    all_classes = np.array([0, 1])

    cls_name = 'NB Multinomial'
    stats = {'n_train': 0, 'n_train_pos': 0,
             'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),
             'runtime_history': [(0, 0)], 'total_fit_time': 0.0}

    # No held-out test set is wired in yet, so these counters stay at zero.
    test_stats = {'n_test': 0, 'n_test_pos': 0}

    # fit
    total_vect_time = 0.0
    cls = MultinomialNB(alpha=0.01)
    for i, (X_train_text, y_train) in enumerate(minibatch_iterators):

        tick = time.time()
        X_train = vectorizer.transform(X_train_text)
        total_vect_time += time.time() - tick

        # update estimator with examples in the current mini-batch
        tick = time.time()
        cls.partial_fit(X_train, y_train, classes=all_classes)
        stats['total_fit_time'] += time.time() - tick

        # keep this for debugging
        # accumulate test accuracy stats
        stats['n_train'] += X_train.shape[0]
        stats['n_train_pos'] += int(y_train.sum())
        tick = time.time()
        # Scoring is disabled until a test set exists; 'accuracy' keeps its
        # initial value and 'prediction_time' only times this no-op.
        #stats['accuracy'] = cls.score(X_test, y_test)
        stats['prediction_time'] = time.time() - tick
        stats['accuracy_history'].append(
            (stats['accuracy'], stats['n_train']))
        stats['runtime_history'].append(
            (stats['accuracy'], total_vect_time + stats['total_fit_time']))

        if i % 3 == 0:
            print(progress(cls_name, stats, test_stats))
            print('\n')

    # predict() on an estimator that never saw a minibatch would raise.
    if not stats['n_train']:
        print("no training documents found; skipping prediction")
        return

    #------------------------------------
    # Predict
    #------------------------------------
    tt = "Swim"
    X_pred = vectorizer.transform((tt,))
    print("predict : %s "%(cls.predict(X_pred),))


# Script entry point: run the demo only when executed directly.
if __name__ == "__main__":
    myMain()
	"""
	======================================================
	Out-of-core classification of text documents
	======================================================

	simplified source of
	http://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html
	"""


	from __future__ import print_function


	from glob import glob
	import itertools
	import os.path
	import re
	import time

	import numpy as np

	from sklearn.externals.six.moves import html_parser
	from sklearn.externals.six.moves import urllib
	from sklearn.datasets import get_data_home
	from sklearn.feature_extraction.text import HashingVectorizer
	from sklearn.naive_bayes import MultinomialNB




	class PubAnnounceParser(html_parser.HTMLParser):
		"""Utility class to parse a SGML file and yield documents one at a time.

		Text found between tags is accumulated in ``self.body``.  Each closing
		``</html>`` tag ends the current document: its text is whitespace-
		normalised and queued in ``self.docs`` as ``{'title': '', 'body': text}``.
		"""

		def __init__(self, encoding='utf-8'):
			"""Create a parser; *encoding* is used by parse() to decode chunks."""
			html_parser.HTMLParser.__init__(self)
			self.encoding = encoding
			# Created here as well as in parse() so that feeding the parser
			# directly cannot fail with AttributeError inside end_html().
			self.docs = []
			self._reset()

		def handle_starttag(self, tag, attrs):
			"""Dispatch an opening tag to ``self.start_<tag>(attrs)`` if defined."""
			handler = getattr(self, 'start_' + tag, None)
			if handler is not None:
				handler(attrs)

		def handle_endtag(self, tag):
			"""Dispatch a closing tag to ``self.end_<tag>()`` if defined."""
			handler = getattr(self, 'end_' + tag, None)
			if handler is not None:
				handler()

		def _reset(self):
			"""Discard the text accumulated for the current document."""
			self.body = ''

		def parse(self, fd):
			"""Decode and feed every chunk of *fd*, yielding finished documents.

			fd -- iterable of byte strings (e.g. a file opened in 'rb' mode).

			Only text terminated by a closing ``</html>`` tag becomes a
			document; trailing text without that tag is not yielded.
			"""
			# One parser instance may be used for several streams: start from
			# a clean state so text left over from a previous stream that
			# lacked its closing </html> tag cannot leak into this one.
			self.reset()
			self._reset()
			self.docs = []
			for chunk in fd:
				self.feed(chunk.decode(self.encoding))
				finished, self.docs = self.docs, []
				for doc in finished:
					yield doc
			# close() forces processing of any buffered input, which may
			# still complete a document; hand those out too.
			self.close()
			finished, self.docs = self.docs, []
			for doc in finished:
				yield doc

		def handle_data(self, data):
			"""Append text found between tags to the current document body."""
			self.body += data

		# ----------------------------------------------------
		# handle_starttag and handle_endtag related methods
		# ----------------------------------------------------
		def start_html(self, attributes):
			"""Opening ``<html>`` tag: nothing to do."""
			pass

		def end_html(self):
			"""Closing ``</html>`` tag: queue the current document and start anew."""
			# Collapse every run of whitespace into a single space.
			text = re.sub(r'\s+', r' ', self.body)
			self.docs.append({'title': '', 'body': text})
			self._reset()




	def streamPubAnounceDocs(data_path=None):
		"""Iterate over documents of the Stock Public announcement dataset.

		Every ``*.htm`` file directly inside *data_path* is opened in binary
		mode and parsed with PubAnnounceParser; the resulting documents are
		yielded one at a time as dictionaries with 'title' (str) and
		'body' (str) keys.

		data_path -- directory to scan.  When None, the relative directory
		             "pubann" is used.
		"""
		if data_path is None:
			data_path = "pubann"
		parser = PubAnnounceParser()
		# sorted() makes the document order independent of the file system.
		for filename in sorted(glob(os.path.join(data_path, "*.htm"))):
			# The context manager closes each file once it is exhausted (or
			# when the generator is closed early).
			with open(filename, 'rb') as fd:
				for doc in parser.parse(fd):
					yield doc

	positive_class = 'any'


	def get_minibatch(doc_iter, size, pos_class=positive_class):
		"""Extract a minibatch of examples, return a tuple X_text, y.

		doc_iter  -- iterator of dicts providing 'title' and 'body' strings;
		             at most *size* items are consumed from it.
		size      -- maximum number of documents in the minibatch.
		pos_class -- kept for interface compatibility; not used.

		X_text is a tuple of u'{title}\\n\\n{body}' strings and y an integer
		numpy array: 0 for a document whose body has u'Swim' starting at
		index 1, 1 for every other document.  When *doc_iter* is exhausted
		two empty integer arrays are returned.
		"""
		# NOTE(review): the match starts at index 1, presumably because the
		# whitespace-normalised bodies begin with a single space -- confirm.
		marker = u'Swim'
		data = []
		for doc in itertools.islice(doc_iter, size):
			# The original compared the 2-character slice body[1:3] with the
			# 4-character marker, which could never match.
			is_positive = not doc['body'].startswith(marker, 1)
			data.append((u'{title}\n\n{body}'.format(**doc), is_positive))

		if not data:
			return np.asarray([], dtype=int), np.asarray([], dtype=int)
		# [(1,2), (3,4), (5,6)] --> [(1,3,5),(2,4,6)]
		X_text, y = zip(*data)
		return X_text, np.asarray(y, dtype=int)


	def iter_minibatches_tmp(doc_iter, minibatch_size):
		"""Yield (X_text, y) minibatches drawn from *doc_iter* until it is empty.

		Prints *doc_iter* once, when iteration starts.  Each minibatch is
		built by get_minibatch() from at most *minibatch_size* documents;
		iteration stops at the first empty minibatch.
		"""
		print(doc_iter)
		X_text, y = get_minibatch(doc_iter, minibatch_size)
		while len(X_text):
			yield X_text, y
			X_text, y = get_minibatch(doc_iter, minibatch_size)

	def progress(cls_name, stats, test_stats):
		"""Report progress information, return a string.

		cls_name   -- classifier label, right-aligned in a 20 character field.
		stats      -- mapping with 't0' (start time as returned by
		              time.time()), 'n_train', 'n_train_pos' and 'accuracy'.
		test_stats -- mapping with 'n_test' and 'n_test_pos'.
		"""
		duration = time.time() - stats['t0']
		s = "%20s classifier : \t" % cls_name
		s += "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats
		s += "%(n_test)6d test docs (%(n_test_pos)6d positive) " % test_stats
		s += "accuracy: %(accuracy).3f " % stats
		# Throughput: training documents per elapsed second.
		s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
		return s

	def myMain():
		"""Train a MultinomialNB incrementally on the announcement stream.

		Documents from streamPubAnounceDocs() are consumed in minibatches of
		1000, hashed into a 2**20-dimensional feature space and passed to
		``partial_fit``.  Count and timing statistics are collected along the
		way and a progress line is printed for every third minibatch.
		Finally the label predicted for the text "Swim" is printed.

		If the stream yields no documents the classifier is never fitted, so
		the prediction step is skipped instead of failing.
		"""
		# NOTE(review): `non_negative` only exists in older scikit-learn
		# releases (newer ones expose `alternate_sign` instead) -- confirm
		# against the installed version before upgrading.
		vectorizer = HashingVectorizer(
			decode_error='ignore', n_features=2 ** 20, non_negative=True)

		# TODO
		# 1. store the current partial-fitted data to restore later use
		# 2. everyday do the partial-fit for newly generated data
		#
		data_stream = streamPubAnounceDocs()
		minibatch_size = 1000
		minibatch_iterators = iter_minibatches_tmp(data_stream, minibatch_size)

		# partial_fit needs the full label set up front because a single
		# minibatch is not guaranteed to contain both classes.
		all_classes = np.array([0, 1])

		cls_name = 'NB Multinomial'
		stats = {
			'n_train': 0, 'n_train_pos': 0,
			'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),
			'runtime_history': [(0, 0)], 'total_fit_time': 0.0,
		}

		# No held-out test set is wired in yet, so these counters stay at zero.
		test_stats = {'n_test': 0, 'n_test_pos': 0}

		# fit
		total_vect_time = 0.0
		cls = MultinomialNB(alpha=0.01)
		for i, (X_train_text, y_train) in enumerate(minibatch_iterators):

			tick = time.time()
			X_train = vectorizer.transform(X_train_text)
			total_vect_time += time.time() - tick

			# update estimator with examples in the current mini-batch
			tick = time.time()
			cls.partial_fit(X_train, y_train, classes=all_classes)
			stats['total_fit_time'] += time.time() - tick

			# keep this for debugging
			# accumulate test accuracy stats
			stats['n_train'] += X_train.shape[0]
			stats['n_train_pos'] += int(y_train.sum())
			tick = time.time()
			# Scoring is disabled until a test set exists; 'accuracy' keeps
			# its initial value and 'prediction_time' only times this no-op.
			#stats['accuracy'] = cls.score(X_test, y_test)
			stats['prediction_time'] = time.time() - tick
			stats['accuracy_history'].append(
				(stats['accuracy'], stats['n_train']))
			stats['runtime_history'].append(
				(stats['accuracy'], total_vect_time + stats['total_fit_time']))

			if i % 3 == 0:
				print(progress(cls_name, stats, test_stats))
				print('\n')

		# predict() on an estimator that never saw a minibatch would raise.
		if not stats['n_train']:
			print("no training documents found; skipping prediction")
			return

		#------------------------------------
		# Predict
		#------------------------------------
		tt = "Swim"
		X_pred = vectorizer.transform((tt,))
		print("predict : %s "%(cls.predict(X_pred),))


	# Script entry point: run the demo only when executed directly.
	if __name__ == '__main__':
		myMain()