starsinmypockets/importer.py

## importer.py
""" Imports json in a standard format to Django objects """

import json, re, logging, sqlite3
from redisclg import RedisBayes
logging.basicConfig(filename='/root/py-proj/clg-env/clg_proj/logfile.log',level=logging.DEBUG)

class BaseImporter(object):
    # @@TODO add file_path param to init
    def __init__(self, src, list_id, db_src="clg_proj.sqlite3"):
        self.list_id = list_id
        self.rb = RedisBayes()
        try:
            self.json_string = open(src, 'r').read().strip()
            try:
                self.json_data = json.loads(self.json_string)
                self.categories = self.json_data.keys()
                self.validated_items = []
            except ValueError, e:
                print "Error importing data.", e
        except IOError:
            print 'cannot open'
        except ValueError,e:
            print 'Error: ' + e
        try:
            self.conn = sqlite3.connect(db_src)
            self.c = self.conn.cursor()
        except e:
            print 'Oops - database error: ' + e

    def import_category(self, category):
        for item_raw in self.json_data[category]:
            clean_item = self._get_clean_item(item_raw)
            pk = self._db_insert(clean_item)
            text = (clean_item['title_raw'] + ' ' + clean_item['description_raw'])
            self.rb.item_add_kws(pk, text)
        self.conn.commit()

    def import_all(self):
        for category in self.categories:
            self.import_category(category)

    def _get_clean_item(self, item_raw):
        pv = PriceValidator(item_raw)
        clean_item = {
            'title_raw' : item_raw['title'],
            'description_raw' : item_raw['description'],
            'price_raw' : item_raw['price'],
            'tokens' : self.rb.tokenizer(set((item_raw['title'] + item_raw['description']))),
            'price_cents' : pv.get_price_in_cents(),
            'unit_type' : pv.get_unit_type()['unit_type'],
            'unit_val' : pv.get_unit_type()['val'],
        }
        try:
            clean_item['unit_price'] = int(pv.get_price_in_cents())/int(pv.get_unit_type()['val'])
        except:
            clean_item['unit_price'] = None,
        return clean_item

    def _tidy(self, text):
        if not isinstance(text, basestring):
            text = str(text)
        if not isinstance(text, unicode):
            text = text.decode('utf8')
        text = text.lower()
        return re.sub(r'[\_.,<>:;~+|\[\]?`"!@#$%^&*()\s]', ' ', text, re.UNICODE)

    def _db_insert(self, clean_item):
        try:
            self.c.execute('INSERT INTO main_item ("title_raw", "description_raw", "price_raw", "price_in_cents", "unit_type", "unit_val", "product_list_id") VALUES (?, ?, ?, ?, ?, ?, ?)', (clean_item['title_raw'], clean_item['description_raw'], clean_item['price_raw'], clean_item['price_cents'], str(clean_item['unit_type']), clean_item['unit_val'], self.list_id))
        except:
            print 'Error inserting record: '
            for item in clean_item:
                print item+'                            =>      ', clean_item[item]
            print '\n'
        return self.c.lastrowid

class PriceValidator(object):
    """ Takes a price string and validates price, unit type, and unit value. """
    def __init__(self, item_raw):
        try:
            self.price_string = str(item_raw['price'])
            self.description = str(item_raw['description'])
        except ValueError, e:
            print 'Sorry, only string values accepted. ', e

    # @@TODO - Needs 'X FOR $Y' form
    def get_price_in_cents(self):
        """ Return an integer representing a price in cents """
        price_string = self.price_string
        price_rgx = re.compile(r"[0-9]+(\.[0-9]{2})")
        try:
            x = price_rgx.search(price_string).group()
            return int((float(x))*100)
        except:
            return None

    def get_unit_val(self):
        pass

    def get_unit_type(self):

        oz_range_rgx = re.compile(r"[0-9]+(\.[0-9]+)? to ([0-9]+(\.[0-9]+)?([ ]|[-])?(oz)?)")
        oz_flat_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]|[-]){1}((oz)|(fl)|(fl-oz)){1}")
        lb_flat_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]|[-]){1}((lb)){1}")
        oz_lb_range_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]|[-])?oz to ([0-9]+(\.[0-9]+)?([ ]|[-])?((lb)|(LB)))")
        ct_rgx = re.compile(r"[0-9]+([ ]|[-])(ct)")
        quart_rgx = re.compile(r"[0-9]+([ ]|[-])+((qt)|(Qt)|(quart)|(Quart))")

        try:
            if oz_range_rgx.search(self.description):
                oz_range = oz_range_rgx.search(self.description).group()
                m = re.search(r" to ", oz_range)
                return {
                    'unit_type' : 'oz_range',
                    'val' : None,
                    'oz_range_lo' : oz_range[:m.start()],
                    'oz_range_hi' : oz_range[m.end():].lower().replace('-oz', '')
                }
            # oz. flat rate
            elif oz_flat_rgx.search(self.description):
                return {
                    'unit_type' : 'oz',
                    'val' : int(re.search(r"[0-9]+",oz_flat_rgx.search(self.description).group()).group()),
            }
            # oz. -> lb. rate
            elif oz_lb_range_rgx.search(self.description):
                oz_lb_range = oz_lb_range_rgx.search(self.description).group()
                m = re.search(r" to ", oz_lb_range)
                return {
                    'unit_type' : 'oz_lb_range',
                    'val' : None,
                    'oz_lb_range_oz' : oz_lb_range[:m.start()],
                    'oz_lb_range_lb' : oz_lb_range[m.end():],
                }
            # flat lb. weight
            elif lb_flat_rgx.search(record['description']):
                return {
                    'unit_type' : 'lb',
                    'val' : int(re.search(r"[0-9]+", lb_flat_rgx.search(self.description).group()).group()),
                }
            # Number count
            elif ct_rgx.search(self.description):
                ct_numb = int(re.search(r"[0-9]+", ct_rgx.search(self.description).group()).group())
                return {
                    'unit_type' : 'count',
                    'val' : ct_numb,
                }
            # quart volume
            elif quart_rgx.search(record['description']):
                return {
                    'unit_type' : 'quart',
                    'val' : quart_rgx.search(self.description).group(),
                }
            # no valid unit information:
            else:
                empty_count += 1
                return {
                    'unit_type' : None,
                    'val' : None,
                }
        except:
            return {
                'unit_type' : None,
                'val' : None,
            }

## redis.py
# -*- coding: utf-8 -*-
#
# Thaks to Jart for substantially building the boilerplate for this implementation:
#
# redisbayes - Na•ve Bayesian Text Classifier on Redis
# Copyright (c) 2012 Justine Alexandra Roberts Tunney
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy,
# modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import re, math, logging

import logging
logging.basicConfig(filename='/root/py-proj/clg-env/clg_proj/logfile.log',level=logging.DEBUG)

__version__ = "0.1.3"

english_ignore = set("""
a able about above abroad according accordingly across actually adj after
afterwards again against ago ahead ain't all allow allows almost alone along
alongside already also although always am amid amidst among amongst an and
another any anybody anyhow anyone anything anyway anyways anywhere apart
appear appreciate appropriate are aren't around as a's aside ask asking
associated at available away awfully b back backward backwards be became
because become becomes becoming been before beforehand begin behind being
believe below beside besides best better between beyond both brief but by c
came can cannot cant can't caption cause causes certain certainly changes
clearly c'mon co co. com come comes concerning consequently consider
considering contain containing contains corresponding could couldn't course
c's currently d dare daren't definitely described despite did didn't different
directly do does doesn't doing done don't down downwards during e each edu eg
eight eighty either else elsewhere end ending enough entirely especially et
etc even ever evermore every everybody everyone everything everywhere ex
exactly example except f fairly far farther few fewer fifth first five
followed following follows for forever former formerly forth forward found
four from further furthermore g get gets getting given gives go goes going
gone got gotten greetings h had hadn't half happens hardly has hasn't have
haven't having he he'd he'll hello help hence her here hereafter hereby herein
here's hereupon hers herself he's hi him himself his hither hopefully how
howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc
inc. indeed indicate indicated indicates inner inside insofar instead into
inward is isn't it it'd it'll its it's itself i've j just k keep keeps kept
know known knows l last lately later latter latterly least less lest let let's
like liked likely likewise little look looking looks low lower ltd m made
mainly make makes many may maybe mayn't me mean meantime meanwhile merely
might mightn't mine minus miss more moreover most mostly mr mrs much must
mustn't my myself n name namely nd near nearly necessary need needn't needs
neither never neverf neverless nevertheless new next nine ninety no nobody non
none nonetheless noone no-one nor normally not nothing notwithstanding novel
now nowhere o obviously of off often oh ok okay old on once one ones one's
only onto opposite or other others otherwise ought oughtn't our ours ourselves
out outside over overall own p particular particularly past per perhaps placed
please plus possible presumably probably provided provides q que quite qv r
rather rd re really reasonably recent recently regarding regardless regards
relatively respectively right round s said same saw say saying says second
secondly see seeing seem seemed seeming seems seen self selves sensible sent
serious seriously seven several shall shan't she she'd she'll she's should
shouldn't since six so some somebody someday somehow someone something
sometime sometimes somewhat somewhere soon sorry specified specify specifying
still sub such sup sure t take taken taking tell tends th than thank thanks
thanx that that'll thats that's that've the their theirs them themselves then
thence there thereafter thereby there'd therefore therein there'll there're
theres there's thereupon there've these they they'd they'll they're they've
thing things think third thirty this thorough thoroughly those though three
through throughout thru thus till to together too took toward towards tried
tries truly try trying t's twice two u un under underneath undoing
unfortunately unless unlike unlikely until unto up upon upwards us use used
useful uses using usually v value various versus very via viz vs w want wants
was wasn't way we we'd welcome well we'll went were we're weren't we've what
whatever what'll what's what've when whence whenever where whereafter whereas
whereby wherein where's whereupon wherever whether which whichever while
whilst whither who who'd whoever whole who'll whom whomever who's whose why
will willing wish with within without wonder won't would wouldn't x y yes yet
you you'd you'll your you're yours yourself yourselves you've z zero
successful greatest began including being all for close but
""".split())


def tidy(text):
    logging.debug('>>>tidy>>>')
    logging.debug(text)
    logging.debug('<<<tidy<<<')
    if not isinstance(text, basestring):
        text = str(text)
    if not isinstance(text, unicode):
        text = text.decode('utf8')
    text = text.lower()
    return re.sub(r'[\_.,<>:;~+|\[\]?`"!@#$%^&*()\s]', ' ', text, re.UNICODE)

def english_tokenizer(text):
    words = tidy(text).split()
    return [w for w in words if len(w) > 2 and w not in english_ignore]

def occurances(words):
    counts = {}
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    return counts

class RedisBayes(object):
    def __init__(self, redis=None, prefix='clg:', correction=0.1,
                 tokenizer=None):
        self.redis = redis
        self.prefix = prefix
        self.correction = correction
        self.tokenizer = tokenizer or english_tokenizer
        if not self.redis:
            from redis import Redis
            self.redis = Redis()

    def flush_all(self):
        for cat in self.redis.smembers(self.prefix + 'categories'):
            self.redis.delete(self.prefix + cat)
        self.redis.delete(self.prefix + 'categories')

    # Given the title of a canonical item, flush all keywords that reference it
    def flush_item(self, item):
        pass

    # Given a category name, flush all associated items
    def flush_category(self, cat):
        pass

    def flush_item(self, item):
        pass

    def item_add_kws(self, pk, text):
        tokens = self.tokenizer(text)
        logging.debug('>>>tokens>>>')
        logging.debug(tokens)
        bigrams = self._get_bigrams(tokens)
        logging.debug('>>>bigrams>>>')
        logging.debug(bigrams)
        for token in tokens:
            logging.debug('token set name: ' + self.prefix+str(pk))
            logging.debug('token Insert result: ')
            logging.debug(self.redis.sadd(self.prefix + str(pk), token))
        for bigram in bigrams:
            logging.debug('bigram set name: ' + self.prefix+str(pk))
            logging.debug('bigram Insert result: ')
            logging.debug(self.redis.sadd(self.prefix + str(pk), bigram))
    def _get_bigrams(self,tokens):
        bigrams = []
        i=0
        while i < (len(tokens)-1):
            bigrams.append(tokens[i] + ' ' + tokens[i+1])
            i += 1
        return bigrams
    # Associate keywords with an item
    def associate_item(self, item, **kws):
        pass

    # Associate keywords with a category
    def associate_category(self, category, **kws):
        pass

    def score(self, text):
        occurs = occurances(self.tokenizer(text))
        scores = {}
        for category in self.redis.smembers(self.prefix + 'categories'):
            tally = self.tally(category)
            if tally == 0:
                continue
            scores[category] = 0.0
            for word, count in occurs.iteritems():
                score = self.redis.hget(self.prefix + category, word)
                assert not score or score > 0, "corrupt bayesian database"
                score = score or self.correction
                scores[category] += math.log(float(score) / tally)
        return scores

    def tally(self, category):
        tally = sum(int(x) for x in self.redis.hvals(self.prefix + category))
        assert tally >= 0, "corrupt bayesian database"
        return tally

if __name__ == '__main__':
    import doctest
    doctest.testmod()
	""" Imports json in a standard format to Django objects """

	import json, re, logging, sqlite3
	from redisclg import RedisBayes
	logging.basicConfig(filename='/root/py-proj/clg-env/clg_proj/logfile.log',level=logging.DEBUG)

	class BaseImporter(object):
	# @@TODO add file_path param to init
	def __init__(self, src, list_id, db_src="clg_proj.sqlite3"):
	self.list_id = list_id
	self.rb = RedisBayes()
	try:
	self.json_string = open(src, 'r').read().strip()
	try:
	self.json_data = json.loads(self.json_string)
	self.categories = self.json_data.keys()
	self.validated_items = []
	except ValueError, e:
	print "Error importing data.", e
	except IOError:
	print 'cannot open'
	except ValueError,e:
	print 'Error: ' + e
	try:
	self.conn = sqlite3.connect(db_src)
	self.c = self.conn.cursor()
	except e:
	print 'Oops - database error: ' + e

	def import_category(self, category):
	for item_raw in self.json_data[category]:
	clean_item = self._get_clean_item(item_raw)
	pk = self._db_insert(clean_item)
	text = (clean_item['title_raw'] + ' ' + clean_item['description_raw'])
	self.rb.item_add_kws(pk, text)
	self.conn.commit()

	def import_all(self):
	for category in self.categories:
	self.import_category(category)

	def _get_clean_item(self, item_raw):
	pv = PriceValidator(item_raw)
	clean_item = {
	'title_raw' : item_raw['title'],
	'description_raw' : item_raw['description'],
	'price_raw' : item_raw['price'],
	'tokens' : self.rb.tokenizer(set((item_raw['title'] + item_raw['description']))),
	'price_cents' : pv.get_price_in_cents(),
	'unit_type' : pv.get_unit_type()['unit_type'],
	'unit_val' : pv.get_unit_type()['val'],
	}
	try:
	clean_item['unit_price'] = int(pv.get_price_in_cents())/int(pv.get_unit_type()['val'])
	except:
	clean_item['unit_price'] = None,
	return clean_item

	def _tidy(self, text):
	if not isinstance(text, basestring):
	text = str(text)
	if not isinstance(text, unicode):
	text = text.decode('utf8')
	text = text.lower()
	return re.sub(r'[\_.,<>:;~+\|\[\]?`"!@#$%^&*()\s]', ' ', text, re.UNICODE)

	def _db_insert(self, clean_item):
	try:
	self.c.execute('INSERT INTO main_item ("title_raw", "description_raw", "price_raw", "price_in_cents", "unit_type", "unit_val", "product_list_id") VALUES (?, ?, ?, ?, ?, ?, ?)', (clean_item['title_raw'], clean_item['description_raw'], clean_item['price_raw'], clean_item['price_cents'], str(clean_item['unit_type']), clean_item['unit_val'], self.list_id))
	except:
	print 'Error inserting record: '
	for item in clean_item:
	print item+' => ', clean_item[item]
	print '\n'
	return self.c.lastrowid

	class PriceValidator(object):
	""" Takes a price string and validates price, unit type, and unit value. """
	def __init__(self, item_raw):
	try:
	self.price_string = str(item_raw['price'])
	self.description = str(item_raw['description'])
	except ValueError, e:
	print 'Sorry, only string values accepted. ', e

	# @@TODO - Needs 'X FOR $Y' form
	def get_price_in_cents(self):
	""" Return an integer representing a price in cents """
	price_string = self.price_string
	price_rgx = re.compile(r"[0-9]+(\.[0-9]{2})")
	try:
	x = price_rgx.search(price_string).group()
	return int((float(x))*100)
	except:
	return None

	def get_unit_val(self):
	pass

	def get_unit_type(self):

	oz_range_rgx = re.compile(r"[0-9]+(\.[0-9]+)? to ([0-9]+(\.[0-9]+)?([ ]\|[-])?(oz)?)")
	oz_flat_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]\|[-]){1}((oz)\|(fl)\|(fl-oz)){1}")
	lb_flat_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]\|[-]){1}((lb)){1}")
	oz_lb_range_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]\|[-])?oz to ([0-9]+(\.[0-9]+)?([ ]\|[-])?((lb)\|(LB)))")
	ct_rgx = re.compile(r"[0-9]+([ ]\|[-])(ct)")
	quart_rgx = re.compile(r"[0-9]+([ ]\|[-])+((qt)\|(Qt)\|(quart)\|(Quart))")

	try:
	if oz_range_rgx.search(self.description):
	oz_range = oz_range_rgx.search(self.description).group()
	m = re.search(r" to ", oz_range)
	return {
	'unit_type' : 'oz_range',
	'val' : None,
	'oz_range_lo' : oz_range[:m.start()],
	'oz_range_hi' : oz_range[m.end():].lower().replace('-oz', '')
	}
	# oz. flat rate
	elif oz_flat_rgx.search(self.description):
	return {
	'unit_type' : 'oz',
	'val' : int(re.search(r"[0-9]+",oz_flat_rgx.search(self.description).group()).group()),
	}
	# oz. -> lb. rate
	elif oz_lb_range_rgx.search(self.description):
	oz_lb_range = oz_lb_range_rgx.search(self.description).group()
	m = re.search(r" to ", oz_lb_range)
	return {
	'unit_type' : 'oz_lb_range',
	'val' : None,
	'oz_lb_range_oz' : oz_lb_range[:m.start()],
	'oz_lb_range_lb' : oz_lb_range[m.end():],
	}
	# flat lb. weight
	elif lb_flat_rgx.search(record['description']):
	return {
	'unit_type' : 'lb',
	'val' : int(re.search(r"[0-9]+", lb_flat_rgx.search(self.description).group()).group()),
	}
	# Number count
	elif ct_rgx.search(self.description):
	ct_numb = int(re.search(r"[0-9]+", ct_rgx.search(self.description).group()).group())
	return {
	'unit_type' : 'count',
	'val' : ct_numb,
	}
	# quart volume
	elif quart_rgx.search(record['description']):
	return {
	'unit_type' : 'quart',
	'val' : quart_rgx.search(self.description).group(),
	}
	# no valid unit information:
	else:
	empty_count += 1
	return {
	'unit_type' : None,
	'val' : None,
	}
	except:
	return {
	'unit_type' : None,
	'val' : None,
	}
	# -- coding: utf-8 --
	#
	# Thaks to Jart for substantially building the boilerplate for this implementation:
	#
	# redisbayes - Na•ve Bayesian Text Classifier on Redis
	# Copyright (c) 2012 Justine Alexandra Roberts Tunney
	#
	# Permission is hereby granted, free of charge, to any person
	# obtaining a copy of this software and associated documentation
	# files (the "Software"), to deal in the Software without
	# restriction, including without limitation the rights to use, copy,
	# modify, merge, publish, distribute, sublicense, and/or sell copies
	# of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be
	# included in all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.
	#
	import re, math, logging

	import logging
	logging.basicConfig(filename='/root/py-proj/clg-env/clg_proj/logfile.log',level=logging.DEBUG)

	__version__ = "0.1.3"

	english_ignore = set("""
	a able about above abroad according accordingly across actually adj after
	afterwards again against ago ahead ain't all allow allows almost alone along
	alongside already also although always am amid amidst among amongst an and
	another any anybody anyhow anyone anything anyway anyways anywhere apart
	appear appreciate appropriate are aren't around as a's aside ask asking
	associated at available away awfully b back backward backwards be became
	because become becomes becoming been before beforehand begin behind being
	believe below beside besides best better between beyond both brief but by c
	came can cannot cant can't caption cause causes certain certainly changes
	clearly c'mon co co. com come comes concerning consequently consider
	considering contain containing contains corresponding could couldn't course
	c's currently d dare daren't definitely described despite did didn't different
	directly do does doesn't doing done don't down downwards during e each edu eg
	eight eighty either else elsewhere end ending enough entirely especially et
	etc even ever evermore every everybody everyone everything everywhere ex
	exactly example except f fairly far farther few fewer fifth first five
	followed following follows for forever former formerly forth forward found
	four from further furthermore g get gets getting given gives go goes going
	gone got gotten greetings h had hadn't half happens hardly has hasn't have
	haven't having he he'd he'll hello help hence her here hereafter hereby herein
	here's hereupon hers herself he's hi him himself his hither hopefully how
	howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc
	inc. indeed indicate indicated indicates inner inside insofar instead into
	inward is isn't it it'd it'll its it's itself i've j just k keep keeps kept
	know known knows l last lately later latter latterly least less lest let let's
	like liked likely likewise little look looking looks low lower ltd m made
	mainly make makes many may maybe mayn't me mean meantime meanwhile merely
	might mightn't mine minus miss more moreover most mostly mr mrs much must
	mustn't my myself n name namely nd near nearly necessary need needn't needs
	neither never neverf neverless nevertheless new next nine ninety no nobody non
	none nonetheless noone no-one nor normally not nothing notwithstanding novel
	now nowhere o obviously of off often oh ok okay old on once one ones one's
	only onto opposite or other others otherwise ought oughtn't our ours ourselves
	out outside over overall own p particular particularly past per perhaps placed
	please plus possible presumably probably provided provides q que quite qv r
	rather rd re really reasonably recent recently regarding regardless regards
	relatively respectively right round s said same saw say saying says second
	secondly see seeing seem seemed seeming seems seen self selves sensible sent
	serious seriously seven several shall shan't she she'd she'll she's should
	shouldn't since six so some somebody someday somehow someone something
	sometime sometimes somewhat somewhere soon sorry specified specify specifying
	still sub such sup sure t take taken taking tell tends th than thank thanks
	thanx that that'll thats that's that've the their theirs them themselves then
	thence there thereafter thereby there'd therefore therein there'll there're
	theres there's thereupon there've these they they'd they'll they're they've
	thing things think third thirty this thorough thoroughly those though three
	through throughout thru thus till to together too took toward towards tried
	tries truly try trying t's twice two u un under underneath undoing
	unfortunately unless unlike unlikely until unto up upon upwards us use used
	useful uses using usually v value various versus very via viz vs w want wants
	was wasn't way we we'd welcome well we'll went were we're weren't we've what
	whatever what'll what's what've when whence whenever where whereafter whereas
	whereby wherein where's whereupon wherever whether which whichever while
	whilst whither who who'd whoever whole who'll whom whomever who's whose why
	will willing wish with within without wonder won't would wouldn't x y yes yet
	you you'd you'll your you're yours yourself yourselves you've z zero
	successful greatest began including being all for close but
	""".split())


	def tidy(text):
	logging.debug('>>>tidy>>>')
	logging.debug(text)
	logging.debug('<<<tidy<<<')
	if not isinstance(text, basestring):
	text = str(text)
	if not isinstance(text, unicode):
	text = text.decode('utf8')
	text = text.lower()
	return re.sub(r'[\_.,<>:;~+\|\[\]?`"!@#$%^&*()\s]', ' ', text, re.UNICODE)

	def english_tokenizer(text):
	words = tidy(text).split()
	return [w for w in words if len(w) > 2 and w not in english_ignore]

	def occurances(words):
	counts = {}
	for word in words:
	if word in counts:
	counts[word] += 1
	else:
	counts[word] = 1
	return counts

	class RedisBayes(object):
	def __init__(self, redis=None, prefix='clg:', correction=0.1,
	tokenizer=None):
	self.redis = redis
	self.prefix = prefix
	self.correction = correction
	self.tokenizer = tokenizer or english_tokenizer
	if not self.redis:
	from redis import Redis
	self.redis = Redis()

	def flush_all(self):
	for cat in self.redis.smembers(self.prefix + 'categories'):
	self.redis.delete(self.prefix + cat)
	self.redis.delete(self.prefix + 'categories')

	# Given the title of a canonical item, flush all keywords that reference it
	def flush_item(self, item):
	pass

	# Given a category name, flush all associated items
	def flush_category(self, cat):
	pass

	def flush_item(self, item):
	pass

	def item_add_kws(self, pk, text):
	tokens = self.tokenizer(text)
	logging.debug('>>>tokens>>>')
	logging.debug(tokens)
	bigrams = self._get_bigrams(tokens)
	logging.debug('>>>bigrams>>>')
	logging.debug(bigrams)
	for token in tokens:
	logging.debug('token set name: ' + self.prefix+str(pk))
	logging.debug('token Insert result: ')
	logging.debug(self.redis.sadd(self.prefix + str(pk), token))
	for bigram in bigrams:
	logging.debug('bigram set name: ' + self.prefix+str(pk))
	logging.debug('bigram Insert result: ')
	logging.debug(self.redis.sadd(self.prefix + str(pk), bigram))
	def _get_bigrams(self,tokens):
	bigrams = []
	i=0
	while i < (len(tokens)-1):
	bigrams.append(tokens[i] + ' ' + tokens[i+1])
	i += 1
	return bigrams
	# Associate keywords with an item
	def associate_item(self, item, **kws):
	pass

	# Associate keywords with a category
	def associate_category(self, category, **kws):
	pass

	def score(self, text):
	occurs = occurances(self.tokenizer(text))
	scores = {}
	for category in self.redis.smembers(self.prefix + 'categories'):
	tally = self.tally(category)
	if tally == 0:
	continue
	scores[category] = 0.0
	for word, count in occurs.iteritems():
	score = self.redis.hget(self.prefix + category, word)
	assert not score or score > 0, "corrupt bayesian database"
	score = score or self.correction
	scores[category] += math.log(float(score) / tally)
	return scores

	def tally(self, category):
	tally = sum(int(x) for x in self.redis.hvals(self.prefix + category))
	assert tally >= 0, "corrupt bayesian database"
	return tally

	if __name__ == '__main__':
	import doctest
	doctest.testmod()