Created
January 30, 2013 02:26
-
-
Save starsinmypockets/4670060 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Imports json in a standard format to Django objects """ | |
import json, re, logging, sqlite3 | |
from redisclg import RedisBayes | |
logging.basicConfig(filename='/root/py-proj/clg-env/clg_proj/logfile.log',level=logging.DEBUG) | |
class BaseImporter(object):
    """Load a JSON file of categorised items, clean each item, insert it into
    the main_item sqlite table and register its keywords with RedisBayes.

    The JSON source is expected to be an object mapping category names to
    lists of item dicts with 'title', 'description' and 'price' keys.
    """

    # @@TODO add file_path param to init
    def __init__(self, src, list_id, db_src="clg_proj.sqlite3"):
        self.list_id = list_id
        self.rb = RedisBayes()
        # Safe defaults so import_all() degrades gracefully when loading fails
        # (the original left these attributes unset on error).
        self.json_data = {}
        self.categories = []
        self.validated_items = []
        try:
            # 'with' guarantees the handle is closed (original leaked it).
            with open(src, 'r') as fh:
                self.json_string = fh.read().strip()
            self.json_data = json.loads(self.json_string)
            self.categories = self.json_data.keys()
        except IOError:
            print('cannot open')
        except ValueError as e:
            # json.loads signals malformed input with ValueError.
            # BUG FIX: original printed 'Error: ' + e, a str+exception TypeError.
            print('Error importing data. %s' % e)
        try:
            self.conn = sqlite3.connect(db_src)
            self.c = self.conn.cursor()
        except sqlite3.Error as e:
            # BUG FIX: original wrote 'except e:', a NameError that could
            # never actually catch a database error.
            print('Oops - database error: %s' % e)

    def import_category(self, category):
        """Clean, store and index every raw item in *category*; commits once
        at the end so all inserts land in a single transaction."""
        for item_raw in self.json_data[category]:
            clean_item = self._get_clean_item(item_raw)
            pk = self._db_insert(clean_item)
            text = clean_item['title_raw'] + ' ' + clean_item['description_raw']
            self.rb.item_add_kws(pk, text)
        self.conn.commit()

    def import_all(self):
        """Import every category present in the source JSON."""
        for category in self.categories:
            self.import_category(category)

    def _get_clean_item(self, item_raw):
        """Build the normalised dict for one raw item: raw fields, price in
        cents, parsed unit info, keyword tokens, and derived unit price."""
        pv = PriceValidator(item_raw)
        unit = pv.get_unit_type()            # hoisted: was computed three times
        price_cents = pv.get_price_in_cents()
        clean_item = {
            'title_raw': item_raw['title'],
            'description_raw': item_raw['description'],
            'price_raw': item_raw['price'],
            # BUG FIX: original wrapped the text in set(), handing the
            # tokenizer a set of single characters instead of the text.
            'tokens': self.rb.tokenizer(item_raw['title'] + ' ' + item_raw['description']),
            'price_cents': price_cents,
            'unit_type': unit['unit_type'],
            'unit_val': unit['val'],
        }
        try:
            # '//' makes the Python 2 integer-division semantics explicit.
            clean_item['unit_price'] = int(price_cents) // int(unit['val'])
        except (TypeError, ValueError, ZeroDivisionError):
            # Missing/unparseable price or unit value.  BUG FIX: original
            # assigned 'None,' — a stray one-tuple — instead of None.
            clean_item['unit_price'] = None
        return clean_item

    def _tidy(self, text):
        """Coerce *text* to unicode text, lower-case it, and replace the
        punctuation/whitespace characters of interest with single spaces."""
        try:
            text_type = unicode   # Python 2
        except NameError:
            text_type = str       # Python 3
        if not isinstance(text, (bytes, text_type)):
            text = str(text)
        if isinstance(text, bytes):
            text = text.decode('utf8')
        text = text.lower()
        # BUG FIX: re.sub's 4th positional argument is *count*, not *flags*;
        # passing re.UNICODE (== 32) there capped replacement at 32 matches.
        return re.sub(r'[\_.,<>:;~+|\[\]?`"!@#$%^&*()\s]', ' ', text, flags=re.UNICODE)

    def _db_insert(self, clean_item):
        """Insert one cleaned item into main_item.

        Returns the new row id, or None when the insert failed (the original
        returned c.lastrowid even on failure — i.e. a *previous* row's id).
        """
        try:
            self.c.execute(
                'INSERT INTO main_item ("title_raw", "description_raw", "price_raw", "price_in_cents", "unit_type", "unit_val", "product_list_id") VALUES (?, ?, ?, ?, ?, ?, ?)',
                (clean_item['title_raw'], clean_item['description_raw'],
                 clean_item['price_raw'], clean_item['price_cents'],
                 str(clean_item['unit_type']), clean_item['unit_val'],
                 self.list_id))
        except sqlite3.Error as e:
            print('Error inserting record: %s' % e)
            for key in clean_item:
                print('%s => %s' % (key, clean_item[key]))
            return None
        return self.c.lastrowid
class PriceValidator(object):
    """ Takes a price string and validates price, unit type, and unit value. """

    def __init__(self, item_raw):
        try:
            self.price_string = str(item_raw['price'])
            self.description = str(item_raw['description'])
        except ValueError as e:
            print('Sorry, only string values accepted. %s' % e)

    # @@TODO - Needs 'X FOR $Y' form
    def get_price_in_cents(self):
        """ Return an integer price in cents, or None when the price string
        contains no dollars.cents figure. """
        price_rgx = re.compile(r"[0-9]+(\.[0-9]{2})")
        m = price_rgx.search(self.price_string)
        if m is None:
            return None
        # BUG FIX: int() truncates, so '0.29' became 28 cents because
        # float('0.29') * 100 == 28.999...; round before converting.
        return int(round(float(m.group()) * 100))

    def get_unit_val(self):
        pass

    def get_unit_type(self):
        """Parse self.description for unit information.

        Always returns a dict with at least 'unit_type' and 'val' keys (both
        None when nothing recognisable is found); range results carry the
        lo/hi pieces under extra keys.
        """
        oz_range_rgx = re.compile(r"[0-9]+(\.[0-9]+)? to ([0-9]+(\.[0-9]+)?([ ]|[-])?(oz)?)")
        oz_flat_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]|[-]){1}((oz)|(fl)|(fl-oz)){1}")
        lb_flat_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]|[-]){1}((lb)){1}")
        oz_lb_range_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]|[-])?oz to ([0-9]+(\.[0-9]+)?([ ]|[-])?((lb)|(LB)))")
        ct_rgx = re.compile(r"[0-9]+([ ]|[-])(ct)")
        quart_rgx = re.compile(r"[0-9]+([ ]|[-])+((qt)|(Qt)|(quart)|(Quart))")
        desc = self.description
        try:
            # oz range: "X to Y oz"
            m = oz_range_rgx.search(desc)
            if m:
                oz_range = m.group()
                sep = re.search(r" to ", oz_range)
                return {
                    'unit_type': 'oz_range',
                    'val': None,
                    'oz_range_lo': oz_range[:sep.start()],
                    'oz_range_hi': oz_range[sep.end():].lower().replace('-oz', ''),
                }
            # oz -> lb range.  BUG FIX: checked *before* the flat-oz pattern;
            # the original tested flat oz first, which always matched the
            # leading "N oz" and made this branch unreachable.
            m = oz_lb_range_rgx.search(desc)
            if m:
                oz_lb_range = m.group()
                sep = re.search(r" to ", oz_lb_range)
                return {
                    'unit_type': 'oz_lb_range',
                    'val': None,
                    'oz_lb_range_oz': oz_lb_range[:sep.start()],
                    'oz_lb_range_lb': oz_lb_range[sep.end():],
                }
            # oz. flat rate
            m = oz_flat_rgx.search(desc)
            if m:
                return {
                    'unit_type': 'oz',
                    'val': int(re.search(r"[0-9]+", m.group()).group()),
                }
            # flat lb. weight.  BUG FIX: original read record['description']
            # here — a NameError swallowed by the bare except.
            m = lb_flat_rgx.search(desc)
            if m:
                return {
                    'unit_type': 'lb',
                    'val': int(re.search(r"[0-9]+", m.group()).group()),
                }
            # Number count
            m = ct_rgx.search(desc)
            if m:
                return {
                    'unit_type': 'count',
                    'val': int(re.search(r"[0-9]+", m.group()).group()),
                }
            # quart volume.  BUG FIX: original read record['description'] here
            # too.  NOTE(review): 'val' stays the raw matched string (e.g.
            # '2 qt'), as in the original — confirm callers expect that.
            m = quart_rgx.search(desc)
            if m:
                return {
                    'unit_type': 'quart',
                    'val': m.group(),
                }
            # no valid unit information (original also incremented an
            # undefined empty_count here, another hidden NameError)
            return {
                'unit_type': None,
                'val': None,
            }
        except Exception:
            # Preserve the original best-effort contract: any parse failure
            # yields the "unknown" result instead of propagating.
            logging.exception('get_unit_type failed for %r', desc)
            return {
                'unit_type': None,
                'val': None,
            }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# | |
# Thanks to Jart for substantially building the boilerplate for this implementation:
# | |
# redisbayes - Naïve Bayesian Text Classifier on Redis
# Copyright (c) 2012 Justine Alexandra Roberts Tunney | |
# | |
# Permission is hereby granted, free of charge, to any person | |
# obtaining a copy of this software and associated documentation | |
# files (the "Software"), to deal in the Software without | |
# restriction, including without limitation the rights to use, copy, | |
# modify, merge, publish, distribute, sublicense, and/or sell copies | |
# of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be | |
# included in all copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
# | |
import re, math, logging
import logging  # NOTE(review): duplicate of the line above — harmless but redundant.
# NOTE(review): hard-coded absolute log path; same deployment-specific path as
# the importer module — consider making it configurable.
logging.basicConfig(filename='/root/py-proj/clg-env/clg_proj/logfile.log',level=logging.DEBUG)
__version__ = "0.1.3" | |
english_ignore = set(""" | |
a able about above abroad according accordingly across actually adj after | |
afterwards again against ago ahead ain't all allow allows almost alone along | |
alongside already also although always am amid amidst among amongst an and | |
another any anybody anyhow anyone anything anyway anyways anywhere apart | |
appear appreciate appropriate are aren't around as a's aside ask asking | |
associated at available away awfully b back backward backwards be became | |
because become becomes becoming been before beforehand begin behind being | |
believe below beside besides best better between beyond both brief but by c | |
came can cannot cant can't caption cause causes certain certainly changes | |
clearly c'mon co co. com come comes concerning consequently consider | |
considering contain containing contains corresponding could couldn't course | |
c's currently d dare daren't definitely described despite did didn't different | |
directly do does doesn't doing done don't down downwards during e each edu eg | |
eight eighty either else elsewhere end ending enough entirely especially et | |
etc even ever evermore every everybody everyone everything everywhere ex | |
exactly example except f fairly far farther few fewer fifth first five | |
followed following follows for forever former formerly forth forward found | |
four from further furthermore g get gets getting given gives go goes going | |
gone got gotten greetings h had hadn't half happens hardly has hasn't have | |
haven't having he he'd he'll hello help hence her here hereafter hereby herein | |
here's hereupon hers herself he's hi him himself his hither hopefully how | |
howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc | |
inc. indeed indicate indicated indicates inner inside insofar instead into | |
inward is isn't it it'd it'll its it's itself i've j just k keep keeps kept | |
know known knows l last lately later latter latterly least less lest let let's | |
like liked likely likewise little look looking looks low lower ltd m made | |
mainly make makes many may maybe mayn't me mean meantime meanwhile merely | |
might mightn't mine minus miss more moreover most mostly mr mrs much must | |
mustn't my myself n name namely nd near nearly necessary need needn't needs | |
neither never neverf neverless nevertheless new next nine ninety no nobody non | |
none nonetheless noone no-one nor normally not nothing notwithstanding novel | |
now nowhere o obviously of off often oh ok okay old on once one ones one's | |
only onto opposite or other others otherwise ought oughtn't our ours ourselves | |
out outside over overall own p particular particularly past per perhaps placed | |
please plus possible presumably probably provided provides q que quite qv r | |
rather rd re really reasonably recent recently regarding regardless regards | |
relatively respectively right round s said same saw say saying says second | |
secondly see seeing seem seemed seeming seems seen self selves sensible sent | |
serious seriously seven several shall shan't she she'd she'll she's should | |
shouldn't since six so some somebody someday somehow someone something | |
sometime sometimes somewhat somewhere soon sorry specified specify specifying | |
still sub such sup sure t take taken taking tell tends th than thank thanks | |
thanx that that'll thats that's that've the their theirs them themselves then | |
thence there thereafter thereby there'd therefore therein there'll there're | |
theres there's thereupon there've these they they'd they'll they're they've | |
thing things think third thirty this thorough thoroughly those though three | |
through throughout thru thus till to together too took toward towards tried | |
tries truly try trying t's twice two u un under underneath undoing | |
unfortunately unless unlike unlikely until unto up upon upwards us use used | |
useful uses using usually v value various versus very via viz vs w want wants | |
was wasn't way we we'd welcome well we'll went were we're weren't we've what | |
whatever what'll what's what've when whence whenever where whereafter whereas | |
whereby wherein where's whereupon wherever whether which whichever while | |
whilst whither who who'd whoever whole who'll whom whomever who's whose why | |
will willing wish with within without wonder won't would wouldn't x y yes yet | |
you you'd you'll your you're yours yourself yourselves you've z zero | |
successful greatest began including being all for close but | |
""".split()) | |
def tidy(text):
    """Normalise *text* for tokenizing: coerce to unicode text, lower-case it,
    and replace the punctuation/whitespace characters of interest with spaces."""
    logging.debug('>>>tidy>>>')
    logging.debug(text)
    logging.debug('<<<tidy<<<')
    try:
        text_type = unicode   # Python 2: leave unicode input untouched
    except NameError:
        text_type = str       # Python 3
    if not isinstance(text, (bytes, text_type)):
        text = str(text)
    if isinstance(text, bytes):
        text = text.decode('utf8')
    text = text.lower()
    # BUG FIX: re.sub's 4th positional argument is *count*, not *flags*;
    # passing re.UNICODE (== 32) there silently capped replacement at the
    # first 32 matches.
    return re.sub(r'[\_.,<>:;~+|\[\]?`"!@#$%^&*()\s]', ' ', text, flags=re.UNICODE)
def english_tokenizer(text):
    """Tokenize *text*: tidy it, split on whitespace, and keep only words
    longer than two characters that are not in the stop-word list."""
    return [token for token in tidy(text).split()
            if len(token) > 2 and token not in english_ignore]
def occurances(words):
    """Return a plain dict mapping each word in *words* to its occurrence count."""
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return counts
class RedisBayes(object):
    """Keyword indexer/scorer backed by Redis.

    Unigrams and bigrams for each item are held in the Redis set
    '<prefix><pk>'; per-category word counts live in the hash
    '<prefix><category>', with the category names indexed in the set
    '<prefix>categories'.
    """

    def __init__(self, redis=None, prefix='clg:', correction=0.1,
                 tokenizer=None):
        self.redis = redis
        self.prefix = prefix
        # Smoothing value used in place of a zero count when scoring.
        self.correction = correction
        self.tokenizer = tokenizer or english_tokenizer
        if not self.redis:
            from redis import Redis
            self.redis = Redis()

    def flush_all(self):
        """Delete every category hash plus the category index set."""
        for cat in self.redis.smembers(self.prefix + 'categories'):
            self.redis.delete(self.prefix + cat)
        self.redis.delete(self.prefix + 'categories')

    # Given the title of a canonical item, flush all keywords that reference
    # it.  BUG FIX: this stub was defined twice in the original; the second
    # definition silently overrode the first.  One definition kept.
    def flush_item(self, item):
        pass

    # Given a category name, flush all associated items
    def flush_category(self, cat):
        pass

    def item_add_kws(self, pk, text):
        """Tokenize *text* and add its unigrams and bigrams to the Redis set
        keyed by prefix + str(pk)."""
        key = self.prefix + str(pk)   # hoisted: was rebuilt on every iteration
        tokens = self.tokenizer(text)
        logging.debug('>>>tokens>>>')
        logging.debug(tokens)
        bigrams = self._get_bigrams(tokens)
        logging.debug('>>>bigrams>>>')
        logging.debug(bigrams)
        for token in tokens:
            logging.debug('token set name: ' + key)
            logging.debug('token Insert result: ')
            logging.debug(self.redis.sadd(key, token))
        for bigram in bigrams:
            logging.debug('bigram set name: ' + key)
            logging.debug('bigram Insert result: ')
            logging.debug(self.redis.sadd(key, bigram))

    def _get_bigrams(self, tokens):
        """Return space-joined adjacent token pairs; empty for < 2 tokens."""
        return [tokens[i] + ' ' + tokens[i + 1]
                for i in range(len(tokens) - 1)]

    # Associate keywords with an item
    def associate_item(self, item, **kws):
        pass

    # Associate keywords with a category
    def associate_category(self, category, **kws):
        pass

    def score(self, text):
        """Score *text* against every known category (higher is better).

        Categories whose tally is zero are skipped.  Each *distinct* word
        contributes once — the counts from occurances() are not used.
        """
        occurs = occurances(self.tokenizer(text))
        scores = {}
        for category in self.redis.smembers(self.prefix + 'categories'):
            tally = self.tally(category)
            if tally == 0:
                continue
            scores[category] = 0.0
            for word in occurs:
                score = self.redis.hget(self.prefix + category, word)
                # NOTE(review): hget returns a string, so 'score > 0' is a
                # Python 2 string/int comparison that never fails there —
                # confirm intent before porting to Python 3.
                assert not score or score > 0, "corrupt bayesian database"
                score = score or self.correction
                scores[category] += math.log(float(score) / tally)
        return scores

    def tally(self, category):
        """Return the total of all word counts stored for *category*."""
        tally = sum(int(x) for x in self.redis.hvals(self.prefix + category))
        assert tally >= 0, "corrupt bayesian database"
        return tally
# Run any embedded doctests when executed as a script.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment