""" Imports json in a standard format to Django objects """
import json, re, logging, sqlite3
from redisclg import RedisBayes
logging.basicConfig(filename='/root/py-proj/clg-env/clg_proj/logfile.log',level=logging.DEBUG)
class BaseImporter(object):
# @@TODO add file_path param to init
def __init__(self, src, list_id, db_src="clg_proj.sqlite3"):
self.list_id = list_id
self.rb = RedisBayes()
try:
self.json_string = open(src, 'r').read().strip()
try:
self.json_data = json.loads(self.json_string)
self.categories = self.json_data.keys()
self.validated_items = []
except ValueError, e:
print "Error importing data.", e
except IOError:
print 'cannot open'
except ValueError,e:
print 'Error: ' + e
try:
self.conn = sqlite3.connect(db_src)
self.c = self.conn.cursor()
except e:
print 'Oops - database error: ' + e
    def import_category(self, category):
        for item_raw in self.json_data[category]:
            clean_item = self._get_clean_item(item_raw)
            pk = self._db_insert(clean_item)
            text = (clean_item['title_raw'] + ' ' + clean_item['description_raw'])
            self.rb.item_add_kws(pk, text)
        self.conn.commit()

    def import_all(self):
        for category in self.categories:
            self.import_category(category)
    def _get_clean_item(self, item_raw):
        pv = PriceValidator(item_raw)
        unit = pv.get_unit_type()
        clean_item = {
            'title_raw': item_raw['title'],
            'description_raw': item_raw['description'],
            'price_raw': item_raw['price'],
            'tokens': self.rb.tokenizer(item_raw['title'] + ' ' + item_raw['description']),
            'price_cents': pv.get_price_in_cents(),
            'unit_type': unit['unit_type'],
            'unit_val': unit['val'],
        }
        try:
            clean_item['unit_price'] = int(pv.get_price_in_cents()) / int(unit['val'])
        except (TypeError, ValueError, ZeroDivisionError):
            clean_item['unit_price'] = None
        return clean_item
    def _tidy(self, text):
        if not isinstance(text, basestring):
            text = str(text)
        if not isinstance(text, unicode):
            text = text.decode('utf8')
        text = text.lower()
        return re.sub(r'[\_.,<>:;~+|\[\]?`"!@#$%^&*()\s]', ' ', text, flags=re.UNICODE)
    def _db_insert(self, clean_item):
        try:
            self.c.execute(
                'INSERT INTO main_item ("title_raw", "description_raw", "price_raw", "price_in_cents", "unit_type", "unit_val", "product_list_id") VALUES (?, ?, ?, ?, ?, ?, ?)',
                (clean_item['title_raw'], clean_item['description_raw'], clean_item['price_raw'],
                 clean_item['price_cents'], str(clean_item['unit_type']), clean_item['unit_val'], self.list_id))
        except sqlite3.Error, e:
            print 'Error inserting record:', e
            for key in clean_item:
                print key + ' =>', clean_item[key]
            print '\n'
        return self.c.lastrowid
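
# Example usage (a minimal sketch; 'products.json' and list_id=1 are hypothetical,
# and the JSON file is assumed to map category names to lists of items with
# 'title', 'description', and 'price' keys):
#
#     importer = BaseImporter('products.json', list_id=1)
#     importer.import_all()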
class PriceValidator(object):
    """ Takes a price string and validates price, unit type, and unit value. """
    def __init__(self, item_raw):
        try:
            self.price_string = str(item_raw['price'])
            self.description = str(item_raw['description'])
        except ValueError, e:
            print 'Sorry, only string values accepted.', e

    # @@TODO - Needs 'X FOR $Y' form
    def get_price_in_cents(self):
        """ Return an integer representing a price in cents """
        price_string = self.price_string
        price_rgx = re.compile(r"[0-9]+(\.[0-9]{2})")
        try:
            x = price_rgx.search(price_string).group()
            return int(round(float(x) * 100))
        except AttributeError:
            # no price found in the string
            return None

    def get_unit_val(self):
        pass
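
    # Example (hypothetical input, illustrating the conversion above):
    #     PriceValidator({'price': '$2.50', 'description': '12 oz bag'}).get_price_in_cents()  # -> 250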
    def get_unit_type(self):
        oz_range_rgx = re.compile(r"[0-9]+(\.[0-9]+)? to ([0-9]+(\.[0-9]+)?([ ]|[-])?(oz)?)")
        oz_flat_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]|[-]){1}((oz)|(fl)|(fl-oz)){1}")
        lb_flat_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]|[-]){1}((lb)){1}")
        oz_lb_range_rgx = re.compile(r"[0-9]+(\.[0-9]+)?([ ]|[-])?oz to ([0-9]+(\.[0-9]+)?([ ]|[-])?((lb)|(LB)))")
        ct_rgx = re.compile(r"[0-9]+([ ]|[-])(ct)")
        quart_rgx = re.compile(r"[0-9]+([ ]|[-])+((qt)|(Qt)|(quart)|(Quart))")
        try:
            # oz. range
            if oz_range_rgx.search(self.description):
                oz_range = oz_range_rgx.search(self.description).group()
                m = re.search(r" to ", oz_range)
                return {
                    'unit_type': 'oz_range',
                    'val': None,
                    'oz_range_lo': oz_range[:m.start()],
                    'oz_range_hi': oz_range[m.end():].lower().replace('-oz', ''),
                }
            # oz. flat rate
            elif oz_flat_rgx.search(self.description):
                return {
                    'unit_type': 'oz',
                    'val': int(re.search(r"[0-9]+", oz_flat_rgx.search(self.description).group()).group()),
                }
            # oz. -> lb. rate
            elif oz_lb_range_rgx.search(self.description):
                oz_lb_range = oz_lb_range_rgx.search(self.description).group()
                m = re.search(r" to ", oz_lb_range)
                return {
                    'unit_type': 'oz_lb_range',
                    'val': None,
                    'oz_lb_range_oz': oz_lb_range[:m.start()],
                    'oz_lb_range_lb': oz_lb_range[m.end():],
                }
            # flat lb. weight
            elif lb_flat_rgx.search(self.description):
                return {
                    'unit_type': 'lb',
                    'val': int(re.search(r"[0-9]+", lb_flat_rgx.search(self.description).group()).group()),
                }
            # Number count
            elif ct_rgx.search(self.description):
                ct_numb = int(re.search(r"[0-9]+", ct_rgx.search(self.description).group()).group())
                return {
                    'unit_type': 'count',
                    'val': ct_numb,
                }
            # quart volume
            elif quart_rgx.search(self.description):
                return {
                    'unit_type': 'quart',
                    'val': quart_rgx.search(self.description).group(),
                }
            # no valid unit information
            else:
                return {
                    'unit_type': None,
                    'val': None,
                }
        except AttributeError:
            return {
                'unit_type': None,
                'val': None,
            }
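
# Example (hypothetical description strings, matched by the regexes in
# get_unit_type above):
#
#     PriceValidator({'price': '$2.50', 'description': '12 oz bag'}).get_unit_type()
#     # -> {'unit_type': 'oz', 'val': 12}
#     PriceValidator({'price': '$4.99', 'description': '24-ct eggs'}).get_unit_type()
#     # -> {'unit_type': 'count', 'val': 24}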
# -*- coding: utf-8 -*-
#
# Thanks to Jart for substantially building the boilerplate for this implementation:
#
# redisbayes - Naïve Bayesian Text Classifier on Redis
# Copyright (c) 2012 Justine Alexandra Roberts Tunney
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy,
# modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import re, math, logging

logging.basicConfig(filename='/root/py-proj/clg-env/clg_proj/logfile.log', level=logging.DEBUG)
__version__ = "0.1.3"
english_ignore = set("""
a able about above abroad according accordingly across actually adj after
afterwards again against ago ahead ain't all allow allows almost alone along
alongside already also although always am amid amidst among amongst an and
another any anybody anyhow anyone anything anyway anyways anywhere apart
appear appreciate appropriate are aren't around as a's aside ask asking
associated at available away awfully b back backward backwards be became
because become becomes becoming been before beforehand begin behind being
believe below beside besides best better between beyond both brief but by c
came can cannot cant can't caption cause causes certain certainly changes
clearly c'mon co co. com come comes concerning consequently consider
considering contain containing contains corresponding could couldn't course
c's currently d dare daren't definitely described despite did didn't different
directly do does doesn't doing done don't down downwards during e each edu eg
eight eighty either else elsewhere end ending enough entirely especially et
etc even ever evermore every everybody everyone everything everywhere ex
exactly example except f fairly far farther few fewer fifth first five
followed following follows for forever former formerly forth forward found
four from further furthermore g get gets getting given gives go goes going
gone got gotten greetings h had hadn't half happens hardly has hasn't have
haven't having he he'd he'll hello help hence her here hereafter hereby herein
here's hereupon hers herself he's hi him himself his hither hopefully how
howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc
inc. indeed indicate indicated indicates inner inside insofar instead into
inward is isn't it it'd it'll its it's itself i've j just k keep keeps kept
know known knows l last lately later latter latterly least less lest let let's
like liked likely likewise little look looking looks low lower ltd m made
mainly make makes many may maybe mayn't me mean meantime meanwhile merely
might mightn't mine minus miss more moreover most mostly mr mrs much must
mustn't my myself n name namely nd near nearly necessary need needn't needs
neither never neverf neverless nevertheless new next nine ninety no nobody non
none nonetheless noone no-one nor normally not nothing notwithstanding novel
now nowhere o obviously of off often oh ok okay old on once one ones one's
only onto opposite or other others otherwise ought oughtn't our ours ourselves
out outside over overall own p particular particularly past per perhaps placed
please plus possible presumably probably provided provides q que quite qv r
rather rd re really reasonably recent recently regarding regardless regards
relatively respectively right round s said same saw say saying says second
secondly see seeing seem seemed seeming seems seen self selves sensible sent
serious seriously seven several shall shan't she she'd she'll she's should
shouldn't since six so some somebody someday somehow someone something
sometime sometimes somewhat somewhere soon sorry specified specify specifying
still sub such sup sure t take taken taking tell tends th than thank thanks
thanx that that'll thats that's that've the their theirs them themselves then
thence there thereafter thereby there'd therefore therein there'll there're
theres there's thereupon there've these they they'd they'll they're they've
thing things think third thirty this thorough thoroughly those though three
through throughout thru thus till to together too took toward towards tried
tries truly try trying t's twice two u un under underneath undoing
unfortunately unless unlike unlikely until unto up upon upwards us use used
useful uses using usually v value various versus very via viz vs w want wants
was wasn't way we we'd welcome well we'll went were we're weren't we've what
whatever what'll what's what've when whence whenever where whereafter whereas
whereby wherein where's whereupon wherever whether which whichever while
whilst whither who who'd whoever whole who'll whom whomever who's whose why
will willing wish with within without wonder won't would wouldn't x y yes yet
you you'd you'll your you're yours yourself yourselves you've z zero
successful greatest began including being all for close but
""".split())
def tidy(text):
    logging.debug('>>>tidy>>>')
    logging.debug(text)
    logging.debug('<<<tidy<<<')
    if not isinstance(text, basestring):
        text = str(text)
    if not isinstance(text, unicode):
        text = text.decode('utf8')
    text = text.lower()
    return re.sub(r'[\_.,<>:;~+|\[\]?`"!@#$%^&*()\s]', ' ', text, flags=re.UNICODE)

def english_tokenizer(text):
    words = tidy(text).split()
    return [w for w in words if len(w) > 2 and w not in english_ignore]
def occurances(words):
    counts = {}
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    return counts
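
# Example (follows directly from the counting loop above):
#     occurances(['oz', 'grape', 'oz'])  # -> {'oz': 2, 'grape': 1}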
class RedisBayes(object):
    def __init__(self, redis=None, prefix='clg:', correction=0.1,
                 tokenizer=None):
        self.redis = redis
        self.prefix = prefix
        self.correction = correction
        self.tokenizer = tokenizer or english_tokenizer
        if not self.redis:
            from redis import Redis
            self.redis = Redis()

    def flush_all(self):
        for cat in self.redis.smembers(self.prefix + 'categories'):
            self.redis.delete(self.prefix + cat)
        self.redis.delete(self.prefix + 'categories')

    # Given the title of a canonical item, flush all keywords that reference it
    def flush_item(self, item):
        pass

    # Given a category name, flush all associated items
    def flush_category(self, cat):
        pass
    def item_add_kws(self, pk, text):
        tokens = self.tokenizer(text)
        logging.debug('>>>tokens>>>')
        logging.debug(tokens)
        bigrams = self._get_bigrams(tokens)
        logging.debug('>>>bigrams>>>')
        logging.debug(bigrams)
        for token in tokens:
            logging.debug('token set name: ' + self.prefix + str(pk))
            logging.debug('token insert result: ')
            logging.debug(self.redis.sadd(self.prefix + str(pk), token))
        for bigram in bigrams:
            logging.debug('bigram set name: ' + self.prefix + str(pk))
            logging.debug('bigram insert result: ')
            logging.debug(self.redis.sadd(self.prefix + str(pk), bigram))

    def _get_bigrams(self, tokens):
        bigrams = []
        i = 0
        while i < (len(tokens) - 1):
            bigrams.append(tokens[i] + ' ' + tokens[i + 1])
            i += 1
        return bigrams
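
    # Example (adjacent-token pairs, as produced by the loop above):
    #     _get_bigrams(['red', 'seedless', 'grapes'])  # -> ['red seedless', 'seedless grapes']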
    # Associate keywords with an item
    def associate_item(self, item, **kws):
        pass

    # Associate keywords with a category
    def associate_category(self, category, **kws):
        pass

    def score(self, text):
        occurs = occurances(self.tokenizer(text))
        scores = {}
        for category in self.redis.smembers(self.prefix + 'categories'):
            tally = self.tally(category)
            if tally == 0:
                continue
            scores[category] = 0.0
            for word, count in occurs.iteritems():
                score = self.redis.hget(self.prefix + category, word)
                assert not score or score > 0, "corrupt bayesian database"
                score = score or self.correction
                scores[category] += math.log(float(score) / tally)
        return scores

    def tally(self, category):
        tally = sum(int(x) for x in self.redis.hvals(self.prefix + category))
        assert tally >= 0, "corrupt bayesian database"
        return tally
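
# Example usage (a minimal sketch; assumes a Redis server is reachable on
# localhost, and the primary key 42 and item text are hypothetical):
#
#     rb = RedisBayes()
#     rb.item_add_kws(42, 'Red seedless grapes 2-lb bag')
#     # tokens and bigrams are now members of the Redis set 'clg:42'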
if __name__ == '__main__':
    import doctest
    doctest.testmod()