Skip to content

Instantly share code, notes, and snippets.

@bbengfort
Last active August 29, 2015 13:56
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bbengfort/9327673 to your computer and use it in GitHub Desktop.
Save bbengfort/9327673 to your computer and use it in GitHub Desktop.
PSQL queries with psycopg2 to search for incorrect age-labeled data in the CPS. Compares the total matches for the pattern with the total number incorrectly marked as ADULT.
#!/usr/bin/env python
# baby_search
# Executes regular expression queries on CPS
#
# Author: Benjamin Bengfort <benjamin@bengfort.com>
# Created: Mon Mar 03 09:56:09 2014 -0500
# Requires: psycopg2
#
# ID: baby_search.py [] benjamin@bengfort.com $
"""
Executes regular expression queries on CPS
"""
##########################################################################
## Imports
##########################################################################
import time
import psycopg2
##########################################################################
## Queries
##########################################################################
patterns = (
'Onesie',
'Baby Boys',
'Little Boys',
'Little Boys & Big Boys',
'Boys',
'Big Boy',
'baby',
'Girl Juniors',
'Girls',
'Little Girls & Big Girls',
'Little Girls',
'Baby Girls',
'Girls',
'Baby Girls',
'Girl Juniors',
'Girls',
'Kids',
'Nail Polish',
'Notepad',
'Studs',
'Beanie',
'Figurine',
'toys',
'iphone',
)
COUNT = "SELECT count(id) FROM product WHERE name ilike %s"
FILTER = COUNT + " AND age='ADULT';"
COUNT += ";"
def connect(**kwargs):
defaults = {
'database': 'cps',
'user': '',
'password': '',
'host': 'localhost',
'port': 5432
}
defaults.update(kwargs)
conn = psycopg2.connect(**defaults)
return conn.cursor(), conn
def execute_query(pattern, cursor=None, conn=None,
cleanup=False, **kwargs):
start = time.time()
if cursor is None:
cursor, conn = connect(**kwargs)
cleanup = True
result = {"count": None, "filter": None}
query = "%%%s%%" % pattern
cursor.execute(COUNT, (query,))
result['count'] = cursor.fetchone()[0]
cursor.execute(FILTER, (query,))
result['filter'] = cursor.fetchone()[0]
if cleanup:
cursor.close()
conn.close()
finit = time.time()
result['time'] = "%0.3f" % (finit - start)
result['correct'] = "%0.2f%%" % ((1-(float(result['filter'])/result['count'])) * 100)
return result
def tableize(data, headers=(), pad=4):
"""
Prints out a table. Data should be a dict of dicts- the row labels
are the keys of the data dict, the column headers are the keys of the
values of data.
"""
if not headers:
headers = []
for d in data.values():
headers.extend(d.keys())
headers = set(headers)
# Build format by inspecting length of strings
rowfmt = ""
rowfmt += "{:>%i}" % (max(len(s) for s in data)+ pad) # Row labels
# Column labels
for key in headers:
rowfmt += "{:>%i}" % (max(len(str(s[key])) for s in data.values()) + pad)
output = []
# Deal with Header Row
hdrrow = ["",]
hdrrow.extend(list(headers))
output.append(rowfmt.format(*hdrrow))
# Deal with each row in data
for key, val in data.iteritems():
row = [key,]
row.extend([val[col] for col in headers])
output.append(rowfmt.format(*row))
# Return the data
return "\n".join(output)
##########################################################################
## Main method
##########################################################################
def main(user="cobrain", passw=""):
"""
Execute all queries and build a table.
"""
start = time.time()
cursor, conn = connect( database="cps", user=user, password=passw,
host="localhost", port=5432 )
results = {}
for pattern in patterns:
results[pattern] = execute_query(pattern, cursor)
print tableize(results)
cursor.close()
conn.close()
finit = time.time()
delta = finit - start
print
print "Query took %0.3f seconds" % delta
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment