Last active
August 29, 2015 13:56
-
-
Save bbengfort/9327673 to your computer and use it in GitHub Desktop.
PSQL queries with psycopg2 to search for incorrect age-labeled data in the CPS. Compares the total matches for the pattern with the total number incorrectly marked as ADULT.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# baby_search | |
# Executes regular expression queries on CPS | |
# | |
# Author: Benjamin Bengfort <benjamin@bengfort.com> | |
# Created: Mon Mar 03 09:56:09 2014 -0500 | |
# Requires: psycopg2 | |
# | |
# ID: baby_search.py [] benjamin@bengfort.com $ | |
""" | |
Executes regular expression queries on CPS | |
""" | |
########################################################################## | |
## Imports | |
########################################################################## | |
import time | |
import psycopg2 | |
########################################################################## | |
## Queries | |
########################################################################## | |
patterns = ( | |
'Onesie', | |
'Baby Boys', | |
'Little Boys', | |
'Little Boys & Big Boys', | |
'Boys', | |
'Big Boy', | |
'baby', | |
'Girl Juniors', | |
'Girls', | |
'Little Girls & Big Girls', | |
'Little Girls', | |
'Baby Girls', | |
'Girls', | |
'Baby Girls', | |
'Girl Juniors', | |
'Girls', | |
'Kids', | |
'Nail Polish', | |
'Notepad', | |
'Studs', | |
'Beanie', | |
'Figurine', | |
'toys', | |
'iphone', | |
) | |
COUNT = "SELECT count(id) FROM product WHERE name ilike %s" | |
FILTER = COUNT + " AND age='ADULT';" | |
COUNT += ";" | |
def connect(**kwargs): | |
defaults = { | |
'database': 'cps', | |
'user': '', | |
'password': '', | |
'host': 'localhost', | |
'port': 5432 | |
} | |
defaults.update(kwargs) | |
conn = psycopg2.connect(**defaults) | |
return conn.cursor(), conn | |
def execute_query(pattern, cursor=None, conn=None, | |
cleanup=False, **kwargs): | |
start = time.time() | |
if cursor is None: | |
cursor, conn = connect(**kwargs) | |
cleanup = True | |
result = {"count": None, "filter": None} | |
query = "%%%s%%" % pattern | |
cursor.execute(COUNT, (query,)) | |
result['count'] = cursor.fetchone()[0] | |
cursor.execute(FILTER, (query,)) | |
result['filter'] = cursor.fetchone()[0] | |
if cleanup: | |
cursor.close() | |
conn.close() | |
finit = time.time() | |
result['time'] = "%0.3f" % (finit - start) | |
result['correct'] = "%0.2f%%" % ((1-(float(result['filter'])/result['count'])) * 100) | |
return result | |
def tableize(data, headers=(), pad=4): | |
""" | |
Prints out a table. Data should be a dict of dicts- the row labels | |
are the keys of the data dict, the column headers are the keys of the | |
values of data. | |
""" | |
if not headers: | |
headers = [] | |
for d in data.values(): | |
headers.extend(d.keys()) | |
headers = set(headers) | |
# Build format by inspecting length of strings | |
rowfmt = "" | |
rowfmt += "{:>%i}" % (max(len(s) for s in data)+ pad) # Row labels | |
# Column labels | |
for key in headers: | |
rowfmt += "{:>%i}" % (max(len(str(s[key])) for s in data.values()) + pad) | |
output = [] | |
# Deal with Header Row | |
hdrrow = ["",] | |
hdrrow.extend(list(headers)) | |
output.append(rowfmt.format(*hdrrow)) | |
# Deal with each row in data | |
for key, val in data.iteritems(): | |
row = [key,] | |
row.extend([val[col] for col in headers]) | |
output.append(rowfmt.format(*row)) | |
# Return the data | |
return "\n".join(output) | |
########################################################################## | |
## Main method | |
########################################################################## | |
def main(user="cobrain", passw=""): | |
""" | |
Execute all queries and build a table. | |
""" | |
start = time.time() | |
cursor, conn = connect( database="cps", user=user, password=passw, | |
host="localhost", port=5432 ) | |
results = {} | |
for pattern in patterns: | |
results[pattern] = execute_query(pattern, cursor) | |
print tableize(results) | |
cursor.close() | |
conn.close() | |
finit = time.time() | |
delta = finit - start | |
print "Query took %0.3f seconds" % delta | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment