Skip to content

Instantly share code, notes, and snippets.

@squirrelo
Created October 12, 2015 06:41
Show Gist options
  • Save squirrelo/914178385508650af222 to your computer and use it in GitHub Desktop.
Save squirrelo/914178385508650af222 to your computer and use it in GitHub Desktop.
Scrubber for AG database pulldowns
import psycopg2
from passlib.hash import bcrypt
from random import choice
from sys import argv
def _new_kit_id():
seen = set()
alpha = "abcdefghijklmnopqrstuvwxyz"
alpha += alpha.upper()
while True:
suffix = ''.join([choice(alpha) for i in range(5)])
if suffix in seen:
continue
seen.add(suffix)
yield "tst_%s" % suffix
def ag_settings(cur):
cur.execute("UPDATE ag.settings SET test_environment = 'true';")
def ag_labadmin(cur):
"""Wipes out all current labadmin users and replaces with
username: test
password: password
"""
cur.execute("DELETE FROM ag.labadmin_users")
cur.execute("""INSERT INTO ag.labadmin_users (email, password)
VALUES ('test', '$2a$10$2.6Y9HmBqUFmSvKCjWmBte70WF.zd3h4VqbhLMQK1xP67Aj3rei86')""")
def survey_question_ids(cur):
"""Clean the data stored for survey answers"""
FREE_IDs = (98, 99, 101, 103, 104, 105, 106, 115, 116, 117, 118, 119, 120,
122, 124, 126, 127, 142, 143, 144)
# take special care for birth month: Set all to January so FK & pulldown doesnt break
cur.execute("""UPDATE ag.survey_answers
SET response = %s
WHERE survey_question_id = 111""", ["January"])
# Wipe out answers
cur.execute("""UPDATE ag.survey_answers_other
SET response = %s
WHERE survey_question_id IN %s""", ["REMOVED", FREE_IDs])
def ag_kit(cur):
"""Clean the data stored in ag_kit
The specific fields scrubbed are 'supplied_kit_id', and 'kit_password'.
All 'kit_password' fields will be set to a valid password hash where the
password is "test". All kit IDs will be reset to "tst_<something>"
where <something> is a random string of characters.
"""
pass_hash = bcrypt.encrypt('test')
kit_id_gen = _new_kit_id()
cur.execute("UPDATE ag.ag_kit SET kit_password = %s", [pass_hash])
cur.execute("SELECT supplied_kit_id FROM ag.ag_kit")
for kit_id in cur.fetchall():
new_kit_id = kit_id_gen.next()
cur.execute("""UPDATE ag.ag_kit
SET supplied_kit_id = %s
WHERE supplied_kit_id = %s""", [new_kit_id, kit_id])
def ag_login(cur):
"""Clean the data stored in ag_login
Basically remove all information from this table. Email, name, address,
city, state, zip, country are set to REMOVED while latitude, longitude,
and elevation are set to 0.0
"""
varchar_fields = ['email', 'name', 'address', 'city', 'state', 'zip',
'country']
float_fields = ['latitude', 'longitude', 'elevation']
for field in varchar_fields:
cur.execute("UPDATE ag.ag_login SET %s = 'REMOVED'" % field)
for field in float_fields:
cur.execute("UPDATE ag.ag_login SET %s = 0.0" % field)
def ag_handout_kits(cur):
"""Clean handout kits"""
pass_hash = bcrypt.encrypt('test')
kit_id_gen = _new_kit_id()
cur.execute("UPDATE ag.ag_handout_kits SET password = %s", [pass_hash])
cur.execute("SELECT kit_id FROM ag.ag_handout_kits")
for kit_id in cur.fetchall():
new_kit_id = kit_id_gen.next()
cur.execute("""UPDATE ag.ag_handout_kits
SET kit_id = %s
WHERE kit_id = %s""", [new_kit_id, kit_id])
def ag_kit_barcodes(cur):
"""Clean the data in ag_kit_barcodes"""
fields = ['participant_name', 'notes', 'other_text']
cur.execute("""UPDATE ag.ag_kit_barcodes
SET (participant_name,notes,other_text) =
(%s, %s, %s)""", ["REMOVED", "REMOVED", "REMOVED"])
def ag_login_surveys(cur):
"""Scrub the ag_login_surveys table"""
cur.execute("ALTER TABLE ag.ag_login_surveys DROP CONSTRAINT fk_ag_login_surveys0")
cur.execute("ALTER TABLE ag.ag_consent DROP CONSTRAINT pk_american_gut_consent")
id_name_map = {}
cur.execute("""SELECT ag_login_id, array_agg(participant_name)
FROM ag.ag_consent
GROUP BY ag_login_id""")
for login_id, pnames in cur.fetchall():
for idx, old_name in enumerate(pnames):
name = 'REMOVED-%d' % idx
id_name_map[(login_id, old_name)] = name
cur.execute("""UPDATE ag.ag_consent
SET participant_name = %s
WHERE ag_login_id = %s AND participant_name = %s""",
[name, login_id, old_name])
cur.execute("""SELECT ag_login_id, array_agg(survey_id) as survey_id,
array_agg(participant_name) as participant_name
FROM ag.ag_login_surveys
GROUP BY ag_login_id""")
for login_id, sids, pnames in cur.fetchall():
for idx, (survey, old_name) in enumerate(zip(sids, pnames)):
name = id_name_map[(login_id, old_name)]
cur.execute("""UPDATE ag.ag_login_surveys
SET participant_name = %s
WHERE survey_id = %s""", [name, survey])
cur.execute("""ALTER TABLE ag.ag_consent
ADD CONSTRAINT pk_american_gut_consent
PRIMARY KEY (ag_login_id, participant_name)""")
cur.execute("""ALTER TABLE ag.ag_login_surveys
ADD CONSTRAINT fk_ag_login_surveys0
FOREIGN KEY (ag_login_id, participant_name)
REFERENCES ag.ag_consent(ag_login_id, participant_name)""")
def ag_consent(cur):
"""Clean the ag_consent table"""
fields = ['participant_email', 'parent_1_name',
'parent_2_name', 'parent_1_code', 'parent_2_code']
for f in fields:
cur.execute("""UPDATE ag.ag_consent
SET (participant_email, parent_1_name,
parent_2_name, parent_1_code, parent_2_code) =
(%s, %s, %s, %s, %s)""", ["REMOVED"] * 5)
# (ag_login_id, participant_name) is not unique now due to "REMOVED"
def ag_zipcodes(cur):
cur.execute('DELETE FROM ag.zipcodes')
def ag_surveys(cur):
'''Wipe out answers but leave tables so promotion patches work'''
cur.execute('DELETE FROM ag.ag_human_survey;')
cur.execute('DELETE FROM ag.ag_animal_survey;')
def barcodes_schema(cur):
"""Rename the projects in the database and remove barcodes not in AG"""
# project_barcode MUST BE LAST
tables = ['plate_barcode', 'barcode_exceptions', 'project_barcode']
sql = '''DELETE FROM barcodes.{}
WHERE barcode NOT IN (
SELECT DISTINCT barcode FROM barcodes.project_barcode
JOIN barcodes.project USING (project_id)
WHERE project = 'American Gut Project')'''
for table in tables:
cur.execute(sql.format(table))
sql = """UPDATE barcodes.project
SET project = 'PROJECT' || project_id
WHERE project != 'American Gut Project'"""
cur.execute(sql)
def drop_tables(cur):
sql = '''DROP TABLE ag.ag_participant_exceptions;
DROP TABLE ag.ag_survey_multiples_backup;
DROP TABLE ag.ag_survey_answer;
DROP TABLE ag.ag_survey_multiples;
DROP TABLE ag.controlled_vocab_values;
DROP TABLE ag.controlled_vocabs;
DROP TABLE ag.ag_import_stats_tmp;
'''
cur.execute(sql)
def drop_procedures(cur):
sql = '''DROP FUNCTION ag.ag_add_animal_participant(uuid, text);
DROP FUNCTION ag.ag_add_participant(uuid, text);
DROP FUNCTION ag.ag_authenticate_user(text, text, refcursor);
DROP FUNCTION ag.ag_available_barcodes(uuid, refcursor);
DROP FUNCTION ag.ag_check_barcode_status(text, refcursor);
DROP FUNCTION ag.ag_delete_participant(uuid, text);
DROP FUNCTION ag.ag_delete_sample(text, uuid);
DROP FUNCTION ag.ag_delete_survey_answer(uuid, text);
DROP FUNCTION ag.ag_get_animal_participants(uuid, refcursor);
DROP FUNCTION ag.ag_get_barcode_details(text, refcursor);
DROP FUNCTION ag.ag_get_barcode_md_animal(text, refcursor);
DROP FUNCTION ag.ag_get_barcode_metadata(text, refcursor);
DROP FUNCTION ag.ag_get_barcodes(refcursor);
DROP FUNCTION ag.ag_get_barcodes_by_kit(text, refcursor);
DROP FUNCTION ag.ag_get_barcodes_by_login(uuid, refcursor);
DROP FUNCTION ag.ag_get_environmental_samples(uuid, refcursor);
DROP FUNCTION ag.ag_get_human_participants(uuid, refcursor);
DROP FUNCTION ag.ag_get_kit_details(text, refcursor);
DROP FUNCTION ag.ag_get_kit_id_by_email(text, refcursor);
DROP FUNCTION ag.ag_get_kits_by_login(refcursor);
DROP FUNCTION ag.ag_get_logins(refcursor);
DROP FUNCTION ag.ag_get_map_markers(refcursor);
DROP FUNCTION ag.ag_get_next_barcode(refcursor);
DROP FUNCTION ag.ag_get_participant_exceptions(uuid, refcursor);
DROP FUNCTION ag.ag_get_participant_samples(uuid, text, refcursor);
DROP FUNCTION ag.ag_get_print_results(text, refcursor);
DROP FUNCTION ag.ag_get_survey_details(uuid, text, refcursor);
DROP FUNCTION ag.ag_insert_barcode(uuid, text);
DROP FUNCTION ag.ag_insert_kit(uuid, text, text, bigint, text, text);
DROP FUNCTION ag.ag_insert_login(text, text, text, text, text, text, text);
DROP FUNCTION ag.ag_insert_participant_exception(uuid, text);
DROP FUNCTION ag.ag_insert_survey_answer(uuid, text, text, text);
DROP FUNCTION ag.ag_is_handout(text, text);
DROP FUNCTION ag.ag_log_participant_sample(text, text, text, text, text, text, text);
DROP FUNCTION ag.ag_reassign_barcode(uuid, text);
DROP FUNCTION ag.ag_set_pass_change_code(text, text, text);
DROP FUNCTION ag.ag_stats(refcursor);
DROP FUNCTION ag.ag_update_barcode(text, uuid, text, text, text, text, text, text, text, text);
DROP FUNCTION ag.ag_update_geo_info(uuid, double precision, double precision, double precision, character);
DROP FUNCTION ag.ag_update_kit(uuid, text, text, bigint, text);
DROP FUNCTION ag.ag_update_kit_password(text, text);
DROP FUNCTION ag.ag_update_login(uuid, text, text, text, text, text, text, text);
DROP FUNCTION ag.ag_verify_kit_status(text);
DROP FUNCTION ag.ag_verify_password_change_code(text, text, text);
DROP FUNCTION ag.american_gut_consent_submit(text, text, text, text, text, text, text, text);
DROP FUNCTION ag.get_barcode_proj_type(text, refcursor);
DROP FUNCTION ag.get_project_names(refcursor);
DROP FUNCTION ag.set_barcode_proj_type(text, text);
DROP FUNCTION ag.update_akb(text, character, character, character, text, text);
DROP FUNCTION ag.update_barcode_status(text, text, text, text);'''
cur.execute(sql)
if __name__ == '__main__':
conn = psycopg2.connect(host='localhost', dbname=argv[1])
cur = conn.cursor()
ag_settings(cur)
drop_tables(cur)
ag_surveys(cur)
drop_procedures(cur)
survey_question_ids(cur)
ag_zipcodes(cur)
ag_kit(cur)
ag_login(cur)
ag_handout_kits(cur)
ag_kit_barcodes(cur)
ag_login_surveys(cur)
ag_consent(cur)
ag_labadmin(cur)
barcodes_schema(cur)
conn.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment