Last active
August 29, 2015 14:07
-
-
Save arq5x/5fb70f7e942bbc7126c1 to your computer and use it in GitHub Desktop.
gemini database design experiment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# update things | |
sudo apt-get update | |
# get postgres up and running | |
sudo apt-get install postgresql-9.3 | |
sudo apt-get install postgresql-server-dev-9.3 | |
sudo apt-get install postgresql-client | |
sudo apt-get install postgresql postgresql-contrib | |
sudo -u postgres psql postgres | |
\password postgres | |
(######) | |
# grab pip and python-dev (req'd for psycopg2) | |
sudo apt-get install python-pip | |
sudo apt-get install libpq-dev python-dev | |
# need psycopg2 to connect to PGSQL and numpy to simulate genotypes | |
sudo pip install psycopg2 | |
sudo pip install numpy |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 | |
import zlib | |
import cPickle | |
import numpy | |
import sys | |
import os | |
import argparse | |
def unpack_genotype_blob(blob): | |
return numpy.array(cPickle.loads(zlib.decompress(blob))) | |
def connect_to_dbs(args): | |
db_norm = args.name_stub + ".norm.v" + str(args.num_variants) + ".i" + str(args.num_indivs) +".db" | |
db_comp = args.name_stub + ".comp.v" + str(args.num_variants) + ".i" + str(args.num_indivs) +".db" | |
norm_conn = sqlite3.connect(db_norm) | |
comp_conn = sqlite3.connect(db_comp) | |
norm_conn.row_factory = sqlite3.Row | |
comp_conn.row_factory = sqlite3.Row | |
norm_cursor = norm_conn.cursor() | |
comp_cursor = comp_conn.cursor() | |
return norm_cursor, comp_cursor | |
def main(): | |
parser = argparse.ArgumentParser(prog='query', formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
parser.add_argument('-n', | |
dest='num_variants', | |
type=int, | |
metavar='INTEGER', | |
help='The number of variants') | |
parser.add_argument('-i', | |
dest='num_indivs', | |
type=int, | |
metavar='INTEGER', | |
help='The number of individuals') | |
parser.add_argument('-s', | |
dest='name_stub', | |
type=str, | |
metavar='STRING', | |
help='Stub for the file names') | |
args = parser.parse_args() | |
norm_cursor, comp_cursor = connect_to_dbs(args) | |
# header | |
print '\t'.join(["strategy", "exp", "matches", "time"]) | |
# time a query to thhe normalized database | |
import timeit | |
start_time = timeit.default_timer() | |
num_matches = 0 | |
for row in norm_cursor.execute("""SELECT distinct v.v_id FROM variants v, genotypes g | |
WHERE v.v_id = g.v_id | |
AND g.gt = 1 | |
AND g.s_id = 7"""): | |
num_matches +=1 | |
elapsed = timeit.default_timer() - start_time | |
print '\t'.join(["norm", "one_ind", str(num_matches), str(elapsed)]) | |
# time the identical query to thhe compressed database | |
start_time = timeit.default_timer() | |
num_matches = 0 | |
for row in comp_cursor.execute("""SELECT * FROM variants"""): | |
gts = unpack_genotype_blob(row['gts']) | |
if gts[6] == 1: | |
num_matches +=1 | |
elapsed = timeit.default_timer() - start_time | |
print '\t'.join(["comp", "one_ind", str(num_matches), str(elapsed)]) | |
if __name__ == "__main__": | |
main() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create normalized and compressed (existing strategy) databases simulating 100000 variants and 100 samples | |
python simulation.py -n 100000 -i 100 -s test | |
# ls -1 | |
test.norm.v100000.i100.db | |
test.comp.v100000.i100.db | |
# test the run times for identifying rows where a single individual has a specific genotype. | |
python querying.py -n 100000 -i 100 -s test | |
strategy exp matches time | |
norm one_ind 24982 0.16601395607 | |
comp one_ind 24982 2.40940999985 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import psycopg2 | |
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT | |
import numpy | |
import sys | |
import os | |
import argparse | |
import timeit | |
# python simulation-postgres.py -n 1000000 -i 100 -s test | |
""" | |
Note: you must have a PostreSQL server running. For OSX (other POSIX?) | |
start: | |
pg_ctl -D /usr/local/var/postgres -l /usr/local/var/postgres/server.log start | |
stop: | |
pg_ctl -D /usr/local/var/postgres stop -s -m fast | |
""" | |
def create_tables(cursor): | |
cursor.execute('''drop table if exists variants''') | |
cursor.execute('''create table if not exists variants ( \ | |
v_id integer, \ | |
chrom text, \ | |
start integer, \ | |
gts integer[], \ | |
PRIMARY KEY(v_id))''') | |
def create_indexes(cursor): | |
cursor.execute('''create index vid_idx on variants using GIN (gts)''') | |
def insert_variation(cursor, buffer): | |
cursor.execute("BEGIN TRANSACTION") | |
cursor.executemany('insert into variants values (%s,%s,%s,%s)', buffer) | |
cursor.execute("END TRANSACTION") | |
def connect_to_dbs(args): | |
conn = psycopg2.connect(dbname='postgres', host='localhost') | |
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) | |
curr = conn.cursor() | |
curr.execute('DROP DATABASE IF EXISTS ' + args.name_stub) | |
curr.execute('CREATE DATABASE ' + args.name_stub) | |
create_tables(curr) | |
return conn, curr | |
def main(): | |
parser = argparse.ArgumentParser(prog='gemini_sim', formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
parser.add_argument('-n', | |
dest='num_variants', | |
type=int, | |
metavar='INTEGER', | |
help='The number of variants') | |
parser.add_argument('-i', | |
dest='num_indivs', | |
type=int, | |
metavar='INTEGER', | |
help='The number of individuals') | |
parser.add_argument('-s', | |
dest='name_stub', | |
type=str, | |
metavar='STRING', | |
help='Stub for the file names') | |
args = parser.parse_args() | |
conn, cur = connect_to_dbs(args) | |
var_arry_buffer = [] | |
num_variants = args.num_variants | |
num_samples = args.num_indivs | |
buffer_size = 0 | |
# Load the database with simulated variants and genotypes. | |
start_time = timeit.default_timer() | |
for v_id in range(num_variants): | |
# simulate num_samples genotypes for this variant | |
genotypes = list(numpy.random.randint(4, size=num_samples)) | |
# add the variant and genotypes to the buffers. | |
var_arry_buffer.append((v_id+1, 'chr1', v_id+1, genotypes)) | |
buffer_size += 1 | |
if buffer_size % 100000 == 0: | |
print "inserting 100000 variants" | |
insert_variation(cur, var_arry_buffer) | |
# reset | |
var_arry_buffer = [] | |
insert_variation(cur, var_arry_buffer) | |
create_indexes(cur) | |
conn.commit() | |
elapsed = timeit.default_timer() - start_time | |
print "Loading and indexing:" + str(elapsed) | |
# Test the query time. | |
start_time = timeit.default_timer() | |
hits = 0 | |
cur.execute("""SELECT * FROM variants | |
WHERE gts[7] =1 | |
AND gts[9] =0 | |
AND gts[17] =2""") | |
for row in cur: | |
hits += 1 | |
#print row[0], row[3][6], row[3][8], row[3][16] | |
elapsed = timeit.default_timer() - start_time | |
print "Querying on 3 specific genotypes:" + str(elapsed) | |
if __name__ == "__main__": | |
main() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 | |
import zlib | |
import cPickle | |
import numpy | |
import sys | |
import os | |
import argparse | |
def create_tables(cursor, db_type): | |
if db_type == "norm": | |
cursor.execute('''create table if not exists variants ( \ | |
v_id integer, \ | |
chrom text, \ | |
start integer, \ | |
PRIMARY KEY(v_id ASC))''') | |
cursor.execute('''create table if not exists genotypes ( \ | |
v_id integer, \ | |
s_id integer, \ | |
gt integer, \ | |
PRIMARY KEY(v_id, s_id ASC))''') | |
elif db_type == "comp": | |
cursor.execute('''create table if not exists variants ( \ | |
v_id integer, \ | |
chrom text, \ | |
start integer, \ | |
gts BLOB, \ | |
PRIMARY KEY(v_id ASC))''') | |
def create_indexes(cursor, db_type): | |
cursor.execute('''create index vid_idx on variants(v_id)''') | |
if db_type == "norm": | |
cursor.execute('''create index vid_idx2 on genotypes(v_id)''') | |
cursor.execute('''create index sid_idx on genotypes(s_id, gt)''') | |
def insert_variation(cursor, buffer, db_type): | |
if db_type == "norm": | |
cursor.execute("BEGIN TRANSACTION") | |
cursor.executemany('insert into variants values (?,?,?)', buffer) | |
cursor.execute("END TRANSACTION") | |
elif db_type == "comp": | |
cursor.execute("BEGIN TRANSACTION") | |
cursor.executemany('insert into variants values (?,?,?,?)', buffer) | |
cursor.execute("END TRANSACTION") | |
def insert_genotypes(cursor, buffer, db_type): | |
if db_type == "norm": | |
cursor.execute("BEGIN TRANSACTION") | |
cursor.executemany('insert into genotypes values (?,?,?)', buffer) | |
cursor.execute("END TRANSACTION") | |
def zdumps(obj): | |
return zlib.compress(cPickle.dumps(obj, cPickle.HIGHEST_PROTOCOL), 9) | |
def pack_blob(obj): | |
return sqlite3.Binary(zdumps(obj)) | |
def connect_to_dbs(args): | |
# open up a new database | |
db_norm = args.name_stub + ".norm.v" + str(args.num_variants) + ".i" + str(args.num_indivs) +".db" | |
db_comp = args.name_stub + ".comp.v" + str(args.num_variants) + ".i" + str(args.num_indivs) +".db" | |
if os.path.exists(db_norm): | |
os.remove(db_norm) | |
if os.path.exists(db_comp): | |
os.remove(db_comp) | |
norm_conn = sqlite3.connect(db_norm) | |
comp_conn = sqlite3.connect(db_comp) | |
norm_conn.isolation_level = None | |
comp_conn.isolation_level = None | |
norm_cursor = norm_conn.cursor() | |
comp_cursor = comp_conn.cursor() | |
norm_cursor.execute('PRAGMA synchronous = OFF') | |
norm_cursor.execute('PRAGMA journal_mode=MEMORY') | |
comp_cursor.execute('PRAGMA synchronous = OFF') | |
comp_cursor.execute('PRAGMA journal_mode=MEMORY') | |
# create the gemini database tables for the new DBs | |
create_tables(norm_cursor, "norm") | |
create_tables(comp_cursor, "comp") | |
return norm_cursor, comp_cursor | |
def main(): | |
parser = argparse.ArgumentParser(prog='gemini_sim', formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
parser.add_argument('-n', | |
dest='num_variants', | |
type=int, | |
metavar='INTEGER', | |
help='The number of variants') | |
parser.add_argument('-i', | |
dest='num_indivs', | |
type=int, | |
metavar='INTEGER', | |
help='The number of individuals') | |
parser.add_argument('-s', | |
dest='name_stub', | |
type=str, | |
metavar='STRING', | |
help='Stub for the file names') | |
args = parser.parse_args() | |
norm_cursor, comp_cursor = connect_to_dbs(args) | |
var_norm_buffer = [] | |
var_comp_buffer = [] | |
gts_buffer = [] | |
num_variants = args.num_variants | |
num_samples = args.num_indivs | |
buffer_size = 0 | |
for v_id in range(num_variants): | |
# simulate num_samples genotypes for this variant | |
genotypes = numpy.random.randint(4, size=num_samples) | |
# add the variant and genotypes to the buffers. | |
var_norm_buffer.append([v_id+1, 'chr1', v_id+1]) | |
var_comp_buffer.append([v_id+1, 'chr1', v_id+1, pack_blob(genotypes)]) | |
for idx, gt in enumerate(genotypes): | |
gts_buffer.append([v_id+1, idx+1, int(gt)]) | |
buffer_size += 1 | |
if buffer_size % 10000 == 0: | |
print "inserting 10000 variants" | |
# add the variants to the "normalized database" | |
insert_variation(norm_cursor, var_norm_buffer, "norm") | |
insert_genotypes(norm_cursor, gts_buffer, "norm") | |
# add the variants to the "compressed database" | |
insert_variation(comp_cursor, var_comp_buffer, "comp") | |
# reset | |
var_norm_buffer = [] | |
var_comp_buffer = [] | |
gts_buffer = [] | |
create_indexes(norm_cursor, "norm") | |
create_indexes(comp_cursor, "curr") | |
if __name__ == "__main__": | |
main() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment