@arq5x
Last active August 29, 2015 14:07
gemini database design experiment
# update package lists
sudo apt-get update
# get postgres up and running
sudo apt-get install postgresql-9.3
sudo apt-get install postgresql-server-dev-9.3
sudo apt-get install postgresql-client
sudo apt-get install postgresql postgresql-contrib
# set a password for the postgres superuser (\password is run inside psql; quit with \q)
sudo -u postgres psql postgres
\password postgres
(######)
# grab pip and python-dev (req'd for psycopg2)
sudo apt-get install python-pip
sudo apt-get install libpq-dev python-dev
# need psycopg2 to connect to PGSQL and numpy to simulate genotypes
sudo pip install psycopg2
sudo pip install numpy
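Before running the scripts below, it may be worth confirming that psycopg2 can actually reach the server. A minimal sketch, assuming the server is on localhost and the setup above succeeded (the file name is hypothetical):

# check_setup.py: smoke-test the psycopg2 -> PostgreSQL connection
import psycopg2

conn = psycopg2.connect(dbname='postgres', host='localhost')
cur = conn.cursor()
cur.execute('SELECT version()')
print cur.fetchone()[0]
conn.close()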
# querying.py: compare query times for the normalized vs. compressed SQLite databases
import sqlite3
import zlib
import cPickle
import numpy
import sys
import os
import argparse
import timeit
def unpack_genotype_blob(blob):
    return numpy.array(cPickle.loads(zlib.decompress(blob)))

def connect_to_dbs(args):
    db_norm = args.name_stub + ".norm.v" + str(args.num_variants) + ".i" + str(args.num_indivs) + ".db"
    db_comp = args.name_stub + ".comp.v" + str(args.num_variants) + ".i" + str(args.num_indivs) + ".db"
    norm_conn = sqlite3.connect(db_norm)
    comp_conn = sqlite3.connect(db_comp)
    norm_conn.row_factory = sqlite3.Row
    comp_conn.row_factory = sqlite3.Row
    norm_cursor = norm_conn.cursor()
    comp_cursor = comp_conn.cursor()
    return norm_cursor, comp_cursor
def main():
    parser = argparse.ArgumentParser(prog='query',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', dest='num_variants', type=int, metavar='INTEGER',
                        help='The number of variants')
    parser.add_argument('-i', dest='num_indivs', type=int, metavar='INTEGER',
                        help='The number of individuals')
    parser.add_argument('-s', dest='name_stub', type=str, metavar='STRING',
                        help='Stub for the file names')
    args = parser.parse_args()

    norm_cursor, comp_cursor = connect_to_dbs(args)

    # header
    print '\t'.join(["strategy", "exp", "matches", "time"])

    # time a query to the normalized database
    start_time = timeit.default_timer()
    num_matches = 0
    for row in norm_cursor.execute("""SELECT distinct v.v_id FROM variants v, genotypes g
                                      WHERE v.v_id = g.v_id
                                      AND g.gt = 1
                                      AND g.s_id = 7"""):
        num_matches += 1
    elapsed = timeit.default_timer() - start_time
    print '\t'.join(["norm", "one_ind", str(num_matches), str(elapsed)])

    # time the identical query to the compressed database
    start_time = timeit.default_timer()
    num_matches = 0
    for row in comp_cursor.execute("""SELECT * FROM variants"""):
        # must decode every row's genotype blob just to test one sample
        gts = unpack_genotype_blob(row['gts'])
        if gts[6] == 1:
            num_matches += 1
    elapsed = timeit.default_timer() - start_time
    print '\t'.join(["comp", "one_ind", str(num_matches), str(elapsed)])

if __name__ == "__main__":
    main()
# create normalized and compressed (existing strategy) databases simulating 100000 variants and 100 samples
python simulation.py -n 100000 -i 100 -s test
# ls -1
test.norm.v100000.i100.db
test.comp.v100000.i100.db
# test the run times for identifying rows where a single individual has a specific genotype.
python querying.py -n 100000 -i 100 -s test
strategy exp matches time
norm one_ind 24982 0.16601395607
comp one_ind 24982 2.40940999985
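The compressed strategy loses here because every row's genotype blob must be decompressed and unpickled just to test a single sample, while the normalized database can answer from its (s_id, gt) index. A rough sketch of the decode overhead alone, using the same zlib/cPickle scheme as the scripts (numbers are illustrative, not from the run above):

import timeit
import zlib
import cPickle
import numpy

gts = numpy.random.randint(4, size=100)
blob = zlib.compress(cPickle.dumps(gts, cPickle.HIGHEST_PROTOCOL), 9)

# average cost of one decode, scaled to the 100000 variants queried above
per_decode = timeit.timeit(lambda: cPickle.loads(zlib.decompress(blob)), number=1000) / 1000
print "estimated decode cost for 100000 rows: %.3fs" % (per_decode * 100000)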
# simulation-postgres.py: load simulated variants into PostgreSQL with genotypes
# stored as an integer[] column, then time a query on three specific genotypes
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import numpy
import sys
import os
import argparse
import timeit

# usage: python simulation-postgres.py -n 1000000 -i 100 -s test
"""
Note: you must have a PostreSQL server running. For OSX (other POSIX?)
start:
pg_ctl -D /usr/local/var/postgres -l /usr/local/var/postgres/server.log start
stop:
pg_ctl -D /usr/local/var/postgres stop -s -m fast
"""
def create_tables(cursor):
    cursor.execute('''drop table if exists variants''')
    cursor.execute('''create table if not exists variants (
                          v_id integer,
                          chrom text,
                          start integer,
                          gts integer[],
                          PRIMARY KEY(v_id))''')

def create_indexes(cursor):
    # GIN indexes on integer[] serve the array containment/overlap operators
    # (@>, <@, &&), not positional lookups like gts[7]
    cursor.execute('''create index vid_idx on variants using GIN (gts)''')

def insert_variation(cursor, buffer):
    cursor.execute("BEGIN TRANSACTION")
    cursor.executemany('insert into variants values (%s,%s,%s,%s)', buffer)
    cursor.execute("END TRANSACTION")

def connect_to_dbs(args):
    # connect to the default database so the experiment database can be (re)created
    conn = psycopg2.connect(dbname='postgres', host='localhost')
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    curr = conn.cursor()
    curr.execute('DROP DATABASE IF EXISTS ' + args.name_stub)
    curr.execute('CREATE DATABASE ' + args.name_stub)
    curr.close()
    conn.close()
    # reconnect to the newly created database before creating tables
    conn = psycopg2.connect(dbname=args.name_stub, host='localhost')
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    curr = conn.cursor()
    create_tables(curr)
    return conn, curr
def main():
    parser = argparse.ArgumentParser(prog='gemini_sim',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', dest='num_variants', type=int, metavar='INTEGER',
                        help='The number of variants')
    parser.add_argument('-i', dest='num_indivs', type=int, metavar='INTEGER',
                        help='The number of individuals')
    parser.add_argument('-s', dest='name_stub', type=str, metavar='STRING',
                        help='Stub for the file names')
    args = parser.parse_args()

    conn, cur = connect_to_dbs(args)

    var_arry_buffer = []
    num_variants = args.num_variants
    num_samples = args.num_indivs
    buffer_size = 0

    # Load the database with simulated variants and genotypes.
    start_time = timeit.default_timer()
    for v_id in range(num_variants):
        # simulate num_samples genotypes for this variant
        genotypes = list(numpy.random.randint(4, size=num_samples))
        # add the variant and genotypes to the buffer.
        var_arry_buffer.append((v_id + 1, 'chr1', v_id + 1, genotypes))
        buffer_size += 1
        if buffer_size % 100000 == 0:
            print "inserting 100000 variants"
            insert_variation(cur, var_arry_buffer)
            # reset
            var_arry_buffer = []
    # flush any remaining buffered variants
    insert_variation(cur, var_arry_buffer)
    create_indexes(cur)
    conn.commit()
    elapsed = timeit.default_timer() - start_time
    print "Loading and indexing: " + str(elapsed)

    # Test the query time.
    start_time = timeit.default_timer()
    hits = 0
    cur.execute("""SELECT * FROM variants
                   WHERE gts[7] = 1
                   AND gts[9] = 0
                   AND gts[17] = 2""")
    for row in cur:
        hits += 1
        # print row[0], row[3][6], row[3][8], row[3][16]
    elapsed = timeit.default_timer() - start_time
    print "Querying on 3 specific genotypes: " + str(elapsed)

if __name__ == "__main__":
    main()
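One caveat on the index above (a note about PostgreSQL generally, not something this gist measures): a GIN index on an integer[] column can serve the array containment and overlap operators, not positional lookups like gts[7] = 1, so the timed query still scans the table. A containment query that could use the index would look like this sketch, where cur is the cursor from main():

# sketch: a query the GIN array index can serve
cur.execute("SELECT count(*) FROM variants WHERE gts @> ARRAY[3]")
print cur.fetchone()[0]  # number of variants where some sample has genotype 3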
# simulation.py: build a "normalized" SQLite database (one genotype row per
# variant/sample pair) and a "compressed" one (a zlib/cPickle blob of all
# genotypes per variant -- gemini's existing strategy)
import sqlite3
import zlib
import cPickle
import numpy
import sys
import os
import argparse
def create_tables(cursor, db_type):
    if db_type == "norm":
        cursor.execute('''create table if not exists variants (
                              v_id integer,
                              chrom text,
                              start integer,
                              PRIMARY KEY(v_id ASC))''')
        cursor.execute('''create table if not exists genotypes (
                              v_id integer,
                              s_id integer,
                              gt integer,
                              PRIMARY KEY(v_id, s_id ASC))''')
    elif db_type == "comp":
        cursor.execute('''create table if not exists variants (
                              v_id integer,
                              chrom text,
                              start integer,
                              gts BLOB,
                              PRIMARY KEY(v_id ASC))''')

def create_indexes(cursor, db_type):
    cursor.execute('''create index vid_idx on variants(v_id)''')
    if db_type == "norm":
        cursor.execute('''create index vid_idx2 on genotypes(v_id)''')
        cursor.execute('''create index sid_idx on genotypes(s_id, gt)''')

def insert_variation(cursor, buffer, db_type):
    if db_type == "norm":
        cursor.execute("BEGIN TRANSACTION")
        cursor.executemany('insert into variants values (?,?,?)', buffer)
        cursor.execute("END TRANSACTION")
    elif db_type == "comp":
        cursor.execute("BEGIN TRANSACTION")
        cursor.executemany('insert into variants values (?,?,?,?)', buffer)
        cursor.execute("END TRANSACTION")

def insert_genotypes(cursor, buffer, db_type):
    if db_type == "norm":
        cursor.execute("BEGIN TRANSACTION")
        cursor.executemany('insert into genotypes values (?,?,?)', buffer)
        cursor.execute("END TRANSACTION")

def zdumps(obj):
    return zlib.compress(cPickle.dumps(obj, cPickle.HIGHEST_PROTOCOL), 9)

def pack_blob(obj):
    return sqlite3.Binary(zdumps(obj))
def connect_to_dbs(args):
    # open up new databases, removing files left over from previous runs
    db_norm = args.name_stub + ".norm.v" + str(args.num_variants) + ".i" + str(args.num_indivs) + ".db"
    db_comp = args.name_stub + ".comp.v" + str(args.num_variants) + ".i" + str(args.num_indivs) + ".db"
    if os.path.exists(db_norm):
        os.remove(db_norm)
    if os.path.exists(db_comp):
        os.remove(db_comp)
    norm_conn = sqlite3.connect(db_norm)
    comp_conn = sqlite3.connect(db_comp)
    # autocommit mode; transactions are managed explicitly during bulk inserts
    norm_conn.isolation_level = None
    comp_conn.isolation_level = None
    norm_cursor = norm_conn.cursor()
    comp_cursor = comp_conn.cursor()
    # trade durability for load speed
    norm_cursor.execute('PRAGMA synchronous = OFF')
    norm_cursor.execute('PRAGMA journal_mode=MEMORY')
    comp_cursor.execute('PRAGMA synchronous = OFF')
    comp_cursor.execute('PRAGMA journal_mode=MEMORY')
    # create the gemini database tables for the new DBs
    create_tables(norm_cursor, "norm")
    create_tables(comp_cursor, "comp")
    return norm_cursor, comp_cursor
def main():
    parser = argparse.ArgumentParser(prog='gemini_sim',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', dest='num_variants', type=int, metavar='INTEGER',
                        help='The number of variants')
    parser.add_argument('-i', dest='num_indivs', type=int, metavar='INTEGER',
                        help='The number of individuals')
    parser.add_argument('-s', dest='name_stub', type=str, metavar='STRING',
                        help='Stub for the file names')
    args = parser.parse_args()

    norm_cursor, comp_cursor = connect_to_dbs(args)

    var_norm_buffer = []
    var_comp_buffer = []
    gts_buffer = []
    num_variants = args.num_variants
    num_samples = args.num_indivs
    buffer_size = 0

    for v_id in range(num_variants):
        # simulate num_samples genotypes for this variant
        genotypes = numpy.random.randint(4, size=num_samples)
        # add the variant and genotypes to the buffers.
        var_norm_buffer.append([v_id + 1, 'chr1', v_id + 1])
        var_comp_buffer.append([v_id + 1, 'chr1', v_id + 1, pack_blob(genotypes)])
        for idx, gt in enumerate(genotypes):
            gts_buffer.append([v_id + 1, idx + 1, int(gt)])
        buffer_size += 1
        if buffer_size % 10000 == 0:
            print "inserting 10000 variants"
            # add the variants to the "normalized" database
            insert_variation(norm_cursor, var_norm_buffer, "norm")
            insert_genotypes(norm_cursor, gts_buffer, "norm")
            # add the variants to the "compressed" database
            insert_variation(comp_cursor, var_comp_buffer, "comp")
            # reset
            var_norm_buffer = []
            var_comp_buffer = []
            gts_buffer = []
    # flush any variants remaining in the buffers
    insert_variation(norm_cursor, var_norm_buffer, "norm")
    insert_genotypes(norm_cursor, gts_buffer, "norm")
    insert_variation(comp_cursor, var_comp_buffer, "comp")
    create_indexes(norm_cursor, "norm")
    create_indexes(comp_cursor, "comp")

if __name__ == "__main__":
    main()
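As a sanity check of the blob encoding, a packed genotype vector written by simulation.py can be decoded back with the same scheme querying.py uses. A minimal sketch, assuming the test databases created earlier exist:

import sqlite3
import zlib
import cPickle
import numpy

conn = sqlite3.connect("test.comp.v100000.i100.db")
conn.row_factory = sqlite3.Row
row = conn.execute("SELECT * FROM variants WHERE v_id = 1").fetchone()
gts = numpy.array(cPickle.loads(zlib.decompress(row['gts'])))
print len(gts)  # expect 100, one genotype per individual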