@arq5x
Last active August 29, 2015 14:07
gemini database design experiment
# update package lists
sudo apt-get update
# get postgres up and running
sudo apt-get install postgresql-9.3
sudo apt-get install postgresql-server-dev-9.3
sudo apt-get install postgresql-client
sudo apt-get install postgresql postgresql-contrib
# set a password for the postgres superuser (\password is run inside psql; quit with \q)
sudo -u postgres psql postgres
\password postgres
(######)
# grab pip and python-dev (req'd for psycopg2)
sudo apt-get install python-pip
sudo apt-get install libpq-dev python-dev
# need psycopg2 to connect to PGSQL and numpy to simulate genotypes
sudo pip install psycopg2
sudo pip install numpy
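Before running the scripts below, it may be worth confirming that psycopg2 can actually reach the server. A minimal sketch, assuming the server is on localhost and the setup above succeeded (the file name is hypothetical):

# check_setup.py: smoke-test the psycopg2 -> PostgreSQL connection
import psycopg2

conn = psycopg2.connect(dbname='postgres', host='localhost')
cur = conn.cursor()
cur.execute('SELECT version()')
print cur.fetchone()[0]
conn.close()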
# querying.py: compare query times for the normalized vs. compressed SQLite databases
import sqlite3
import zlib
import cPickle
import numpy
import sys
import os
import argparse
import timeit
def unpack_genotype_blob(blob):
    return numpy.array(cPickle.loads(zlib.decompress(blob)))

def connect_to_dbs(args):
    db_norm = args.name_stub + ".norm.v" + str(args.num_variants) + ".i" + str(args.num_indivs) + ".db"
    db_comp = args.name_stub + ".comp.v" + str(args.num_variants) + ".i" + str(args.num_indivs) + ".db"
    norm_conn = sqlite3.connect(db_norm)
    comp_conn = sqlite3.connect(db_comp)
    norm_conn.row_factory = sqlite3.Row
    comp_conn.row_factory = sqlite3.Row
    norm_cursor = norm_conn.cursor()
    comp_cursor = comp_conn.cursor()
    return norm_cursor, comp_cursor
def main():
    parser = argparse.ArgumentParser(prog='query',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', dest='num_variants', type=int, metavar='INTEGER',
                        help='The number of variants')
    parser.add_argument('-i', dest='num_indivs', type=int, metavar='INTEGER',
                        help='The number of individuals')
    parser.add_argument('-s', dest='name_stub', type=str, metavar='STRING',
                        help='Stub for the file names')
    args = parser.parse_args()

    norm_cursor, comp_cursor = connect_to_dbs(args)

    # header
    print '\t'.join(["strategy", "exp", "matches", "time"])

    # time a query to the normalized database
    start_time = timeit.default_timer()
    num_matches = 0
    for row in norm_cursor.execute("""SELECT distinct v.v_id FROM variants v, genotypes g
                                      WHERE v.v_id = g.v_id
                                      AND g.gt = 1
                                      AND g.s_id = 7"""):
        num_matches += 1
    elapsed = timeit.default_timer() - start_time
    print '\t'.join(["norm", "one_ind", str(num_matches), str(elapsed)])

    # time the identical query to the compressed database
    start_time = timeit.default_timer()
    num_matches = 0
    for row in comp_cursor.execute("""SELECT * FROM variants"""):
        # must decode every row's genotype blob just to test one sample
        gts = unpack_genotype_blob(row['gts'])
        if gts[6] == 1:
            num_matches += 1
    elapsed = timeit.default_timer() - start_time
    print '\t'.join(["comp", "one_ind", str(num_matches), str(elapsed)])

if __name__ == "__main__":
    main()
# create normalized and compressed (existing strategy) databases simulating 100000 variants and 100 samples
python simulation.py -n 100000 -i 100 -s test
# ls -1
test.norm.v100000.i100.db
test.comp.v100000.i100.db
# test the run times for identifying rows where a single individual has a specific genotype.
python querying.py -n 100000 -i 100 -s test
strategy exp matches time
norm one_ind 24982 0.16601395607
comp one_ind 24982 2.40940999985
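The compressed strategy loses here because every row's genotype blob must be decompressed and unpickled just to test a single sample, while the normalized database can answer from its (s_id, gt) index. A rough sketch of the decode overhead alone, using the same zlib/cPickle scheme as the scripts (numbers are illustrative, not from the run above):

import timeit
import zlib
import cPickle
import numpy

gts = numpy.random.randint(4, size=100)
blob = zlib.compress(cPickle.dumps(gts, cPickle.HIGHEST_PROTOCOL), 9)

# average cost of one decode, scaled to the 100000 variants queried above
per_decode = timeit.timeit(lambda: cPickle.loads(zlib.decompress(blob)), number=1000) / 1000
print "estimated decode cost for 100000 rows: %.3fs" % (per_decode * 100000)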
# simulation-postgres.py: load simulated variants into PostgreSQL with genotypes
# stored as an integer[] column, then time a query on three specific genotypes
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import numpy
import sys
import os
import argparse
import timeit

# usage: python simulation-postgres.py -n 1000000 -i 100 -s test
"""
Note: you must have a PostreSQL server running. For OSX (other POSIX?)
start:
pg_ctl -D /usr/local/var/postgres -l /usr/local/var/postgres/server.log start
stop:
pg_ctl -D /usr/local/var/postgres stop -s -m fast
"""
def create_tables(cursor):
    cursor.execute('''drop table if exists variants''')
    cursor.execute('''create table if not exists variants (
                          v_id integer,
                          chrom text,
                          start integer,
                          gts integer[],
                          PRIMARY KEY(v_id))''')

def create_indexes(cursor):
    # GIN indexes on integer[] serve the array containment/overlap operators
    # (@>, <@, &&), not positional lookups like gts[7]
    cursor.execute('''create index vid_idx on variants using GIN (gts)''')

def insert_variation(cursor, buffer):
    cursor.execute("BEGIN TRANSACTION")
    cursor.executemany('insert into variants values (%s,%s,%s,%s)', buffer)
    cursor.execute("END TRANSACTION")

def connect_to_dbs(args):
    # connect to the default database so the experiment database can be (re)created
    conn = psycopg2.connect(dbname='postgres', host='localhost')
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    curr = conn.cursor()
    curr.execute('DROP DATABASE IF EXISTS ' + args.name_stub)
    curr.execute('CREATE DATABASE ' + args.name_stub)
    curr.close()
    conn.close()
    # reconnect to the newly created database before creating tables
    conn = psycopg2.connect(dbname=args.name_stub, host='localhost')
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    curr = conn.cursor()
    create_tables(curr)
    return conn, curr
def main():
    parser = argparse.ArgumentParser(prog='gemini_sim',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', dest='num_variants', type=int, metavar='INTEGER',
                        help='The number of variants')
    parser.add_argument('-i', dest='num_indivs', type=int, metavar='INTEGER',
                        help='The number of individuals')
    parser.add_argument('-s', dest='name_stub', type=str, metavar='STRING',
                        help='Stub for the file names')
    args = parser.parse_args()

    conn, cur = connect_to_dbs(args)

    var_arry_buffer = []
    num_variants = args.num_variants
    num_samples = args.num_indivs
    buffer_size = 0

    # Load the database with simulated variants and genotypes.
    start_time = timeit.default_timer()
    for v_id in range(num_variants):
        # simulate num_samples genotypes for this variant
        genotypes = list(numpy.random.randint(4, size=num_samples))
        # add the variant and genotypes to the buffer.
        var_arry_buffer.append((v_id + 1, 'chr1', v_id + 1, genotypes))
        buffer_size += 1
        if buffer_size % 100000 == 0:
            print "inserting 100000 variants"
            insert_variation(cur, var_arry_buffer)
            # reset
            var_arry_buffer = []
    # flush any remaining buffered variants
    insert_variation(cur, var_arry_buffer)
    create_indexes(cur)
    conn.commit()
    elapsed = timeit.default_timer() - start_time
    print "Loading and indexing: " + str(elapsed)

    # Test the query time.
    start_time = timeit.default_timer()
    hits = 0
    cur.execute("""SELECT * FROM variants
                   WHERE gts[7] = 1
                   AND gts[9] = 0
                   AND gts[17] = 2""")
    for row in cur:
        hits += 1
        # print row[0], row[3][6], row[3][8], row[3][16]
    elapsed = timeit.default_timer() - start_time
    print "Querying on 3 specific genotypes: " + str(elapsed)

if __name__ == "__main__":
    main()
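One caveat on the index above (a note about PostgreSQL generally, not something this gist measures): a GIN index on an integer[] column can serve the array containment and overlap operators, not positional lookups like gts[7] = 1, so the timed query still scans the table. A containment query that could use the index would look like this sketch, where cur is the cursor from main():

# sketch: a query the GIN array index can serve
cur.execute("SELECT count(*) FROM variants WHERE gts @> ARRAY[3]")
print cur.fetchone()[0]  # number of variants where some sample has genotype 3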
# simulation.py: build a "normalized" SQLite database (one genotype row per
# variant/sample pair) and a "compressed" one (a zlib/cPickle blob of all
# genotypes per variant -- gemini's existing strategy)
import sqlite3
import zlib
import cPickle
import numpy
import sys
import os
import argparse
def create_tables(cursor, db_type):
    if db_type == "norm":
        cursor.execute('''create table if not exists variants (
                              v_id integer,
                              chrom text,
                              start integer,
                              PRIMARY KEY(v_id ASC))''')
        cursor.execute('''create table if not exists genotypes (
                              v_id integer,
                              s_id integer,
                              gt integer,
                              PRIMARY KEY(v_id, s_id ASC))''')
    elif db_type == "comp":
        cursor.execute('''create table if not exists variants (
                              v_id integer,
                              chrom text,
                              start integer,
                              gts BLOB,
                              PRIMARY KEY(v_id ASC))''')

def create_indexes(cursor, db_type):
    cursor.execute('''create index vid_idx on variants(v_id)''')
    if db_type == "norm":
        cursor.execute('''create index vid_idx2 on genotypes(v_id)''')
        cursor.execute('''create index sid_idx on genotypes(s_id, gt)''')

def insert_variation(cursor, buffer, db_type):
    if db_type == "norm":
        cursor.execute("BEGIN TRANSACTION")
        cursor.executemany('insert into variants values (?,?,?)', buffer)
        cursor.execute("END TRANSACTION")
    elif db_type == "comp":
        cursor.execute("BEGIN TRANSACTION")
        cursor.executemany('insert into variants values (?,?,?,?)', buffer)
        cursor.execute("END TRANSACTION")

def insert_genotypes(cursor, buffer, db_type):
    if db_type == "norm":
        cursor.execute("BEGIN TRANSACTION")
        cursor.executemany('insert into genotypes values (?,?,?)', buffer)
        cursor.execute("END TRANSACTION")

def zdumps(obj):
    return zlib.compress(cPickle.dumps(obj, cPickle.HIGHEST_PROTOCOL), 9)

def pack_blob(obj):
    return sqlite3.Binary(zdumps(obj))
def connect_to_dbs(args):
    # open up new databases, removing files left over from previous runs
    db_norm = args.name_stub + ".norm.v" + str(args.num_variants) + ".i" + str(args.num_indivs) + ".db"
    db_comp = args.name_stub + ".comp.v" + str(args.num_variants) + ".i" + str(args.num_indivs) + ".db"
    if os.path.exists(db_norm):
        os.remove(db_norm)
    if os.path.exists(db_comp):
        os.remove(db_comp)
    norm_conn = sqlite3.connect(db_norm)
    comp_conn = sqlite3.connect(db_comp)
    # autocommit mode; transactions are managed explicitly during bulk inserts
    norm_conn.isolation_level = None
    comp_conn.isolation_level = None
    norm_cursor = norm_conn.cursor()
    comp_cursor = comp_conn.cursor()
    # trade durability for load speed
    norm_cursor.execute('PRAGMA synchronous = OFF')
    norm_cursor.execute('PRAGMA journal_mode=MEMORY')
    comp_cursor.execute('PRAGMA synchronous = OFF')
    comp_cursor.execute('PRAGMA journal_mode=MEMORY')
    # create the gemini database tables for the new DBs
    create_tables(norm_cursor, "norm")
    create_tables(comp_cursor, "comp")
    return norm_cursor, comp_cursor
def main():
    parser = argparse.ArgumentParser(prog='gemini_sim',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', dest='num_variants', type=int, metavar='INTEGER',
                        help='The number of variants')
    parser.add_argument('-i', dest='num_indivs', type=int, metavar='INTEGER',
                        help='The number of individuals')
    parser.add_argument('-s', dest='name_stub', type=str, metavar='STRING',
                        help='Stub for the file names')
    args = parser.parse_args()

    norm_cursor, comp_cursor = connect_to_dbs(args)

    var_norm_buffer = []
    var_comp_buffer = []
    gts_buffer = []
    num_variants = args.num_variants
    num_samples = args.num_indivs
    buffer_size = 0

    for v_id in range(num_variants):
        # simulate num_samples genotypes for this variant
        genotypes = numpy.random.randint(4, size=num_samples)
        # add the variant and genotypes to the buffers.
        var_norm_buffer.append([v_id + 1, 'chr1', v_id + 1])
        var_comp_buffer.append([v_id + 1, 'chr1', v_id + 1, pack_blob(genotypes)])
        for idx, gt in enumerate(genotypes):
            gts_buffer.append([v_id + 1, idx + 1, int(gt)])
        buffer_size += 1
        if buffer_size % 10000 == 0:
            print "inserting 10000 variants"
            # add the variants to the "normalized" database
            insert_variation(norm_cursor, var_norm_buffer, "norm")
            insert_genotypes(norm_cursor, gts_buffer, "norm")
            # add the variants to the "compressed" database
            insert_variation(comp_cursor, var_comp_buffer, "comp")
            # reset
            var_norm_buffer = []
            var_comp_buffer = []
            gts_buffer = []
    # flush any variants remaining in the buffers
    insert_variation(norm_cursor, var_norm_buffer, "norm")
    insert_genotypes(norm_cursor, gts_buffer, "norm")
    insert_variation(comp_cursor, var_comp_buffer, "comp")
    create_indexes(norm_cursor, "norm")
    create_indexes(comp_cursor, "comp")

if __name__ == "__main__":
    main()
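As a sanity check of the blob encoding, a packed genotype vector written by simulation.py can be decoded back with the same scheme querying.py uses. A minimal sketch, assuming the test databases created earlier exist:

import sqlite3
import zlib
import cPickle
import numpy

conn = sqlite3.connect("test.comp.v100000.i100.db")
conn.row_factory = sqlite3.Row
row = conn.execute("SELECT * FROM variants WHERE v_id = 1").fetchone()
gts = numpy.array(cPickle.loads(zlib.decompress(row['gts'])))
print len(gts)  # expect 100, one genotype per individual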