kmuthukk/sql_json_read_write.py

## sql_json_read_write.py
# Dependencies:
# On CentOS you can install psycopg2 thus:
#
#  sudo yum install postgresql-libs
#  sudo yum install python-psycopg2

import psycopg2;
import time
import random
from multiprocessing.dummy import Pool as ThreadPool
from functools import partial

connect_str="host=localhost dbname=yugabyte user=yugabyte port=5433"

# Load Phase params
num_write_threads=2
num_users=10
num_actions_per_user=1000

# Read Only Phase params
num_reads=100

def create_table(col_type, num_attrs):
  conn = psycopg2.connect(connect_str)
  conn.set_session(autocommit=True)
  cur = conn.cursor()
  start_time = time.time()
  cur.execute("""DROP TABLE IF EXISTS user_activity""");
  now_time = time.time()
  print("Dropped (if exists): user_activity table")
  print("Time: %s ms" % ((now_time - start_time) * 1000))
  print("====================")
  start_time = time.time()
  cur.execute("""
        CREATE TABLE IF NOT EXISTS user_activity(
           id        text,
           action_id integer,
           action    """ + col_type + """,
           PRIMARY KEY(id, action_id)
        )
    """)
  now_time = time.time()
  print("Created user_activity table")
  print("Time: %s ms" % ((now_time - start_time) * 1000))
  print("====================")


def load_data_worker(col_type, num_attrs, thread_num):
  thread_id = str(thread_num)
  conn = psycopg2.connect(connect_str)
  conn.set_session(autocommit=True)
  cur = conn.cursor()

  print("Thread-" + thread_id + ": Inserting %d users with %d actions with %d attrs in %s column "
        % (num_users, num_actions_per_user, num_attrs, col_type))
  start_time = time.time()
  for idx in range(num_users):
    for jdx in range(num_actions_per_user):
      json_val = '{';
      for kdx in range(num_attrs):
        separator = ', ' if kdx != 0 else ''
        json_val = json_val + separator + '"attr_' + str(kdx) + '" : ' + str(1000*jdx + kdx)
      json_val = json_val + '}'
      cur.execute("""INSERT INTO user_activity (id, action_id, action) VALUES (%s, %s, %s)""",
                  ("user-"+thread_id+"-"+str(idx),
                   jdx,
                   json_val))

  now_time = time.time()
  rows_inserted = num_users * num_actions_per_user
  print("Thread-" + thread_id + ": Inserted %d rows" %  (rows_inserted))
  print("Thread-" + thread_id + ": Time: %s ms" % ((now_time - start_time) * 1000))
  print("Thread-" + thread_id + ": Inserts/sec: %s" % (rows_inserted / (now_time - start_time)))
  print("Thread-" + thread_id + ": Avg Time: %s ms" % ((now_time - start_time) * 1000 / (rows_inserted)))

def load_data(col_type, num_attrs):
  pool = ThreadPool(num_write_threads)
  t1 = time.time()
  results = pool.map(partial(load_data_worker, col_type, num_attrs), range(num_write_threads))
  t2 = time.time()
  total_rows=num_users*num_write_threads
  print("====================")
  print("Inserted %d users with %d actions and %d attributes per action (type %s)" %
        (total_rows, num_actions_per_user, num_attrs, col_type))
  print("Total Time: %s ms" % ((t2 - t1) * 1000))
  print("Inserts/sec: %s" % (total_rows / (t2 - t1)))
  print("====================")

def read_data(col_type, num_attrs):
  conn = psycopg2.connect(connect_str)
  conn.set_session(autocommit=True)
  cur = conn.cursor()

  cur.execute("PREPARE my_select(text) AS SELECT action_id, action FROM user_activity WHERE id=$1")
  delta = 0
  for i in range(num_reads):
    # generate a random writer thread id + user id to read
    rand_thread_id = str(random.randint(0, num_write_threads-1))
    rand_user_id = str(random.randint(0, num_users-1))

    user_id = "user-"+rand_thread_id+"-"+rand_user_id;
    start_time = time.time()
    cur.execute("EXECUTE my_select(%s)", (user_id, ))
    rows = cur.fetchall()
    now_time = time.time()
    delta = delta + (now_time - start_time)

    # For debugging
    # print("Found=" + str(rows))

  print("Avg Time For Range Query: %s ms (returns %d rows each of %d attrs in %s column)" %
        ((delta * 1000) / num_reads, num_actions_per_user, num_attrs, col_type))


# Main
for col_type in ["jsonb", "json", "text"]:
  for attr_count in [1, 4, 40, 100, 400]:
    create_table(col_type, attr_count)
    load_data(col_type, attr_count)
    read_data(col_type, attr_count)
	# Dependencies:
	# On CentOS you can install psycopg2 thus:
	#
	# sudo yum install postgresql-libs
	# sudo yum install python-psycopg2

	import psycopg2;
	import time
	import random
	from multiprocessing.dummy import Pool as ThreadPool
	from functools import partial

	connect_str="host=localhost dbname=yugabyte user=yugabyte port=5433"

	# Load Phase params
	num_write_threads=2
	num_users=10
	num_actions_per_user=1000

	# Read Only Phase params
	num_reads=100

	def create_table(col_type, num_attrs):
	conn = psycopg2.connect(connect_str)
	conn.set_session(autocommit=True)
	cur = conn.cursor()
	start_time = time.time()
	cur.execute("""DROP TABLE IF EXISTS user_activity""");
	now_time = time.time()
	print("Dropped (if exists): user_activity table")
	print("Time: %s ms" % ((now_time - start_time) * 1000))
	print("====================")
	start_time = time.time()
	cur.execute("""
	CREATE TABLE IF NOT EXISTS user_activity(
	id text,
	action_id integer,
	action """ + col_type + """,
	PRIMARY KEY(id, action_id)
	)
	""")
	now_time = time.time()
	print("Created user_activity table")
	print("Time: %s ms" % ((now_time - start_time) * 1000))
	print("====================")


	def load_data_worker(col_type, num_attrs, thread_num):
	thread_id = str(thread_num)
	conn = psycopg2.connect(connect_str)
	conn.set_session(autocommit=True)
	cur = conn.cursor()

	print("Thread-" + thread_id + ": Inserting %d users with %d actions with %d attrs in %s column "
	% (num_users, num_actions_per_user, num_attrs, col_type))
	start_time = time.time()
	for idx in range(num_users):
	for jdx in range(num_actions_per_user):
	json_val = '{';
	for kdx in range(num_attrs):
	separator = ', ' if kdx != 0 else ''
	json_val = json_val + separator + '"attr_' + str(kdx) + '" : ' + str(1000*jdx + kdx)
	json_val = json_val + '}'
	cur.execute("""INSERT INTO user_activity (id, action_id, action) VALUES (%s, %s, %s)""",
	("user-"+thread_id+"-"+str(idx),
	jdx,
	json_val))

	now_time = time.time()
	rows_inserted = num_users * num_actions_per_user
	print("Thread-" + thread_id + ": Inserted %d rows" % (rows_inserted))
	print("Thread-" + thread_id + ": Time: %s ms" % ((now_time - start_time) * 1000))
	print("Thread-" + thread_id + ": Inserts/sec: %s" % (rows_inserted / (now_time - start_time)))
	print("Thread-" + thread_id + ": Avg Time: %s ms" % ((now_time - start_time) * 1000 / (rows_inserted)))

	def load_data(col_type, num_attrs):
	pool = ThreadPool(num_write_threads)
	t1 = time.time()
	results = pool.map(partial(load_data_worker, col_type, num_attrs), range(num_write_threads))
	t2 = time.time()
	total_rows=num_users*num_write_threads
	print("====================")
	print("Inserted %d users with %d actions and %d attributes per action (type %s)" %
	(total_rows, num_actions_per_user, num_attrs, col_type))
	print("Total Time: %s ms" % ((t2 - t1) * 1000))
	print("Inserts/sec: %s" % (total_rows / (t2 - t1)))
	print("====================")

	def read_data(col_type, num_attrs):
	conn = psycopg2.connect(connect_str)
	conn.set_session(autocommit=True)
	cur = conn.cursor()

	cur.execute("PREPARE my_select(text) AS SELECT action_id, action FROM user_activity WHERE id=$1")
	delta = 0
	for i in range(num_reads):
	# generate a random writer thread id + user id to read
	rand_thread_id = str(random.randint(0, num_write_threads-1))
	rand_user_id = str(random.randint(0, num_users-1))

	user_id = "user-"+rand_thread_id+"-"+rand_user_id;
	start_time = time.time()
	cur.execute("EXECUTE my_select(%s)", (user_id, ))
	rows = cur.fetchall()
	now_time = time.time()
	delta = delta + (now_time - start_time)

	# For debugging
	# print("Found=" + str(rows))

	print("Avg Time For Range Query: %s ms (returns %d rows each of %d attrs in %s column)" %
	((delta * 1000) / num_reads, num_actions_per_user, num_attrs, col_type))



	# Main
	for col_type in ["jsonb", "json", "text"]:
	for attr_count in [1, 4, 40, 100, 400]:
	create_table(col_type, attr_count)
	load_data(col_type, attr_count)
	read_data(col_type, attr_count)