Last active
June 8, 2022 18:15
-
-
Save kmuthukk/5eb29dc11ab736c3fe71f37e84fb95dc to your computer and use it in GitHub Desktop.
Program to test range scan performance on columns with jsonb payload and varying number of jsonb attributes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Dependencies: | |
# On CentOS you can install psycopg2 thus: | |
# | |
# sudo yum install postgresql-libs | |
# sudo yum install python-psycopg2 | |
import psycopg2; | |
import time | |
import random | |
from multiprocessing.dummy import Pool as ThreadPool | |
from functools import partial | |
connect_str="host=localhost dbname=yugabyte user=yugabyte port=5433" | |
# Load Phase params | |
num_write_threads=2 | |
num_users=10 | |
num_actions_per_user=1000 | |
# Read Only Phase params | |
num_reads=100 | |
def create_table(col_type, num_attrs): | |
conn = psycopg2.connect(connect_str) | |
conn.set_session(autocommit=True) | |
cur = conn.cursor() | |
start_time = time.time() | |
cur.execute("""DROP TABLE IF EXISTS user_activity"""); | |
now_time = time.time() | |
print("Dropped (if exists): user_activity table") | |
print("Time: %s ms" % ((now_time - start_time) * 1000)) | |
print("====================") | |
start_time = time.time() | |
cur.execute(""" | |
CREATE TABLE IF NOT EXISTS user_activity( | |
id text, | |
action_id integer, | |
action """ + col_type + """, | |
PRIMARY KEY(id, action_id) | |
) | |
""") | |
now_time = time.time() | |
print("Created user_activity table") | |
print("Time: %s ms" % ((now_time - start_time) * 1000)) | |
print("====================") | |
def load_data_worker(col_type, num_attrs, thread_num): | |
thread_id = str(thread_num) | |
conn = psycopg2.connect(connect_str) | |
conn.set_session(autocommit=True) | |
cur = conn.cursor() | |
print("Thread-" + thread_id + ": Inserting %d users with %d actions with %d attrs in %s column " | |
% (num_users, num_actions_per_user, num_attrs, col_type)) | |
start_time = time.time() | |
for idx in range(num_users): | |
for jdx in range(num_actions_per_user): | |
json_val = '{'; | |
for kdx in range(num_attrs): | |
separator = ', ' if kdx != 0 else '' | |
json_val = json_val + separator + '"attr_' + str(kdx) + '" : ' + str(1000*jdx + kdx) | |
json_val = json_val + '}' | |
cur.execute("""INSERT INTO user_activity (id, action_id, action) VALUES (%s, %s, %s)""", | |
("user-"+thread_id+"-"+str(idx), | |
jdx, | |
json_val)) | |
now_time = time.time() | |
rows_inserted = num_users * num_actions_per_user | |
print("Thread-" + thread_id + ": Inserted %d rows" % (rows_inserted)) | |
print("Thread-" + thread_id + ": Time: %s ms" % ((now_time - start_time) * 1000)) | |
print("Thread-" + thread_id + ": Inserts/sec: %s" % (rows_inserted / (now_time - start_time))) | |
print("Thread-" + thread_id + ": Avg Time: %s ms" % ((now_time - start_time) * 1000 / (rows_inserted))) | |
def load_data(col_type, num_attrs): | |
pool = ThreadPool(num_write_threads) | |
t1 = time.time() | |
results = pool.map(partial(load_data_worker, col_type, num_attrs), range(num_write_threads)) | |
t2 = time.time() | |
total_rows=num_users*num_write_threads | |
print("====================") | |
print("Inserted %d users with %d actions and %d attributes per action (type %s)" % | |
(total_rows, num_actions_per_user, num_attrs, col_type)) | |
print("Total Time: %s ms" % ((t2 - t1) * 1000)) | |
print("Inserts/sec: %s" % (total_rows / (t2 - t1))) | |
print("====================") | |
def read_data(col_type, num_attrs): | |
conn = psycopg2.connect(connect_str) | |
conn.set_session(autocommit=True) | |
cur = conn.cursor() | |
cur.execute("PREPARE my_select(text) AS SELECT action_id, action FROM user_activity WHERE id=$1") | |
delta = 0 | |
for i in range(num_reads): | |
# generate a random writer thread id + user id to read | |
rand_thread_id = str(random.randint(0, num_write_threads-1)) | |
rand_user_id = str(random.randint(0, num_users-1)) | |
user_id = "user-"+rand_thread_id+"-"+rand_user_id; | |
start_time = time.time() | |
cur.execute("EXECUTE my_select(%s)", (user_id, )) | |
rows = cur.fetchall() | |
now_time = time.time() | |
delta = delta + (now_time - start_time) | |
# For debugging | |
# print("Found=" + str(rows)) | |
print("Avg Time For Range Query: %s ms (returns %d rows each of %d attrs in %s column)" % | |
((delta * 1000) / num_reads, num_actions_per_user, num_attrs, col_type)) | |
# Main | |
for col_type in ["jsonb", "json", "text"]: | |
for attr_count in [1, 4, 40, 100, 400]: | |
create_table(col_type, attr_count) | |
load_data(col_type, attr_count) | |
read_data(col_type, attr_count) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment