Skip to content

Instantly share code, notes, and snippets.

@kmuthukk
Created September 27, 2020 03:14
Show Gist options
  • Save kmuthukk/3d6b325b80c5f5474d7d391f365d51bb to your computer and use it in GitHub Desktop.
Save kmuthukk/3d6b325b80c5f5474d7d391f365d51bb to your computer and use it in GitHub Desktop.
Test case for GROUP BY index_column LIMIT n using table scan instead of index scan
# Dependencies:
# On CentOS you can install psycopg2 thus:
#
# sudo yum install postgresql-libs
# sudo yum install python-psycopg2
import psycopg2;
import time;
from datetime import datetime
from multiprocessing.dummy import Pool as ThreadPool
def create_table(table_name, index_type):
conn = psycopg2.connect("host=localhost dbname=postgres user=postgres port=5433")
conn.set_session(autocommit=True)
cur = conn.cursor()
cur.execute("""DROP TABLE IF EXISTS """ + table_name);
cur.execute("""CREATE TABLE IF NOT EXISTS """ + table_name + """(
k text not null,
v text not null,
PRIMARY KEY((k) HASH))
""")
print("Created table: " + table_name)
table_name_v_idx = table_name + "_v_idx"
cur.execute("""CREATE INDEX """ + table_name_v_idx + """ ON """ + table_name + """(v {})""".format(index_type))
print("Created index: " + table_name_v_idx)
def load_data(table_name):
conn = psycopg2.connect("host=localhost dbname=postgres user=postgres port=5433")
conn.set_session(autocommit=True)
cur = conn.cursor()
num_rows = 200
for idx in range(num_rows):
cur.execute("INSERT INTO " + table_name + "(k, v) VALUES (%s, %s)",
("k-"+str(idx), "v-"+str(idx)))
print("Inserted %d rows" % (num_rows))
def print_select_results(records):
for record in records:
print(record)
print("===========")
def select_data():
conn = psycopg2.connect("host=localhost dbname=postgres user=postgres port=5433")
conn.set_session(autocommit=True)
cur = conn.cursor()
for limit in [100, 200]:
print("Testing with limit: {}".format(limit))
cur.execute("""EXPLAIN SELECT v, count(*) FROM tab GROUP BY v limit {}""".format(limit))
records = cur.fetchall()
print_select_results(records)
# Main
print("Running tests with ASC index for column v")
create_table("tab", "ASC")
load_data("tab")
select_data()
print("===========================================")
print("Running tests with HASH index for column v")
create_table("tab", "HASH")
load_data("tab")
select_data()
print("===========================================")
@kmuthukk
Copy link
Author

$ python3 ~/notes/ysql_group_by_index_col.py
Running tests with ASC index for column v
Created table: tab
Created index: tab_v_idx
Inserted 200 rows
Testing with limit: 100
('Limit  (cost=0.00..60.50 rows=100 width=40)',)
('  ->  GroupAggregate  (cost=0.00..121.00 rows=200 width=40)',)
('        Group Key: v',)
('        ->  Index Only Scan using tab_v_idx on tab  (cost=0.00..114.00 rows=1000 width=32)',)
=====================
Testing with limit: 200
('Limit  (cost=105.00..107.00 rows=200 width=40)',)
('  ->  HashAggregate  (cost=105.00..107.00 rows=200 width=40)',)
('        Group Key: v',)
('        ->  Foreign Scan on tab  (cost=0.00..100.00 rows=1000 width=32)',)
=====================
===========================================
Running tests with HASH index for column v
Created table: tab
Created index: tab_v_idx
Inserted 200 rows
Testing with limit: 100
('Limit  (cost=105.00..106.00 rows=100 width=40)',)
('  ->  HashAggregate  (cost=105.00..107.00 rows=200 width=40)',)
('        Group Key: v',)
('        ->  Foreign Scan on tab  (cost=0.00..100.00 rows=1000 width=32)',)
=====================
Testing with limit: 200
('Limit  (cost=105.00..107.00 rows=200 width=40)',)
('  ->  HashAggregate  (cost=105.00..107.00 rows=200 width=40)',)
('        Group Key: v',)
('        ->  Foreign Scan on tab  (cost=0.00..100.00 rows=1000 width=32)',)
=====================
===========================================

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment