Last active
August 22, 2018 20:49
-
-
Save oconnor663/717706d3891e4b3b6af0f2cf24911918 to your computer and use it in GitHub Desktop.
two tables in sqlite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python3
"""Demonstrate how much an index speeds up a JOIN in SQLite.

Builds a database with a large ``user_emails`` table and a much smaller
``recent_users`` table, then times the same COUNT(*) join twice: once
without an index on ``user_emails.email`` and once with it.
"""
import os
import sqlite3
import time

# Database file written by main(); deleted and recreated on every run.
DB_FILENAME = "two_tables.db"
# Insert a million entries in user_emails, and a thousand in recent_users.
EMAILS_NUM = 1_000_000
RECENT_EVERY = 1000  # one recent_users row per this many user_emails rows


def create_db(filename):
    """Delete any existing db at *filename*, recreate it with both tables
    (no index on user_emails.email yet), and return the open connection."""
    # Delete the db file so we can recreate it from scratch.
    if os.path.exists(filename):
        os.remove(filename)
    # Create the new db file. At first, don't create an emails index.
    print("creating {}...".format(filename))
    conn = sqlite3.connect(filename)
    conn.execute("""CREATE TABLE user_emails (
        name TEXT PRIMARY KEY,
        email TEXT
    )""")
    conn.execute("""CREATE TABLE recent_users (
        email TEXT PRIMARY KEY,
        access_time TEXT,
        FOREIGN KEY (email) REFERENCES user_emails (email)
    )""")
    return conn


def populate(conn, num_users, recent_every=RECENT_EVERY):
    """Insert *num_users* rows into user_emails and one recent_users row for
    every *recent_every* of them, then commit and ANALYZE."""
    print("inserting", num_users, "users...")
    for i in range(num_users):
        name = "user" + str(i)
        email = name + "@example.com"
        # Note that Python's sqlite library automatically begins a transaction
        # for us the first time we insert. If it didn't do this, individual
        # inserts would be very slow.
        conn.execute("INSERT INTO user_emails (name, email) VALUES (?, ?)",
                     (name, email))
        if i % recent_every == 0:
            conn.execute(
                "INSERT INTO recent_users (email, access_time) VALUES (?, ?)",
                (email, "last Tuesday"))
    conn.commit()
    # IMPORTANT: Analyze the db. This gives sqlite a chance to save some
    # metadata about the size of each table. If we skip this step, the query
    # planner won't know that recent_users is much smaller than user_emails,
    # and it won't choose the right index.
    conn.execute("ANALYZE")


def timed_join(conn):
    """Run the COUNT(*) join of the two tables and return
    (row_count, elapsed_seconds)."""
    start = time.perf_counter()
    count = conn.execute(
        "SELECT COUNT(*) FROM recent_users JOIN user_emails USING (email)"
    ).fetchone()[0]
    end = time.perf_counter()
    return count, end - start


def main():
    """Build the db, then time the join without and with an email index."""
    conn = create_db(DB_FILENAME)
    populate(conn, EMAILS_NUM)
    # Do the first SELECT, before an index has been added, and print out the
    # time it took to run.
    print("joining the two tables without an index...")
    count, elapsed = timed_join(conn)
    # Bug fix: the original "{:.2}" formats to two *significant digits*
    # (e.g. "1.2e+01" for 12.3 seconds); "{:.2f}" gives two decimal places.
    print("joined {} rows in {:.2f} seconds".format(count, elapsed))
    # Now, add an index on the email column in user_emails.
    print("creating an emails index...")
    conn.execute("CREATE INDEX email_index ON user_emails (email)")
    # Finally, do one more SELECT, over the same data as before. This time it
    # should be able to take advantage of the index we just created, and it
    # should be much faster.
    print("joining the two tables with an index...")
    count, elapsed = timed_join(conn)
    print("joined {} rows in {:.2f} seconds".format(count, elapsed))
    conn.close()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment