two tables in sqlite
#! /usr/bin/python3
import os
import sqlite3
import time
filename = "two_tables.db"
# Delete the db file so we can recreate it from scratch.
if os.path.exists(filename):
    os.remove(filename)
# Create the new db file. At first, don't create an emails index.
print("creating {}...".format(filename))
conn = sqlite3.connect(filename)
conn.execute("""CREATE TABLE user_emails (
name TEXT PRIMARY KEY,
email TEXT
)""")
conn.execute("""CREATE TABLE recent_users (
email TEXT PRIMARY KEY,
access_time TEXT,
FOREIGN KEY (email) REFERENCES user_emails (email)
)""")
# Insert a million entries in user_emails, and a thousand in recent_users.
EMAILS_NUM = 1_000_000
print("inserting", EMAILS_NUM, "users...")
for i in range(EMAILS_NUM):
    name = "user" + str(i)
    email = name + "@example.com"
    # Note that Python's sqlite library automatically begins a transaction for
    # us the first time we insert. If it didn't do this, individual inserts
    # would be very slow.
    conn.execute("INSERT INTO user_emails (name, email) VALUES (?, ?)",
                 (name, email))
    if i % 1000 == 0:
        conn.execute(
            "INSERT INTO recent_users (email, access_time) VALUES (?, ?)",
            (email, "last Tuesday"))
conn.commit()
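# Expanding on the transaction note inside the loop (an added sketch, not
# part of the original gist): if you want explicit control over where
# transactions begin and end, you can put the connection in autocommit mode
# and issue BEGIN/COMMIT yourself:
#
#     conn.isolation_level = None  # autocommit; no implicit transactions
#     conn.execute("BEGIN")
#     # ...do a batch of inserts...
#     conn.execute("COMMIT")
#
# Either way, the point stands: one transaction per insert would be very slow.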
# IMPORTANT: Analyze the db. This gives sqlite a chance to save some metadata
# about the size of each table. If we skip this step, the query planner won't
# know that recent_users is much smaller than user_emails, and it won't choose
# the right index.
conn.execute("ANALYZE")
# Do the first SELECT, before any index has been added, and print out the
# time it took to run.
print("joining the two tables without an index...")
start = time.perf_counter()
count = conn.execute(
    "SELECT COUNT(*) FROM recent_users JOIN user_emails USING (email)"
).fetchone()[0]
end = time.perf_counter()
print("joined {} rows in {:.2} seconds".format(count, end - start))
# Now, add an index on the email column in user_emails.
print("creating an emails index...")
conn.execute("CREATE INDEX email_index ON user_emails (email)")
# Finally, do one more SELECT, over the same data as before. This time it
# should be able to take advantage of the index we just created, and it should
# be much faster.
print("joining the two tables with an index...")
start = time.perf_counter()
count = conn.execute(
    "SELECT COUNT(*) FROM recent_users JOIN user_emails USING (email)"
).fetchone()[0]
end = time.perf_counter()
print("joined {} rows in {:.2} seconds".format(count, end - start))