Last active
August 22, 2018 20:49
-
-
Save oconnor663/717706d3891e4b3b6af0f2cf24911918 to your computer and use it in GitHub Desktop.
two tables in sqlite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python3
"""Demonstrate how much an index speeds up a JOIN in SQLite.

Builds a database with a large ``user_emails`` table and a much smaller
``recent_users`` table, then times the same COUNT(*) join twice: once
without an index on ``user_emails.email`` and once with it.
"""
import os
import sqlite3
import time

# Database file written by main(); deleted and recreated on every run.
DB_FILENAME = "two_tables.db"
# Insert a million entries in user_emails, and a thousand in recent_users.
EMAILS_NUM = 1_000_000
RECENT_EVERY = 1000  # one recent_users row per this many user_emails rows


def create_db(filename):
    """Delete any existing db at *filename*, recreate it with both tables
    (no index on user_emails.email yet), and return the open connection."""
    # Delete the db file so we can recreate it from scratch.
    if os.path.exists(filename):
        os.remove(filename)
    # Create the new db file. At first, don't create an emails index.
    print("creating {}...".format(filename))
    conn = sqlite3.connect(filename)
    conn.execute("""CREATE TABLE user_emails (
        name TEXT PRIMARY KEY,
        email TEXT
    )""")
    conn.execute("""CREATE TABLE recent_users (
        email TEXT PRIMARY KEY,
        access_time TEXT,
        FOREIGN KEY (email) REFERENCES user_emails (email)
    )""")
    return conn


def populate(conn, num_users, recent_every=RECENT_EVERY):
    """Insert *num_users* rows into user_emails and one recent_users row for
    every *recent_every* of them, then commit and ANALYZE."""
    print("inserting", num_users, "users...")
    for i in range(num_users):
        name = "user" + str(i)
        email = name + "@example.com"
        # Note that Python's sqlite library automatically begins a transaction
        # for us the first time we insert. If it didn't do this, individual
        # inserts would be very slow.
        conn.execute("INSERT INTO user_emails (name, email) VALUES (?, ?)",
                     (name, email))
        if i % recent_every == 0:
            conn.execute(
                "INSERT INTO recent_users (email, access_time) VALUES (?, ?)",
                (email, "last Tuesday"))
    conn.commit()
    # IMPORTANT: Analyze the db. This gives sqlite a chance to save some
    # metadata about the size of each table. If we skip this step, the query
    # planner won't know that recent_users is much smaller than user_emails,
    # and it won't choose the right index.
    conn.execute("ANALYZE")


def timed_join(conn):
    """Run the COUNT(*) join of the two tables and return
    (row_count, elapsed_seconds)."""
    start = time.perf_counter()
    count = conn.execute(
        "SELECT COUNT(*) FROM recent_users JOIN user_emails USING (email)"
    ).fetchone()[0]
    end = time.perf_counter()
    return count, end - start


def main():
    """Build the db, then time the join without and with an email index."""
    conn = create_db(DB_FILENAME)
    populate(conn, EMAILS_NUM)
    # Do the first SELECT, before an index has been added, and print out the
    # time it took to run.
    print("joining the two tables without an index...")
    count, elapsed = timed_join(conn)
    # Bug fix: the original "{:.2}" formats to two *significant digits*
    # (e.g. "1.2e+01" for 12.3 seconds); "{:.2f}" gives two decimal places.
    print("joined {} rows in {:.2f} seconds".format(count, elapsed))
    # Now, add an index on the email column in user_emails.
    print("creating an emails index...")
    conn.execute("CREATE INDEX email_index ON user_emails (email)")
    # Finally, do one more SELECT, over the same data as before. This time it
    # should be able to take advantage of the index we just created, and it
    # should be much faster.
    print("joining the two tables with an index...")
    count, elapsed = timed_join(conn)
    print("joined {} rows in {:.2f} seconds".format(count, elapsed))
    conn.close()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment