Update: Added a Python version (pwned-passwords-sqlite-build.py) that preserves counts and uses binary keys rather than text.
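The core of that approach is simple enough to sketch. This is a minimal, illustrative version, not the actual pwned-passwords-sqlite-build.py: it assumes the dump's HASH:COUNT line format and packs each 40-character hex hash into a 20-byte blob.

import sqlite3

# Sketch of a counts-preserving build with binary keys (assumed details, not the real script).
conn = sqlite3.connect("pwned-passwords-by-hash-v4.sqlite")
conn.execute("PRAGMA journal_mode=memory")
conn.execute("CREATE TABLE hashes(hash BLOB PRIMARY KEY, count INTEGER) WITHOUT ROWID")

def rows(path):
    # Each line of the "ordered by hash" dump looks like UPPERHEX_SHA1:COUNT.
    with open(path, "r") as f:
        for line in f:
            h, _, count = line.rstrip("\n").partition(":")
            # 40 hex chars -> 20 raw bytes, halving the key size on disk.
            yield bytes.fromhex(h), int(count)

with conn:
    conn.executemany("INSERT INTO hashes VALUES (?, ?)",
                     rows("pwned-passwords-sha1-ordered-by-hash-v4.txt"))
conn.close()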
Last executed 2019-06-25 with the v4 dump:
- Make sure you have 60 GB free disk space and some extra to spare. Alternatively, take a walk on the wild side and delete source files as soon as you've used them.
- Download the SHA-1 (ordered by hash) torrent from https://haveibeenpwned.com/Passwords
- Unpack and strip off the counts:
7z x -so pwned-passwords-sha1-ordered-by-hash-v4.7z pwned-passwords-sha1-ordered-by-hash-v4.txt | sed 's/:.*//' > hashes.lst
- Load into sqlite:
$ sqlite3 pwned-passwords-by-hash-v4.sqlite
sqlite> pragma journal_mode=memory;
sqlite> CREATE TABLE hashes("hash" BLOB PRIMARY KEY) WITHOUT ROWID;
sqlite> .mode csv
sqlite> .import hashes.lst hashes
- Confirm that queries will use the primary key's index:
sqlite> EXPLAIN QUERY PLAN SELECT * FROM hashes WHERE hash = "D657187D9C9C1AD04FDA5132338D405FDB112FA1" LIMIT 1;
0|0|0|SEARCH TABLE hashes USING PRIMARY KEY (hash=?)
A sample query should return almost instantly:
sqlite> SELECT * FROM hashes WHERE hash = "D657187D9C9C1AD04FDA5132338D405FDB112FA1" LIMIT 1;
D657187D9C9C1AD04FDA5132338D405FDB112FA1
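From an application, the lookup is just a SHA-1 plus an indexed SELECT. A hypothetical helper, assuming the text-key schema built above (uppercase hex, as in the dump):

import hashlib
import sqlite3

conn = sqlite3.connect("pwned-passwords-by-hash-v4.sqlite")

def is_pwned(password):
    # The dump stores uppercase hex SHA-1 digests, so match that format exactly.
    digest = hashlib.sha1(password.encode("utf-8")).hexdigest().upper()
    cur = conn.execute("SELECT 1 FROM hashes WHERE hash = ? LIMIT 1", (digest,))
    return cur.fetchone() is not None

print(is_pwned("password"))  # True: almost certainly present in the dump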
A few notes on construction:
- The rollback journal is kept in memory (pragma journal_mode=memory) rather than on disk to speed up the initial import.
- Making the hash the primary key tells sqlite that the data is unique and ordered—and the primary key data is stored in a B-tree, not duplicated into an additional index.
- We can also avoid storing rowids; no idea how much space this saves us, but no harm here: https://www.sqlite.org/withoutrowid.html
2.24 GB, not bad. Some folks at Threat Stack used a Bloom filter with a 1 in 800 chance of false positives, which took up 1 GB of memory. A 1-in-10-million rate is more to my taste, but 2 GB sits in that borderline space where I'm not really sure whether I'd rather have it in memory or on disk.
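For context, the standard Bloom filter sizing formula, bits per element = -ln(p) / (ln 2)^2, roughly reproduces both numbers if you assume on the order of 550 million hashes in the v4 dump (my assumption, not a figure from the post):

from math import log

def bloom_bytes(n_items, p_false_positive):
    # Optimally sized Bloom filter: m = -n * ln(p) / (ln 2)^2 bits.
    bits = -n_items * log(p_false_positive) / (log(2) ** 2)
    return bits / 8

n = 550_000_000  # assumed approximate size of the v4 dump
print(bloom_bytes(n, 1 / 800) / 2**30)         # ~0.9 GiB, close to Threat Stack's 1 GB
print(bloom_bytes(n, 1 / 10_000_000) / 2**30)  # ~2.1 GiB, comparable to the SQLite file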
Curious that you found the Bloom filter to be slower than SQLite. Threat Stack found the same thing. I find that unexpected since SQLite probably has to go out to disk multiple times to search its index, and the Bloom filter just has to run a bunch of hashes and make random memory accesses.
There might be some differences in what's being measured. If I time the performance of actual user logins, many of those passwords are unfortunately going to be the same as each other, so some parts of the index (or the filter) will already be in the disk (or memory) cache. But if you benchmark with random strings, you'll get a lot of cache misses and the average time will be worse.
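One way to see the cache effect is to time the same lookup two ways: uniformly random hex strings versus a small set of keys queried over and over (a rough proxy for users sharing popular passwords). A hypothetical sketch against the SQLite file built above:

import random
import sqlite3
import time

conn = sqlite3.connect("pwned-passwords-by-hash-v4.sqlite")

def lookup(h):
    return conn.execute("SELECT 1 FROM hashes WHERE hash = ? LIMIT 1", (h,)).fetchone()

def avg_seconds(hashes):
    start = time.perf_counter()
    for h in hashes:
        lookup(h)
    return (time.perf_counter() - start) / len(hashes)

random_hashes = ["%040X" % random.getrandbits(160) for _ in range(10_000)]
repeated_hashes = random.sample(random_hashes, 100) * 100  # same keys repeatedly; pages stay cached

print("random:   %.3f ms/query" % (1000 * avg_seconds(random_hashes)))
print("repeated: %.3f ms/query" % (1000 * avg_seconds(repeated_hashes)))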