@marcan
Last active February 29, 2024 19:55
Simple Bloom filter implementation in Python 3 (for use with the HIBP password list)
#!/usr/bin/python3
#
# Simple Bloom filter implementation in Python 3
# Copyright 2017 Hector Martin "marcan" <marcan@marcan.st>
# Licensed under the terms of the MIT license
#
# Written to be used with the Have I been pwned? password list:
# https://haveibeenpwned.com/passwords
#
# Download the pre-computed filter here (968MB, k=11, false positive p=0.0005):
# https://mrcn.st/t/pwned-passwords-2.0.bloom
# Or if you're more paranoid (1.41GB, k=16, false positive p=0.000015):
# https://mrcn.st/t/pwned-passwords-2.0-k16.bloom
#
# Also works as a generic file-backed bloom filter for other purposes.
# Use something like this to work out what 'm' and 'k' you need:
# https://krisives.github.io/bloom-calculator/
# For bulk data loads you should put the bloom filter on a tmpfs/ramdisk,
# as loading directly onto a disk-backed filter is extremely slow.
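#
# For reference, the standard sizing formulas for a filter holding n items
# at a target false-positive rate p are:
#   m = -n * ln(p) / (ln 2)^2    (bits)
#   k = (m / n) * ln(2)          (hash functions)
# As a rough sanity check, ~3.2e8 entries (about the size of the v1.0 list)
# at p = 0.0005 give m ~= 5e9 and k ~= 11, matching the defaults below.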
#
# Examples:
# $ python bloom.py load -m 5033164800 -k 11 -l passwords.bloom pwned-passwords-1.0.txt
# $ python bloom.py load -l passwords.bloom pwned-passwords-update-1.txt
# $ python bloom.py test -s passwords.bloom letmein
# Found
# $ python bloom.py test -s passwords.bloom si2v8jX4LG
# Not found
#
# $ python3
# >>> from hashlib import sha1
# >>> from bloom import BloomFilter
# >>> filter = BloomFilter("pwned-passwords-1.0u1.bloom")
# >>> filter.contains(sha1(b"p4ssword").hexdigest())
# True
# >>> filter.contains(sha1(b"super_secure_password").hexdigest())
# False
import os, json, mmap
from hashlib import md5
class BloomFilter(object):
    ALIGN = 16384
    THRESHOLD_HEADROOM = 2**16  # for uniformity, rehash after fewer bits left
    def __init__(self, filename, m=None, k=None, readonly=False):
        self.bits = None
        if os.path.exists(filename):
            self.open(filename, readonly=readonly)
        elif readonly:
            raise IOError("File %s not found" % filename)
        elif m is None or k is None:
            raise ValueError("Filter does not exist and m/k not provided")
        else:
            self.create(filename, m, k)
    def open(self, filename, readonly=True):
        self.fd = open(filename, "rb" if readonly else "r+b")
        hdr = json.loads(self.fd.readline().decode("ascii"))
        self.m = hdr["m"]
        self.k = hdr["k"]
        self.size = hdr["size"]
        self.offset = hdr["offset"]
        self.threshold = hdr["threshold"]
        self.bits = mmap.mmap(self.fd.fileno(), self.size, offset=self.offset,
                              prot=(mmap.PROT_READ |
                                    (mmap.PROT_WRITE if not readonly else 0)))
    def create(self, filename, m, k):
        self.fd = open(filename, "w+b")
        # Round the bitmap size up to a whole number of ALIGN-byte blocks.
        size = (m + 7) // 8
        size = (size + self.ALIGN - 1) & ~(self.ALIGN - 1)
        self.m = m
        self.k = k
        self.size = size
        self.offset = self.ALIGN
        # Threshold for hash(): the next power of two >= m, times the
        # headroom factor. hash() rehashes once the remaining value of a
        # digest drops below this.
        self.threshold = 1
        while self.threshold < self.m:
            self.threshold *= 2
        self.threshold *= self.THRESHOLD_HEADROOM
        hdr = {
            "m": m,
            "k": k,
            "offset": self.ALIGN,
            "size": size,
            "threshold": self.threshold,
        }
        # JSON header on the first line, then the bit array at offset ALIGN;
        # seek-and-write a single byte to extend the file to its full size.
        self.fd.write(json.dumps(hdr).encode("ascii") + b"\n")
        self.fd.seek(size + self.ALIGN - 1)
        self.fd.write(b"\x00")
        self.fd.seek(self.ALIGN)
        self.bits = mmap.mmap(self.fd.fileno(), self.size, offset=self.offset)
    def hash(self, s):
        # Derive all k bit positions from a chain of MD5 digests: each digest
        # is taken as a 128-bit integer and split into base-m digits. Once the
        # remaining value is too small to yield another near-uniform digit
        # (capacity < threshold), the previous digest is hashed again.
        capacity = 0
        val = 0
        if isinstance(s, str):
            s = s.encode("utf-8")
        for i in range(self.k):
            if capacity < self.threshold:
                s = md5(s).digest()
                val = int.from_bytes(s, byteorder='big')
                capacity = 1 << 128
            h = val % self.m
            val //= self.m
            capacity //= self.m
            yield h
    def add(self, s):
        for h in self.hash(s):
            byte, bit = h >> 3, h & 7
            self.bits[byte] |= 1 << bit

    def update(self, iterable):
        for s in iterable:
            for h in self.hash(s):
                byte, bit = h >> 3, h & 7
                self.bits[byte] |= 1 << bit

    def contains(self, s):
        for h in self.hash(s):
            byte, bit = h >> 3, h & 7
            if not (self.bits[byte] & (1 << bit)):
                return False
        return True
    def sync(self):
        self.bits.flush()

    def __del__(self):
        if self.bits:
            self.bits.flush()
            self.bits.close()
            self.fd.close()
if __name__ == "__main__":
    import sys, argparse
    from hashlib import sha1

    def cmd_load(args):
        filt = BloomFilter(args.filter, m=args.bits, k=args.hashes)
        if args.sha1:
            if args.lower:
                filt.update(sha1(line.rstrip("\n").lower().encode("utf-8")).hexdigest()
                            for line in open(args.input))
            else:
                filt.update(sha1(line.rstrip("\n").encode("utf-8")).hexdigest()
                            for line in open(args.input))
        else:
            if args.lower:
                filt.update(line.rstrip("\n").lower().encode("utf-8")
                            for line in open(args.input))
            else:
                filt.update(line.rstrip("\n").encode("utf-8")
                            for line in open(args.input))
        filt.sync()

    def cmd_test(args):
        value = args.value.encode("utf-8")
        if args.sha1:
            value = sha1(value).hexdigest()
        filt = BloomFilter(args.filter, readonly=True)
        if filt.contains(value):
            print("Found")
        else:
            print("Not found")
    parser = argparse.ArgumentParser(prog=sys.argv[0], epilog="""
examples:
  %(prog)s load -m 5033164800 -k 11 -l passwords.bloom pwned-passwords-1.0.txt
  %(prog)s load -l passwords.bloom pwned-passwords-update-1.txt
  %(prog)s test -s passwords.bloom letmein
  %(prog)s test -s passwords.bloom si2v8jX4LG""",
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(dest="subcommand")

    p_load = subparsers.add_parser('load', help='create or add to a bloom filter')
    p_load.set_defaults(func=cmd_load)
    p_load.add_argument("-m", "--bits", type=int, default=5033164800, help="number of bits in the filter")
    p_load.add_argument("-k", "--hashes", type=int, default=11, help="number of hash functions to use")
    p_load.add_argument("-l", "--lower", action="store_true", help="lowercase input data")
    p_load.add_argument("-s", "--sha1", action="store_true", help="SHA-1 input data")
    p_load.add_argument("filter", type=str, help='filename of the filter')
    p_load.add_argument("input", type=str, help='file containing input data')

    p_test = subparsers.add_parser('test', help='check whether a given value matches the bloom filter')
    p_test.set_defaults(func=cmd_test)
    p_test.add_argument("-s", "--sha1", action="store_true", help="SHA-1 input data")
    p_test.add_argument("filter", type=str, help='filename of the filter')
    p_test.add_argument("value", type=str, help='value to look up in the filter')

    args = parser.parse_args()
    if args.subcommand is None:
        parser.print_help()
        sys.exit(0)
    args.func(args)
@modrobert commented Feb 14, 2023
When creating the Bloom filter from recent pwned password files, e.g. 'pwned-passwords-sha1-ordered-by-hash-v8.txt', the cmd_load function will load the wrong data, because each line now ends with a prevalence counter ("<hash>:<count>"). To fix this without touching the hash file, the code needs a small patch, for example:

Replace line.rstrip("\n").lower().encode("utf-8") with line.lower().encode("utf-8")[0:40].
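With that change, the relevant branch of cmd_load reads something like this (a sketch against the script above; the [0:40] slice keeps just the 40 lowercased hex digits of the SHA-1 hash and drops the ":count" suffix):

            if args.lower:
                filt.update(line.lower().encode("utf-8")[0:40]
                            for line in open(args.input))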

Then create the filter with, for example:

bloom.py load -m 19366012780 -k 16 -l passwords.bloom pwned-passwords-sha1-ordered-by-hash-v8.txt
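(For scale: assuming the v8 list's roughly 850 million hashes, 19366012780 bits is about 23 bits per entry, and the standard estimate p = (1 - e^(-k*n/m))^k with k = 16 comes out around 1.7e-5, in the same range as the pre-computed k=16 filter linked in the script header.)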

This simple bash script pwned_check.sh (for Linux) might be useful if you want to do a complete password check against the hash file (e.g. to compare with the bloom filter):

#!/bin/bash
# by modrobert in 2023

HASH_FILE="/tmp/pwned-passwords-sha1-ordered-by-hash-v8.txt"
PASSWORD="$*"

if [ $# -eq 0 ]; then
    echo "Usage: $(basename "$0") \"password\""
else
    HASH=$(echo -n "$PASSWORD" | sha1sum | awk -F ' ' '{print toupper($1)}')
    RESULT=$(grep -m 1 -F "$HASH" "$HASH_FILE" | tr -d '\r')
    if [ -n "$RESULT" ]; then
        echo "${RESULT}:${PASSWORD}"
    fi
fi

On a modern computer (in 2023) with an SSD, it takes roughly 20 seconds on average to search the 35 GB file with 'grep -F' (using 'rg -F', aka ripgrep, is slower for some reason in this case).
