Skip to content

Instantly share code, notes, and snippets.

@marcan marcan/bloom.py
Last active Oct 4, 2019

Embed
What would you like to do?
Simple Bloom filter implementation in Python 3 (for use with the HIBP password list)
#!/usr/bin/python3
#
# Simple Bloom filter implementation in Python 3
# Copyright 2017 Hector Martin "marcan" <marcan@marcan.st>
# Licensed under the terms of the MIT license
#
# Written to be used with the Have I been pwned? password list:
# https://haveibeenpwned.com/passwords
#
# Download the pre-computed filter here (968MB, k=11, false positive p=0.0005):
# https://mrcn.st/t/pwned-passwords-2.0.bloom
# Or if you're more paranoid (1.41GB, k=16, false positive p=0.000015):
# https://mrcn.st/t/pwned-passwords-2.0-k16.bloom
#
# Also works as a generic file-backed bloom filter for other purposes.
# Use something like this to work out what 'm' and 'k' you need:
# https://krisives.github.io/bloom-calculator/
# For bulk data loads you should put the bloom filter on a tmpfs/ramdisk,
# as loading directly onto a disk-backed filter is extremely slow.
#
# Examples:
# $ python bloom.py load -m 5033164800 -k 11 -l passwords.bloom pwned-passwords-1.0.txt
# $ python bloom.py load -l passwords.bloom pwned-passwords-update-1.txt
# $ python bloom.py test -s passwords.bloom letmein
# Found
# $ python bloom.py test -s passwords.bloom si2v8jX4LG
# Not found
#
# $ python3
# >>> from hashlib import sha1
# >>> from bloom import BloomFilter
# >>> filter = bloom.BloomFilter("pwned-passwords-1.0u1.bloom")
# >>> filter.contains(sha1(b"p4ssword").hexdigest())
# True
# >>> filter.contains(sha1(b"super_secure_password").hexdigest())
# False
import os, json, mmap
from hashlib import md5
class BloomFilter(object):
ALIGN = 16384
THRESHOLD_HEADROOM = 2**16 # for uniformity, rehash after fewer bits left
def __init__(self, filename, m=None, k=None, readonly=False):
self.bits = None
if os.path.exists(filename):
self.open(filename, readonly=readonly)
elif readonly:
raise IOError("File %s not found" % filename)
elif m is None or k is None:
raise ValueError("Filter does not exist and m/k not provided")
else:
self.create(filename, m, k)
def open(self, filename, readonly=True):
self.fd = open(filename, "rb" if readonly else "r+b")
hdr = json.loads(self.fd.readline().decode("ascii"))
self.m = hdr["m"]
self.k = hdr["k"]
self.size = hdr["size"]
self.offset = hdr["offset"]
self.threshold = hdr["threshold"]
self.bits = mmap.mmap(self.fd.fileno(), self.size, offset=self.offset,
prot=(mmap.PROT_READ |
(mmap.PROT_WRITE if not readonly else 0)))
def create(self, filename, m, k):
self.fd = open(filename, "w+b")
size = (m + 7) // 8
size = (size + self.ALIGN - 1) & ~(self.ALIGN - 1)
self.m = m
self.k = k
self.size = size
self.offset = self.ALIGN
self.threshold = 1
while self.threshold < self.m:
self.threshold *= 2
self.threshold *= self.THRESHOLD_HEADROOM
hdr = {
"m": m,
"k": k,
"offset": self.ALIGN,
"size": size,
"threshold": self.threshold,
}
self.fd.write(json.dumps(hdr).encode("ascii") + b"\n")
self.fd.seek(size + self.ALIGN - 1)
self.fd.write(b"\x00")
self.fd.seek(self.ALIGN)
self.bits = mmap.mmap(self.fd.fileno(), self.size, offset=self.offset)
def hash(self, s):
capacity = 0
val = 0
if isinstance(s, str):
s = s.encode("utf-8")
for i in range(self.k):
if capacity < self.threshold:
s = md5(s).digest()
val = int.from_bytes(s, byteorder='big')
capacity = 1 << 128
h = val % self.m
val //= self.m
capacity //= self.m
yield h
def add(self, s):
for h in self.hash(s):
byte, bit = h >> 3, h & 7
self.bits[byte] |= 1 << bit
def update(self, iterable):
for s in iterable:
for h in self.hash(s):
byte, bit = h >> 3, h & 7
self.bits[byte] |= 1 << bit
def contains(self, s):
for h in self.hash(s):
byte, bit = h >> 3, h & 7
if not (self.bits[byte] & (1 << bit)):
return False
return True
def sync(self):
self.bits.flush()
def __del__(self):
if self.bits:
self.bits.flush()
self.bits.close()
self.fd.close()
if __name__ == "__main__":
import sys, argparse
from hashlib import sha1
def cmd_load(args):
filt = BloomFilter(args.filter, m=args.bits, k=args.hashes)
if args.sha1:
if args.lower:
filt.update(sha1(line.rstrip("\n").lower().encode("utf-8")).hexdigest()
for line in open(args.input))
else:
filt.update(sha1(line.rstrip("\n").encode("utf-8")).hexdigest()
for line in open(args.input))
else:
if args.lower:
filt.update(line.rstrip("\n").lower().encode("utf-8")
for line in open(args.input))
else:
filt.update(line.rstrip("\n").encode("utf-8")
for line in open(args.input))
filt.sync()
def cmd_test(args):
value = args.value.encode("utf-8")
if args.sha1:
value = sha1(value).hexdigest()
filt = BloomFilter(args.filter, readonly=True)
if filt.contains(value):
print("Found")
else:
print("Not found")
parser = argparse.ArgumentParser(prog=sys.argv[0], epilog="""
examples:
%(prog)s load -m 5033164800 -k 11 -l passwords.bloom pwned-passwords-1.0.txt
%(prog)s load -l passwords.bloom pwned-passwords-update-1.txt
%(prog)s test -s passwords.bloom letmein
%(prog)s test -s passwords.bloom si2v8jX4LG""",
formatter_class=argparse.RawDescriptionHelpFormatter)
subparsers = parser.add_subparsers(dest="subcommand")
p_load = subparsers.add_parser('load', help='create or add to a bloom filter')
p_load.set_defaults(func=cmd_load)
p_load.add_argument("-m", "--bits", type=int, default=5033164800, help="number of bits in the filter")
p_load.add_argument("-k", "--hashes", type=int, default=11, help="number of hash functions to use")
p_load.add_argument("-l", "--lower", action="store_true", help="lowercase input data")
p_load.add_argument("-s", "--sha1", action="store_true", help="SHA-1 input data")
p_load.add_argument("filter", type=str, help='filename of the filter')
p_load.add_argument("input", type=str, help='file containing input data')
p_test = subparsers.add_parser('test', help='check whether a given value matches the bloom filter')
p_test.set_defaults(func=cmd_test)
p_test.add_argument("-s", "--sha1", action="store_true", help="SHA-1 input data")
p_test.add_argument("filter", type=str, help='filename of the filter')
p_test.add_argument("value", type=str, help='value to look up in the filter')
args = parser.parse_args()
if args.subcommand is None:
parser.print_help()
sys.exit(0)
args.func(args)
@tomviner

This comment has been minimized.

Copy link

commented Aug 5, 2017

Very cool. Noticed could the update method call the add method?

@marcan

This comment has been minimized.

Copy link
Owner Author

commented Aug 5, 2017

That's deliberate; that's a hot path so I tried to remove some levels of indirection. Possibly misguided premature optimization, since the bottleneck is probably the hash computation, but meh...

Currently, a k=16 filter with 320M entries (~924MB filter) takes 4 hours to build on my machine (on a ramdisk), which isn't terribly fast, but it's a one-time operation.

@pjsg

This comment has been minimized.

Copy link

commented Aug 28, 2017

When you created the https://mrcn.st/t/pwned-passwords-1.0u2.bloom file, did you use the --lower flag?

Actually, don't worry -- I see that you didn't. The hash for 'paSsword' is found in the filter.

@marcan

This comment has been minimized.

Copy link
Owner Author

commented Feb 22, 2018

I did, but that lowercases the hashes, since the original password list is a list of hashes. This is because python's hexdigest returns lowercase. That was actually the intent of the flag.

@ju916

This comment has been minimized.

Copy link

commented Jan 22, 2019

Great project. Thank you.

Just an idea:
How about an additional option that does an HIBP API lookup in case of a match to eliminate the risk of a false positive.
It would only send data to HIBP if you are resaonable sure, that your password is pwned anyway.

bye, ju

@lbreuss

This comment has been minimized.

Copy link

commented Jan 27, 2019

Great work.
Please enlarge ALIGN to 2**16. My Windows 7 x64 seems to align at 64k. (I managed to tweak the bloom file with HxD Hex Editor.)
And the mmap call should use the platform-independent access parameter.
See my proposals here: https://gist.github.com/lbreuss/912837a7d5b55befe7e1f4fc55ffbbf8/revisions?diff=split

@thomasmueller

This comment has been minimized.

Copy link

commented Feb 6, 2019

If you are interested, I could help you shrink the filter files around 20%. To do that, the Bloom algorithm would have to be replaced. Best would be Xor+ from my project https://github.com/FastFilter/fastfilter_cpp. If you are interested, I could work on a Python port for this. That would shrink the filter files (with the same false positive probability) to 757 MB / 1.09 GB, instead of 968 MB / 1.41 GB. Lookup speed would be about the same (maybe a bit faster); construction of the files would probably be slower (not sure how much). Let me know if you are interested.

Update: I wrote a separate tool now, see github.com/FastFilter/fastfilter_java. Creating the filter takes about 2-3 minutes, and the filter is ~630 MB.

@FelixJongleur42

This comment has been minimized.

Copy link

commented Mar 10, 2019

lbreuss wrote:

Great work.
Please enlarge ALIGN to 2**16. My Windows 7 x64 seems to align at 64k. (I managed to tweak the bloom file with HxD Hex Editor.)
And the mmap call should use the platform-independent access parameter.
See my proposals here: https://gist.github.com/lbreuss/912837a7d5b55befe7e1f4fc55ffbbf8/revisions?diff=split

Thanks for your input for porting onto Windows. I stumbled upon the same problems until I noticed your comment.
But what exactly did you do to tweak the bloom file? change "k" from 16 to 64?
I tried to tweak the file and I get errors like:
OSError: [WinError 1132] "wrong offset" (translated from German)
OSError: [WinError 8] "not enough resources" (translated from German)

@dadosch

This comment has been minimized.

Copy link

commented Mar 10, 2019

@marcan Could you put the pwned-passwords-2.0.bloom on github? On your server I can only download it with 80 KB/s which will take approx. 4 hours.

@lbreuss

This comment has been minimized.

Copy link

commented Mar 20, 2019

@FelixJongleur42 You'll have to understand the file format, take a look at the code. 1. Shift the filter data at 16kB to 64kB by inserting 48kB of zeroes. 2. Modify the offset in the file. 3. patch the script according to my proposal. Good luck.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.