Skip to content

Instantly share code, notes, and snippets.

@CAFxX
Last active October 1, 2019 02:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save CAFxX/592890e94dbf4e3ba40ea89ec0ee9808 to your computer and use it in GitHub Desktop.
Save CAFxX/592890e94dbf4e3ba40ea89ec0ee9808 to your computer and use it in GitHub Desktop.
quick and dirty bloom filter service for querying the haveibeenpwned dataset
package main
import (
"bufio"
"flag"
"fmt"
"net/http"
"os"
"regexp"
"strings"
"github.com/pkg/errors"
"github.com/willf/bloom"
)
/*
To build the bloom filter download and uncompress the SHA-1 haveibeenpwned
archive, then run `bloom -src haveibeenpwned.txt`. After a few minutes a file
called pwd.bloom will be created. In the default configuration, the bloom file
should be around 1.5GB.
You can then run ./bloom to start an HTTP server on :8080 and check whether
a password exists in the dataset by sending a request to
`localhost:8080/?p=<SHA-1 of password>`. If it returns 200, the password is in the
HIBP dataset. If it returns 404, it is not. If `p` is not a 40-character
hexadecimal string, it returns 400.
*/
// Command-line flags. When -src is non-empty the program builds the bloom
// file; otherwise it serves queries from the file named by -bloom.
var (
	// bloomFile is the bloom filter file: written in build mode, read in
	// serve mode.
	bloomFile = flag.String("bloom", "pwd.bloom", "Bloom file")

	// fpProb is the false positive probability passed to the bloom filter
	// constructor when building.
	fpProb = flag.Float64("fpprob", 0.00001, "False positive probability")

	// srcFile is the source file to build the bloom filter from; empty
	// selects serve mode.
	srcFile = flag.String("src", "", "Source file (haveibeenpwnd)")
)
// main parses the command-line flags and then either builds the bloom file
// (when -src is set) or serves queries from an existing bloom file.
//
// Any error is written to standard error and the process exits with status 1,
// so that callers (shells, supervisors) can detect the failure.
func main() {
	flag.Parse()

	var err error
	if *srcFile != "" {
		err = buildBloom(*srcFile, *fpProb, *bloomFile)
	} else {
		err = serveBloom(*bloomFile)
	}
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
// buildBloom builds a bloom filter from the lines of srcFile and writes it to
// bloomFile.
//
// The source file is read twice: a first pass counts the lines so the filter
// can be sized for that many entries at false positive probability fpProb,
// and a second pass adds the first 40 bytes of every line (the length of a
// hex-encoded SHA-1 digest) to the filter.
//
// It returns an error if either file cannot be opened, read, written or
// closed, if a line is shorter than 40 bytes, or if the two passes see a
// different number of lines.
func buildBloom(srcFile string, fpProb float64, bloomFile string) (err error) {
	const hashLen = 40 // bytes taken from the start of each line

	file, err := os.Open(srcFile)
	if err != nil {
		return errors.Wrap(err, "opening source file")
	}
	// The source is only read, so a close error is not actionable.
	defer file.Close()

	// Create the output up front so an unwritable destination fails before
	// the (long) scan of the source file.
	out, err := os.Create(bloomFile)
	if err != nil {
		return errors.Wrap(err, "creating bloom file")
	}
	defer func() {
		// A failed close can mean the data never reached disk; report it
		// unless an earlier error is already being returned.
		if cerr := out.Close(); cerr != nil && err == nil {
			err = errors.Wrap(cerr, "closing bloom file")
		}
	}()

	// First pass: count lines to size the filter.
	fmt.Printf("scanning source file %q\n", srcFile)
	n := uint(0)
	s := bufio.NewScanner(file)
	for s.Scan() {
		n++
	}
	if err := s.Err(); err != nil {
		return errors.Wrap(err, "scanning source file")
	}

	b := bloom.NewWithEstimates(n, fpProb)
	fmt.Printf("bloom filter n=%d, p=%f, m=%d, K=%d\n", n, fpProb, b.Cap(), b.K())

	// Rewind for the second pass. Whence 0 means "relative to the start of the
	// file"; the literal replaces the deprecated os.SEEK_SET.
	if _, err := file.Seek(0, 0); err != nil {
		return errors.Wrap(err, "seeking in source file")
	}

	// Second pass: add the leading hash of every line to the filter.
	s = bufio.NewScanner(file)
	i := uint(0)
	for s.Scan() {
		line := s.Bytes()
		if len(line) < hashLen {
			// Slicing a short line would panic; report where it is instead.
			return errors.Errorf("source file line %d: expected at least %d bytes, got %d", i+1, hashLen, len(line))
		}
		b.Add(line[:hashLen])
		if i%1000000 == 0 {
			fmt.Printf("populating bloom filter %d/%d\r", i, n)
		}
		i++
	}
	fmt.Printf("populated bloom filter %d/%d \n", i, n)
	if err := s.Err(); err != nil {
		return errors.Wrap(err, "reading source file")
	}
	if i != n {
		return errors.Errorf("line count mismatch: %d/%d", i, n)
	}

	fmt.Printf("writing bloom filter file %q\n", bloomFile)
	if _, err := b.WriteTo(out); err != nil {
		return errors.Wrap(err, "writing bloom file")
	}
	return nil
}
// serveBloom loads the bloom filter stored in bloomFile and serves membership
// queries over HTTP on :8080.
//
// Each request must carry the value to test in the "p" query parameter. The
// value is upper-cased and must then be exactly 40 hexadecimal characters,
// otherwise the response is 400. The response is 200 when the filter reports
// the value as present and 404 when it does not.
//
// It returns an error if the bloom file cannot be opened or read, or when the
// HTTP server stops.
func serveBloom(bloomFile string) error {
	file, err := os.Open(bloomFile)
	if err != nil {
		return errors.Wrap(err, "opening bloom file")
	}

	fmt.Printf("reading bloom file %q\n", bloomFile)
	b := &bloom.BloomFilter{}
	_, err = b.ReadFrom(file)
	// The file is not used after this point; close it now rather than holding
	// the descriptor open for the lifetime of the server. It was only read, so
	// a close error is not actionable.
	file.Close()
	if err != nil {
		return errors.Wrap(err, "reading bloom file")
	}

	sha1re := regexp.MustCompile("^[0-9A-F]{40}$")

	fmt.Println("starting http server on :8080")
	err = http.ListenAndServe(":8080", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Upper-case the input so lowercase hex digests are accepted too.
		p := strings.ToUpper(r.URL.Query().Get("p"))
		switch {
		case !sha1re.MatchString(p):
			w.WriteHeader(http.StatusBadRequest)
		case b.TestString(p):
			w.WriteHeader(http.StatusOK)
		default:
			w.WriteHeader(http.StatusNotFound)
		}
	}))
	return errors.Wrap(err, "http server")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment