Last active
October 1, 2019 02:19
-
-
Save CAFxX/592890e94dbf4e3ba40ea89ec0ee9808 to your computer and use it in GitHub Desktop.
quick and dirty bloom filter service for querying the haveibeenpwned dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"flag" | |
"fmt" | |
"net/http" | |
"os" | |
"regexp" | |
"strings" | |
"github.com/pkg/errors" | |
"github.com/willf/bloom" | |
) | |
/* | |
To build the bloom filter download and uncompress the SHA-1 haveibeenpwned | |
archive, then run `bloom -src haveibeenpwned.txt`. After a few minutes a file | |
called pwd.bloom will be created. In the default configuration, the bloom file | |
should be around 1.5GB. | |
You can then run ./bloom to start an HTTP server on :8080 and you can check if
a password exists in the dataset by sending a request to | |
`localhost:8080?p=<SHA-1 of password>`. If it returns 200, the password is in the | |
HIBP dataset. If it returns 404, it is not. | |
*/ | |
// Command-line flags. They are read in main after flag.Parse.
var (
	// srcFile is the path of the source dataset. When it is non-empty the
	// program builds the bloom file instead of serving queries.
	srcFile = flag.String("src", "", "Source file (haveibeenpwned)")
	// fpProb is the false positive probability the bloom filter is sized for.
	fpProb = flag.Float64("fpprob", 0.00001, "False positive probability")
	// bloomFile is the path of the bloom filter file: written in build mode,
	// read in serve mode.
	bloomFile = flag.String("bloom", "pwd.bloom", "Bloom file")
)
func main() { | |
flag.Parse() | |
if *srcFile != "" { | |
err := buildBloom(*srcFile, *fpProb, *bloomFile) | |
if err != nil { | |
fmt.Println(err) | |
} | |
} else { | |
err := serveBloom(*bloomFile) | |
if err != nil { | |
fmt.Println(err) | |
} | |
} | |
} | |
func buildBloom(srcFile string, fpProb float64, bloomFile string) error { | |
file, err := os.Open(srcFile) | |
if err != nil { | |
return errors.Wrap(err, "opening source file") | |
} | |
out, err := os.Create(bloomFile) | |
if err != nil { | |
return errors.Wrap(err, "creating bloom file") | |
} | |
fmt.Printf("scanning source file %q\n", srcFile) | |
n := uint(0) | |
s := bufio.NewScanner(file) | |
for s.Scan() { | |
n++ | |
} | |
if err := s.Err(); err != nil { | |
return errors.Wrap(err, "scanning source file") | |
} | |
b := bloom.NewWithEstimates(n, fpProb) | |
fmt.Printf("bloom filter n=%d, p=%f, m=%d, K=%d\n", n, fpProb, b.Cap(), b.K()) | |
_, err = file.Seek(0, os.SEEK_SET) | |
if err != nil { | |
return errors.Wrap(err, "seeking in source file") | |
} | |
s = bufio.NewScanner(file) | |
i := uint(0) | |
for s.Scan() { | |
b.Add(s.Bytes()[0:40]) | |
if i%1000000 == 0 { | |
fmt.Printf("populating bloom filter %d/%d\r", i, n) | |
} | |
i++ | |
} | |
fmt.Printf("populated bloom filter %d/%d \n", i, n) | |
if err := s.Err(); err != nil { | |
return errors.Wrap(err, "reading source file") | |
} | |
if i != n { | |
return errors.Errorf("line count mismatch: %d/%d", i, n) | |
} | |
fmt.Printf("writing bloom filter file %q\n", bloomFile) | |
if _, err := b.WriteTo(out); err != nil { | |
return errors.Wrap(err, "writing bloom file") | |
} | |
return nil | |
} | |
func serveBloom(bloomFile string) error { | |
file, err := os.Open(bloomFile) | |
if err != nil { | |
return errors.Wrapf(err, "opening bloom file") | |
} | |
fmt.Printf("reading bloom file %q\n", bloomFile) | |
b := &bloom.BloomFilter{} | |
_, err = b.ReadFrom(file) | |
if err != nil { | |
return errors.Wrap(err, "reading bloom file") | |
} | |
sha1re := regexp.MustCompile("^[0-9A-F]{40}$") | |
fmt.Println("starting http server on :8080") | |
err = http.ListenAndServe(":8080", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { | |
p := r.URL.Query().Get("p") | |
p = strings.ToUpper(p) | |
if !sha1re.MatchString(p) { | |
w.WriteHeader(http.StatusBadRequest) | |
return | |
} | |
if b.TestString(p) { | |
w.WriteHeader(http.StatusOK) | |
} else { | |
w.WriteHeader(http.StatusNotFound) | |
} | |
})) | |
return errors.Wrap(err, "http server") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment