Created
December 7, 2018 09:50
-
-
Save aherrmann/3fee352d1d5f82cb5c0ce18d597fd43d to your computer and use it in GitHub Desktop.
Test Bazel's hash function for hash collisions on artificially inflated set of Stackage package names that only differ by case.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# This script tests Bazel's builtin hash function to determine if it is | |
# sufficient to disambiguate Stackage package names that only differ by case | |
# on a case insensitive file system. | |
# Fetch the lts-12.4 snapshot and extract the package names into a Bazel readable format. | |
wget https://raw.githubusercontent.com/fpco/lts-haskell/master/lts-12.4.yaml | |
yaml2json < lts-12.4.yaml | jq -r '.packages|keys|.[]' > lts-12.4-package-names | |
# Multiply the number of package names by a hundred, choose random | |
# capitalization for each and deduplicate the resulting list of names. The | |
# output is a file containing a large number of package names that only differ | |
# by case. | |
cat >script.py <<EOF | |
import random | |
import sys | |
BLOAT = 99 | |
def random_case(s): | |
out = "" | |
for c in s: | |
out += random.choice([c.lower(), c.upper()]) | |
return out | |
def print_item(s): | |
print(' "%s",'%s) | |
for line in sys.stdin.readlines(): | |
line = line.rstrip() | |
print_item(line) | |
for i in range(BLOAT): | |
print_item(random_case(line)) | |
EOF | |
cat lts-12.4-package-names | python3 script.py | sort -u | cat <(echo "names = [") - <(echo "]") > random-names.bzl | |
# Setup a Bazel workspace to test the effect of Bazel's hash function. | |
# Read the list of randomly capitalized package names and mangle each name into | |
# a case insensitive format: <lower case name>-<hash of name>. | |
touch WORKSPACE | |
cat > hash.bzl <<EOF | |
load("//:random-names.bzl", "names") | |
def _hash_names_impl(ctx): | |
hashed = [ | |
"{}-{}".format(n.lower(), hash(n)) | |
for n in ctx.attr.names | |
] | |
out = ctx.actions.declare_file("hashed_names.txt") | |
ctx.actions.write(out, "\n".join(hashed)) | |
return [DefaultInfo(files = depset([out]))] | |
hash_names = rule( | |
implementation = _hash_names_impl, | |
attrs = { | |
"names": attr.string_list( | |
default = names, | |
), | |
}, | |
) | |
EOF | |
cat > BUILD <<EOF | |
load("//:hash.bzl", "hash_names") | |
hash_names( | |
name = "hashed_names", | |
) | |
EOF | |
bazel build //:hashed_names | |
# Compare the number of randomly capitalized package names to the number of | |
# distinct case insensitive package names. If these numbers don't match, then | |
# Bazel's hash function produces a hash collision on the input set. | |
num_in=$(wc -l < random-names.bzl) | |
num_out=$(sort -u bazel-bin/hashed_names.txt | wc -l) | |
# The input file contains two extra lines for the Bazel list delimiters. | |
if [[ $((num_in - 2)) -eq $num_out ]]; then echo "all good"; else "hash collision"; return 1; fi |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment