aherrmann/bazel-hash-test.sh

## bazel-hash-test.sh
#!/usr/bin/env bash

# This script tests Bazel's builtin hash function to determine if it is
# sufficient to disambiguate Stackage package names that only differ by case
# on a case insensitive file system.

# Fetch the lts-12.4 snapshot and extract the package names into a Bazel readable format.
wget https://raw.githubusercontent.com/fpco/lts-haskell/master/lts-12.4.yaml
yaml2json < lts-12.4.yaml | jq -r '.packages|keys|.[]' > lts-12.4-package-names

# Multiply the number of package names by a hundred, choose random
# capitalization for each and deduplicate the resulting list of names. The
# output is a file containing a large number of package names that only differ
# by case.
cat >script.py <<EOF
import random
import sys
BLOAT = 99
def random_case(s):
    out = ""
    for c in s:
        out += random.choice([c.lower(), c.upper()])
    return out
def print_item(s):
    print('  "%s",'%s)
for line in sys.stdin.readlines():
    line = line.rstrip()
    print_item(line)
    for i in range(BLOAT):
        print_item(random_case(line))
EOF

cat lts-12.4-package-names | python3 script.py | sort -u | cat <(echo "names = [") - <(echo "]") > random-names.bzl

# Setup a Bazel workspace to test the effect of Bazel's hash function.
# Read the list of randomly capitalized package names and mangle each name into
# a case insensitive format: <lower case name>-<hash of name>.

touch WORKSPACE

cat > hash.bzl <<EOF
load("//:random-names.bzl", "names")
def _hash_names_impl(ctx):
    hashed = [
        "{}-{}".format(n.lower(), hash(n))
        for n in ctx.attr.names
    ]
    out = ctx.actions.declare_file("hashed_names.txt")
    ctx.actions.write(out, "\n".join(hashed))
    return [DefaultInfo(files = depset([out]))]
hash_names = rule(
    implementation = _hash_names_impl,
    attrs = {
        "names": attr.string_list(
            default = names,
        ),
    },
)
EOF

cat > BUILD <<EOF
load("//:hash.bzl", "hash_names")
hash_names(
    name = "hashed_names",
)
EOF

bazel build //:hashed_names

# Compare the number of randomly capitalized package names to the number of
# distinct case insensitive package names. If these numbers don't match, then
# Bazel's hash function produces a hash collision on the input set.

num_in=$(wc -l < random-names.bzl)
num_out=$(sort -u bazel-bin/hashed_names.txt | wc -l)
# The input file contains two extra lines for the Bazel list delimiters.
if [[ $((num_in - 2)) -eq $num_out ]]; then echo "all good"; else "hash collision"; return 1; fi
	#!/usr/bin/env bash

	# This script tests Bazel's builtin hash function to determine if it is
	# sufficient to disambiguate Stackage package names that only differ by case
	# on a case insensitive file system.

	# Fetch the lts-12.4 snapshot and extract the package names into a Bazel readable format.
	wget https://raw.githubusercontent.com/fpco/lts-haskell/master/lts-12.4.yaml
	yaml2json < lts-12.4.yaml \| jq -r '.packages\|keys\|.[]' > lts-12.4-package-names

	# Multiply the number of package names by a hundred, choose random
	# capitalization for each and deduplicate the resulting list of names. The
	# output is a file containing a large number of package names that only differ
	# by case.
	cat >script.py <<EOF
	import random
	import sys
	BLOAT = 99
	def random_case(s):
	out = ""
	for c in s:
	out += random.choice([c.lower(), c.upper()])
	return out
	def print_item(s):
	print(' "%s",'%s)
	for line in sys.stdin.readlines():
	line = line.rstrip()
	print_item(line)
	for i in range(BLOAT):
	print_item(random_case(line))
	EOF

	cat lts-12.4-package-names \| python3 script.py \| sort -u \| cat <(echo "names = [") - <(echo "]") > random-names.bzl

	# Setup a Bazel workspace to test the effect of Bazel's hash function.
	# Read the list of randomly capitalized package names and mangle each name into
	# a case insensitive format: <lower case name>-<hash of name>.

	touch WORKSPACE

	cat > hash.bzl <<EOF
	load("//:random-names.bzl", "names")
	def _hash_names_impl(ctx):
	hashed = [
	"{}-{}".format(n.lower(), hash(n))
	for n in ctx.attr.names
	]
	out = ctx.actions.declare_file("hashed_names.txt")
	ctx.actions.write(out, "\n".join(hashed))
	return [DefaultInfo(files = depset([out]))]
	hash_names = rule(
	implementation = _hash_names_impl,
	attrs = {
	"names": attr.string_list(
	default = names,
	),
	},
	)
	EOF

	cat > BUILD <<EOF
	load("//:hash.bzl", "hash_names")
	hash_names(
	name = "hashed_names",
	)
	EOF

	bazel build //:hashed_names

	# Compare the number of randomly capitalized package names to the number of
	# distinct case insensitive package names. If these numbers don't match, then
	# Bazel's hash function produces a hash collision on the input set.

	num_in=$(wc -l < random-names.bzl)
	num_out=$(sort -u bazel-bin/hashed_names.txt \| wc -l)
	# The input file contains two extra lines for the Bazel list delimiters.
	if [[ $((num_in - 2)) -eq $num_out ]]; then echo "all good"; else "hash collision"; return 1; fi