Skip to content

Instantly share code, notes, and snippets.

@DanielThomas
Last active March 26, 2024 03:37
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DanielThomas/84ad3db09eef27af85e6bc95653e553f to your computer and use it in GitHub Desktop.
Save DanielThomas/84ad3db09eef27af85e6bc95653e553f to your computer and use it in GitHub Desktop.
A hash index for primitive numeric values in Java using a twin prime, double hashed hash table
/*
* Copyright 2024 Netflix Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netflix.index;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.LongFunction;
import java.util.function.LongPredicate;
/**
* A hash index of numeric keys and values akin to a database hash index from a key to a row number.
*
* <p> Unlike a {@link java.util.Map} multiple values for a given key are supported. They are collisions however, so have
* {@code O(n)} complexity for each additional duplicate, without considering collisions with other keys. The index does
* not store keys, the caller decides if the expected value was found for a given key with a predicate. For that reason,
* the index can not expand as filled, and insertions will degrade to traversing the entire backing array when filled.
* Care should be taken to size the index to accomodate all expected entries, including duplicates.
*
* <p> The number of bytes to be used to store the value can be specified, and can be any numeric value able to be widened
* to {@code long} supported. The largest positive value for the given byte length is reserved for internal use. Generics
* for the value type and specialized overloads for smaller value types are avoided, so in some cases the caller may have
* to narrow an argument. In those cases there is no risk of data loss as the length invariant is guaranteed at insertion
* time. This allows boxing to be avoided, while we wait out Project Valhalla.
*
* @param <T> the external type being indexed
*/
public final class HashIndex<T> {
private static final float DEFAULT_LOAD_FACTOR = 0.5f;
private static final long UNUSED = 0;
private static final List<Integer> PERMITTED_SIZES = Arrays.asList(Byte.BYTES, Short.BYTES, Integer.BYTES, Long.BYTES);
private final ByteBuffer entries;
private final int capacity;
private final int entryBytes;
/**
* Construct a new {@link HashIndex} sized to the number of entries.
*
* @param numEntries the number of expected mappings
*/
public HashIndex(int numEntries) {
this(numEntries, Byte.BYTES);
}
/**
* Construct a new {@link HashIndex} sized to the given number of entries.
*
* @param numEntries the number of entries
*/
public HashIndex(int numEntries, int entryBytes) {
this(numEntries, entryBytes, DEFAULT_LOAD_FACTOR);
}
/**
* Construct a new {@link HashIndex} sized to the given number of entries, with a {@link Byte#BYTES} entry size,
* and load factor.
*
* @param numEntries the number of expected mappings
* @param entryBytes the number of entries
* @param loadFactor the load factor
*/
public HashIndex(int numEntries, int entryBytes, float loadFactor) {
if (entryBytes > Long.BYTES) {
throw new IllegalArgumentException("maximum alignment size is " + Long.BYTES + " bytes");
}
if (!PERMITTED_SIZES.contains(entryBytes)) {
throw new IllegalArgumentException("entry size must be one of: " + PERMITTED_SIZES);
}
this.entryBytes = entryBytes;
this.capacity = twinnedPrimeTableSizeFor(numEntries, loadFactor);
long bufferCapacity = capacity * (long) entryBytes;
if (bufferCapacity > Integer.MAX_VALUE) {
throw new IllegalArgumentException("capacity exceeds array limit");
}
entries = ByteBuffer.allocate((int) bufferCapacity);
}
/**
* Returns the value that a given key maps to, using a {@link LongFunction} to map the return type to an
* {@link Optional} of {@link T}. A present {@link Optional} indicates a match, causing it to be returned. Returning
* {@link Optional#empty()} indicates the value was not a match, and the operation should continue until known to be
* absent.
*
* @param key the key to find a value for
* @param valueMapper a mapper from a value to a return type of {@link T}
* @return the optional result
*/
public Optional<T> get(long key, LongFunction<Optional<T>> valueMapper) {
AtomicReference<Optional<T>> reference = new AtomicReference<>(Optional.empty());
OptionalNumber value = getValue(key, candidate -> {
Optional<T> result = valueMapper.apply(candidate);
if (result.isPresent()) {
reference.set(result);
}
return result.isPresent();
});
return value.isPresent ? reference.get() : Optional.empty();
}
/**
* Returns the value that the given key is mapped to, with equality determined by {@link LongPredicate}. The return
* value is an {@link OptionalNumber} rather than {@link LongPredicate} to avoid accidental narrowing of the return
* value by the caller.
*
* @param key the key
* @param valueEquals a predicate to test if a value is considered equal for the given key
* @return the optional result
*/
public OptionalNumber getValue(long key, LongPredicate valueEquals) {
int idx = probe(key, valueEquals);
if (idx >= 0) {
long value = get(idx * entryBytes);
value = value > 0 ? value - 1 : value;
return OptionalNumber.of(value);
}
return OptionalNumber.empty();
}
/**
* Insert a value into the index for a given key.
*
* @param key the key to be hashed for index insertion
* @param value the value to be inserted
* @throws IndexOutOfBoundsException if the backing array is full
*/
public void put(long key, long value) {
int entryBits = entryBytes * Byte.SIZE;
if (Long.numberOfTrailingZeros(~value) + 1 == entryBits) {
throw new IllegalArgumentException(value + " is reserved for " + entryBytes + " byte entries");
}
checkNarrowingNotLossy(value, entryBits);
long storedValue = value >= 0 ? value + 1 : value;
int idx = probe(key, existing -> existing == storedValue);
if (idx == -1) {
throw new IndexOutOfBoundsException("the table is full");
} else if (idx < 0) {
idx = -(idx + 2);
int index = idx * entryBytes;
switch (entryBits) {
case Byte.SIZE: {
entries.put(index, (byte) storedValue);
break;
}
case Short.SIZE: {
entries.putShort(index, (short) storedValue);
break;
}
case Integer.SIZE: {
entries.putInt(index, (int) storedValue);
break;
}
case Long.SIZE: {
entries.putLong(index, storedValue);
break;
}
default: {
throw new IllegalArgumentException();
}
}
}
}
private long get(int i) {
int entryBits = entryBytes * Byte.SIZE;
long value;
switch (entryBits) {
case Byte.SIZE: {
value = entries.get(i);
break;
}
case Short.SIZE: {
value = entries.getShort(i);
break;
}
case Integer.SIZE: {
value = entries.getInt(i);
break;
}
case Long.SIZE: {
value = entries.getLong(i);
break;
}
default:
throw new IllegalStateException();
}
return value;
}
// Twin prime double hashing as suggested by Knuth. See Data Structures and Other Objects Using C++ (4th Edition), Page 614
private int probe(long key, LongPredicate valueEquals) {
int idx = index(key);
int attempts = 0;
while (attempts < capacity) {
long value = get(idx * entryBytes);
if (value == UNUSED) {
return -idx - 2;
} else if (valueEquals.test(value > 0 ? value - 1 : value)) {
return idx;
}
idx = Math.floorMod(idx + offset(key), capacity);
attempts++;
}
return -1;
}
private int index(long key) {
return Math.floorMod(Long.hashCode(key), capacity);
}
private int offset(long key) {
return 1 + (Math.floorMod(Long.hashCode(key), (capacity - 2)));
}
private int twinnedPrimeTableSizeFor(int numEntries, float loadFactor) {
int size = (int) Math.ceil(numEntries / (double) loadFactor);
BigInteger twin = BigInteger.valueOf(size).nextProbablePrime();
BigInteger candidate = twin.nextProbablePrime();
while (candidate.subtract(twin).intValue() != 2
|| !(isPrime(twin.intValue()) && isPrime(candidate.intValue()))) {
twin = candidate;
candidate = twin.nextProbablePrime();
}
return candidate.intValue();
}
private static boolean isPrime(int candidate) {
if ((candidate & 1) != 0) {
int limit = (int) Math.sqrt(candidate);
for (int divisor = 3; divisor <= limit; divisor += 2) {
if ((candidate % divisor) == 0) {
return false;
}
}
return true;
}
return candidate == 2;
}
private static void checkNarrowingNotLossy(long value, int entryBits) {
if (entryBits == Long.SIZE) {
return;
}
long minValue = -(1L << entryBits - 1);
long maxValue = ~minValue;
if (value < minValue || value > maxValue) {
throw new IllegalArgumentException("value is larger than " + entryBits + " bits");
}
}
/**
* An equivalent to {@link java.util.OptionalLong}, but ensures that data isn't lost when values are narrowed.
*/
public static final class OptionalNumber {
private static final OptionalNumber EMPTY = new OptionalNumber();
private final boolean isPresent;
private final long value;
private OptionalNumber() {
this.isPresent = false;
this.value = 0;
}
public static OptionalNumber empty() {
return EMPTY;
}
private OptionalNumber(long value) {
this.isPresent = true;
this.value = value;
}
public static OptionalNumber of(long value) {
return new OptionalNumber(value);
}
/**
* Returns if a value is present.
*
* @return {@code true} if a value is present
*/
public boolean isPresent() {
return isPresent;
}
/**
* Get the returned value if present as a {@code byte}.
*
* @return the value if present
* @throws NoSuchElementException if no value is present
*/
public byte getAsByte() {
checkNarrowingNotLossy(value, Byte.SIZE);
return (byte) get();
}
/**
* Get the returned value if present as a {@code short}.
*
* @return the value if present
* @throws NoSuchElementException if no value is present
*/
public short getAsShort() {
checkNarrowingNotLossy(value, Short.SIZE);
return (short) get();
}
/**
* Get the returned value if present as a {@code int}.
*
* @return the value if present
* @throws NoSuchElementException if no value is present
*/
public int getAsInt() {
checkNarrowingNotLossy(value, Integer.SIZE);
return (int) get();
}
/**
* Get the returned value if present.
*
* @return the value if present
* @throws NoSuchElementException if no value is present
*/
public long get() {
if (!isPresent) {
throw new NoSuchElementException("No value present");
}
return value;
}
}
}
@DanielThomas
Copy link
Author

DanielThomas commented Mar 25, 2024

I couldn't find any prior art for a hash index that only stores values and used external equality. The default load factor of 0.5 provides excellent performance, but 0.75 is a fine tradeoff for space, but will degrade significantly as you approach 1.0. If you're only storing positive values, you can double your value space by subtracting MIN_VALUE for the type from your value.

Tests for the class are available in a separate Gist.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment