Last active
March 26, 2024 03:37
-
-
Save DanielThomas/84ad3db09eef27af85e6bc95653e553f to your computer and use it in GitHub Desktop.
A hash index for primitive numeric values in Java using a twin prime, double hashed hash table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright 2024 Netflix Inc. | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package com.netflix.index; | |
import java.math.BigInteger; | |
import java.nio.ByteBuffer; | |
import java.util.Arrays; | |
import java.util.List; | |
import java.util.NoSuchElementException; | |
import java.util.Optional; | |
import java.util.concurrent.atomic.AtomicReference; | |
import java.util.function.LongFunction; | |
import java.util.function.LongPredicate; | |
/** | |
* A hash index of numeric keys and values akin to a database hash index from a key to a row number. | |
* | |
* <p> Unlike a {@link java.util.Map} multiple values for a given key are supported. They are collisions however, so have | |
* {@code O(n)} complexity for each additional duplicate, without considering collisions with other keys. The index does | |
* not store keys, the caller decides if the expected value was found for a given key with a predicate. For that reason, | |
* the index can not expand as filled, and insertions will degrade to traversing the entire backing array when filled. | |
* Care should be taken to size the index to accomodate all expected entries, including duplicates. | |
* | |
* <p> The number of bytes to be used to store the value can be specified, and can be any numeric value able to be widened | |
* to {@code long} supported. The largest positive value for the given byte length is reserved for internal use. Generics | |
* for the value type and specialized overloads for smaller value types are avoided, so in some cases the caller may have | |
* to narrow an argument. In those cases there is no risk of data loss as the length invariant is guaranteed at insertion | |
* time. This allows boxing to be avoided, while we wait out Project Valhalla. | |
* | |
* @param <T> the external type being indexed | |
*/ | |
public final class HashIndex<T> { | |
private static final float DEFAULT_LOAD_FACTOR = 0.5f; | |
private static final long UNUSED = 0; | |
private static final List<Integer> PERMITTED_SIZES = Arrays.asList(Byte.BYTES, Short.BYTES, Integer.BYTES, Long.BYTES); | |
private final ByteBuffer entries; | |
private final int capacity; | |
private final int entryBytes; | |
/** | |
* Construct a new {@link HashIndex} sized to the number of entries. | |
* | |
* @param numEntries the number of expected mappings | |
*/ | |
public HashIndex(int numEntries) { | |
this(numEntries, Byte.BYTES); | |
} | |
/** | |
* Construct a new {@link HashIndex} sized to the given number of entries. | |
* | |
* @param numEntries the number of entries | |
*/ | |
public HashIndex(int numEntries, int entryBytes) { | |
this(numEntries, entryBytes, DEFAULT_LOAD_FACTOR); | |
} | |
/** | |
* Construct a new {@link HashIndex} sized to the given number of entries, with a {@link Byte#BYTES} entry size, | |
* and load factor. | |
* | |
* @param numEntries the number of expected mappings | |
* @param entryBytes the number of entries | |
* @param loadFactor the load factor | |
*/ | |
public HashIndex(int numEntries, int entryBytes, float loadFactor) { | |
if (entryBytes > Long.BYTES) { | |
throw new IllegalArgumentException("maximum alignment size is " + Long.BYTES + " bytes"); | |
} | |
if (!PERMITTED_SIZES.contains(entryBytes)) { | |
throw new IllegalArgumentException("entry size must be one of: " + PERMITTED_SIZES); | |
} | |
this.entryBytes = entryBytes; | |
this.capacity = twinnedPrimeTableSizeFor(numEntries, loadFactor); | |
long bufferCapacity = capacity * (long) entryBytes; | |
if (bufferCapacity > Integer.MAX_VALUE) { | |
throw new IllegalArgumentException("capacity exceeds array limit"); | |
} | |
entries = ByteBuffer.allocate((int) bufferCapacity); | |
} | |
/** | |
* Returns the value that a given key maps to, using a {@link LongFunction} to map the return type to an | |
* {@link Optional} of {@link T}. A present {@link Optional} indicates a match, causing it to be returned. Returning | |
* {@link Optional#empty()} indicates the value was not a match, and the operation should continue until known to be | |
* absent. | |
* | |
* @param key the key to find a value for | |
* @param valueMapper a mapper from a value to a return type of {@link T} | |
* @return the optional result | |
*/ | |
public Optional<T> get(long key, LongFunction<Optional<T>> valueMapper) { | |
AtomicReference<Optional<T>> reference = new AtomicReference<>(Optional.empty()); | |
OptionalNumber value = getValue(key, candidate -> { | |
Optional<T> result = valueMapper.apply(candidate); | |
if (result.isPresent()) { | |
reference.set(result); | |
} | |
return result.isPresent(); | |
}); | |
return value.isPresent ? reference.get() : Optional.empty(); | |
} | |
/** | |
* Returns the value that the given key is mapped to, with equality determined by {@link LongPredicate}. The return | |
* value is an {@link OptionalNumber} rather than {@link LongPredicate} to avoid accidental narrowing of the return | |
* value by the caller. | |
* | |
* @param key the key | |
* @param valueEquals a predicate to test if a value is considered equal for the given key | |
* @return the optional result | |
*/ | |
public OptionalNumber getValue(long key, LongPredicate valueEquals) { | |
int idx = probe(key, valueEquals); | |
if (idx >= 0) { | |
long value = get(idx * entryBytes); | |
value = value > 0 ? value - 1 : value; | |
return OptionalNumber.of(value); | |
} | |
return OptionalNumber.empty(); | |
} | |
/** | |
* Insert a value into the index for a given key. | |
* | |
* @param key the key to be hashed for index insertion | |
* @param value the value to be inserted | |
* @throws IndexOutOfBoundsException if the backing array is full | |
*/ | |
public void put(long key, long value) { | |
int entryBits = entryBytes * Byte.SIZE; | |
if (Long.numberOfTrailingZeros(~value) + 1 == entryBits) { | |
throw new IllegalArgumentException(value + " is reserved for " + entryBytes + " byte entries"); | |
} | |
checkNarrowingNotLossy(value, entryBits); | |
long storedValue = value >= 0 ? value + 1 : value; | |
int idx = probe(key, existing -> existing == storedValue); | |
if (idx == -1) { | |
throw new IndexOutOfBoundsException("the table is full"); | |
} else if (idx < 0) { | |
idx = -(idx + 2); | |
int index = idx * entryBytes; | |
switch (entryBits) { | |
case Byte.SIZE: { | |
entries.put(index, (byte) storedValue); | |
break; | |
} | |
case Short.SIZE: { | |
entries.putShort(index, (short) storedValue); | |
break; | |
} | |
case Integer.SIZE: { | |
entries.putInt(index, (int) storedValue); | |
break; | |
} | |
case Long.SIZE: { | |
entries.putLong(index, storedValue); | |
break; | |
} | |
default: { | |
throw new IllegalArgumentException(); | |
} | |
} | |
} | |
} | |
private long get(int i) { | |
int entryBits = entryBytes * Byte.SIZE; | |
long value; | |
switch (entryBits) { | |
case Byte.SIZE: { | |
value = entries.get(i); | |
break; | |
} | |
case Short.SIZE: { | |
value = entries.getShort(i); | |
break; | |
} | |
case Integer.SIZE: { | |
value = entries.getInt(i); | |
break; | |
} | |
case Long.SIZE: { | |
value = entries.getLong(i); | |
break; | |
} | |
default: | |
throw new IllegalStateException(); | |
} | |
return value; | |
} | |
// Twin prime double hashing as suggested by Knuth. See Data Structures and Other Objects Using C++ (4th Edition), Page 614 | |
private int probe(long key, LongPredicate valueEquals) { | |
int idx = index(key); | |
int attempts = 0; | |
while (attempts < capacity) { | |
long value = get(idx * entryBytes); | |
if (value == UNUSED) { | |
return -idx - 2; | |
} else if (valueEquals.test(value > 0 ? value - 1 : value)) { | |
return idx; | |
} | |
idx = Math.floorMod(idx + offset(key), capacity); | |
attempts++; | |
} | |
return -1; | |
} | |
private int index(long key) { | |
return Math.floorMod(Long.hashCode(key), capacity); | |
} | |
private int offset(long key) { | |
return 1 + (Math.floorMod(Long.hashCode(key), (capacity - 2))); | |
} | |
private int twinnedPrimeTableSizeFor(int numEntries, float loadFactor) { | |
int size = (int) Math.ceil(numEntries / (double) loadFactor); | |
BigInteger twin = BigInteger.valueOf(size).nextProbablePrime(); | |
BigInteger candidate = twin.nextProbablePrime(); | |
while (candidate.subtract(twin).intValue() != 2 | |
|| !(isPrime(twin.intValue()) && isPrime(candidate.intValue()))) { | |
twin = candidate; | |
candidate = twin.nextProbablePrime(); | |
} | |
return candidate.intValue(); | |
} | |
private static boolean isPrime(int candidate) { | |
if ((candidate & 1) != 0) { | |
int limit = (int) Math.sqrt(candidate); | |
for (int divisor = 3; divisor <= limit; divisor += 2) { | |
if ((candidate % divisor) == 0) { | |
return false; | |
} | |
} | |
return true; | |
} | |
return candidate == 2; | |
} | |
private static void checkNarrowingNotLossy(long value, int entryBits) { | |
if (entryBits == Long.SIZE) { | |
return; | |
} | |
long minValue = -(1L << entryBits - 1); | |
long maxValue = ~minValue; | |
if (value < minValue || value > maxValue) { | |
throw new IllegalArgumentException("value is larger than " + entryBits + " bits"); | |
} | |
} | |
/** | |
* An equivalent to {@link java.util.OptionalLong}, but ensures that data isn't lost when values are narrowed. | |
*/ | |
public static final class OptionalNumber { | |
private static final OptionalNumber EMPTY = new OptionalNumber(); | |
private final boolean isPresent; | |
private final long value; | |
private OptionalNumber() { | |
this.isPresent = false; | |
this.value = 0; | |
} | |
public static OptionalNumber empty() { | |
return EMPTY; | |
} | |
private OptionalNumber(long value) { | |
this.isPresent = true; | |
this.value = value; | |
} | |
public static OptionalNumber of(long value) { | |
return new OptionalNumber(value); | |
} | |
/** | |
* Returns if a value is present. | |
* | |
* @return {@code true} if a value is present | |
*/ | |
public boolean isPresent() { | |
return isPresent; | |
} | |
/** | |
* Get the returned value if present as a {@code byte}. | |
* | |
* @return the value if present | |
* @throws NoSuchElementException if no value is present | |
*/ | |
public byte getAsByte() { | |
checkNarrowingNotLossy(value, Byte.SIZE); | |
return (byte) get(); | |
} | |
/** | |
* Get the returned value if present as a {@code short}. | |
* | |
* @return the value if present | |
* @throws NoSuchElementException if no value is present | |
*/ | |
public short getAsShort() { | |
checkNarrowingNotLossy(value, Short.SIZE); | |
return (short) get(); | |
} | |
/** | |
* Get the returned value if present as a {@code int}. | |
* | |
* @return the value if present | |
* @throws NoSuchElementException if no value is present | |
*/ | |
public int getAsInt() { | |
checkNarrowingNotLossy(value, Integer.SIZE); | |
return (int) get(); | |
} | |
/** | |
* Get the returned value if present. | |
* | |
* @return the value if present | |
* @throws NoSuchElementException if no value is present | |
*/ | |
public long get() { | |
if (!isPresent) { | |
throw new NoSuchElementException("No value present"); | |
} | |
return value; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I couldn't find any prior art for a hash index that only stores values and used external equality. The default load factor of
0.5
provides excellent performance, but0.75
is a fine tradeoff for space, but will degrade significantly as you approach1.0
. If you're only storing positive values, you can double your value space by subtractingMIN_VALUE
for the type from your value.Tests for the class are available in a separate Gist.