Created
August 13, 2019 18:20
-
-
Save hkulekci/9a3239341d90b1bd45d75f38aa51de0d to your computer and use it in GitHub Desktop.
Elasticsearch UUID Generator Part
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//******************************************************************* | |
// Welcome to CompileJava! | |
// If you experience any issues, please contact us ('More Info') --> | |
// Sorry that the "Paste" feature no longer works! GitHub broke it. | |
//******************************************************************* | |
import java.lang.Math; // headers MUST be above the first class | |
/* | |
* Licensed to Elasticsearch under one or more contributor | |
* license agreements. See the NOTICE file distributed with | |
* this work for additional information regarding copyright | |
* ownership. Elasticsearch licenses this file to you under | |
* the Apache License, Version 2.0 (the "License"); you may | |
* not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, | |
* software distributed under the License is distributed on an | |
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
* KIND, either express or implied. See the License for the | |
* specific language governing permissions and limitations | |
* under the License. | |
*/ | |
import java.util.Base64; | |
import java.util.concurrent.atomic.AtomicInteger; | |
import java.security.SecureRandom; | |
import java.net.NetworkInterface; | |
import java.net.SocketException; | |
import java.util.Enumeration; | |
// one class needs to have a main() method | |
public class HelloWorld | |
{ | |
// arguments are passed using the text field below this editor | |
public static void main(String[] args) | |
{ | |
TimeBasedUUIDGenerator a = new TimeBasedUUIDGenerator(); | |
System.out.println(a.getBase64UUID()); | |
} | |
} | |
interface UUIDGenerator { | |
String getBase64UUID(); | |
} | |
/** | |
* These are essentially flake ids but we use 6 (not 8) bytes for timestamp, and use 3 (not 2) bytes for sequence number. We also reorder | |
* bytes in a way that does not make ids sort in order anymore, but is more friendly to the way that the Lucene terms dictionary is | |
* structured. | |
* For more information about flake ids, check out | |
* https://archive.fo/2015.07.08-082503/http://www.boundary.com/blog/2012/01/flake-a-decentralized-k-ordered-unique-id-generator-in-erlang/ | |
*/ | |
class TimeBasedUUIDGenerator implements UUIDGenerator { | |
// We only use bottom 3 bytes for the sequence number. Paranoia: init with random int so that if JVM/OS/machine goes down, clock slips | |
// backwards, and JVM comes back up, we are less likely to be on the same sequenceNumber at the same time: | |
private final AtomicInteger sequenceNumber = new AtomicInteger(SecureRandomHolder.INSTANCE.nextInt()); | |
// Used to ensure clock moves forward: | |
private long lastTimestamp; | |
private static final byte[] SECURE_MUNGED_ADDRESS = MacAddressProvider.getSecureMungedAddress(); | |
static { | |
assert SECURE_MUNGED_ADDRESS.length == 6; | |
} | |
// protected for testing | |
protected long currentTimeMillis() { | |
return System.currentTimeMillis(); | |
} | |
// protected for testing | |
protected byte[] macAddress() { | |
return SECURE_MUNGED_ADDRESS; | |
} | |
@Override | |
public String getBase64UUID() { | |
final int sequenceId = sequenceNumber.incrementAndGet() & 0xffffff; | |
long timestamp = currentTimeMillis(); | |
synchronized (this) { | |
// Don't let timestamp go backwards, at least "on our watch" (while this JVM is running). We are still vulnerable if we are | |
// shut down, clock goes backwards, and we restart... for this we randomize the sequenceNumber on init to decrease chance of | |
// collision: | |
timestamp = Math.max(lastTimestamp, timestamp); | |
if (sequenceId == 0) { | |
// Always force the clock to increment whenever sequence number is 0, in case we have a long time-slip backwards: | |
timestamp++; | |
} | |
lastTimestamp = timestamp; | |
} | |
final byte[] uuidBytes = new byte[15]; | |
int i = 0; | |
// We have auto-generated ids, which are usually used for append-only workloads. | |
// So we try to optimize the order of bytes for indexing speed (by having quite | |
// unique bytes close to the beginning of the ids so that sorting is fast) and | |
// compression (by making sure we share common prefixes between enough ids), | |
// but not necessarily for lookup speed (by having the leading bytes identify | |
// segments whenever possible) | |
// Blocks in the block tree have between 25 and 48 terms. So all prefixes that | |
// are shared by ~30 terms should be well compressed. I first tried putting the | |
// two lower bytes of the sequence id in the beginning of the id, but compression | |
// is only triggered when you have at least 30*2^16 ~= 2M documents in a segment, | |
// which is already quite large. So instead, we are putting the 1st and 3rd byte | |
// of the sequence number so that compression starts to be triggered with smaller | |
// segment sizes and still gives pretty good indexing speed. We use the sequenceId | |
// rather than the timestamp because the distribution of the timestamp depends too | |
// much on the indexing rate, so it is less reliable. | |
uuidBytes[i++] = (byte) sequenceId; | |
// changes every 65k docs, so potentially every second if you have a steady indexing rate | |
uuidBytes[i++] = (byte) (sequenceId >>> 16); | |
// Now we start focusing on compression and put bytes that should not change too often. | |
uuidBytes[i++] = (byte) (timestamp >>> 16); // changes every ~65 secs | |
uuidBytes[i++] = (byte) (timestamp >>> 24); // changes every ~4.5h | |
uuidBytes[i++] = (byte) (timestamp >>> 32); // changes every ~50 days | |
uuidBytes[i++] = (byte) (timestamp >>> 40); // changes every 35 years | |
byte[] macAddress = macAddress(); | |
assert macAddress.length == 6; | |
System.arraycopy(macAddress, 0, uuidBytes, i, macAddress.length); | |
i += macAddress.length; | |
// Finally we put the remaining bytes, which will likely not be compressed at all. | |
uuidBytes[i++] = (byte) (timestamp >>> 8); | |
uuidBytes[i++] = (byte) (sequenceId >>> 8); | |
uuidBytes[i++] = (byte) timestamp; | |
assert i == uuidBytes.length; | |
return Base64.getUrlEncoder().withoutPadding().encodeToString(uuidBytes); | |
} | |
} | |
class MacAddressProvider { | |
private static byte[] getMacAddress() throws SocketException { | |
Enumeration<NetworkInterface> en = NetworkInterface.getNetworkInterfaces(); | |
if (en != null) { | |
while (en.hasMoreElements()) { | |
NetworkInterface nint = en.nextElement(); | |
if (!nint.isLoopback()) { | |
// Pick the first valid non loopback address we find | |
byte[] address = nint.getHardwareAddress(); | |
if (isValidAddress(address)) { | |
return address; | |
} | |
} | |
} | |
} | |
// Could not find a mac address | |
return null; | |
} | |
private static boolean isValidAddress(byte[] address) { | |
if (address == null || address.length != 6) { | |
return false; | |
} | |
for (byte b : address) { | |
if (b != 0x00) { | |
return true; // If any of the bytes are non zero assume a good address | |
} | |
} | |
return false; | |
} | |
public static byte[] getSecureMungedAddress() { | |
byte[] address = null; | |
try { | |
address = getMacAddress(); | |
} catch (SocketException e) { | |
// address will be set below | |
} | |
if (!isValidAddress(address)) { | |
address = constructDummyMulticastAddress(); | |
} | |
byte[] mungedBytes = new byte[6]; | |
SecureRandomHolder.INSTANCE.nextBytes(mungedBytes); | |
for (int i = 0; i < 6; ++i) { | |
mungedBytes[i] ^= address[i]; | |
} | |
return mungedBytes; | |
} | |
private static byte[] constructDummyMulticastAddress() { | |
byte[] dummy = new byte[6]; | |
SecureRandomHolder.INSTANCE.nextBytes(dummy); | |
/* | |
* Set the broadcast bit to indicate this is not a _real_ mac address | |
*/ | |
dummy[0] |= (byte) 0x01; | |
return dummy; | |
} | |
} | |
class SecureRandomHolder { | |
// class loading is atomic - this is a lazy & safe singleton to be used by this package | |
public static final SecureRandom INSTANCE = new SecureRandom(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment