#include "main.h"
#include <assert.h>
#include <getopt.h>
#include <hydrogen.h>
#include <math.h>
#include <mpc.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <uv.h>
#include "storage.h"
static bool store(hex_storage_t* storage,
                  hex_hash_link_t* chain,
                  uint8_t depth,
                  uv_buf_t buffer,
                  size_t size) {
  // Make sure we didn't try to store a file larger than 32 exabytes.
  if (depth >= MAX_RECURSION) {
    fprintf(stderr, "Maximum block hash chain recursion depth reached!\n");
    return false;
  }
  // Get the address of the next hash entry and move the index.
  uint8_t* hash = chain[depth].hashes + (HASH_SIZE * chain[depth].index++);
  // Store the data and record the hash.
  bool success = hex_storage_save(
      storage, buffer, uv_buf_init((char*)(hash + 1), HASH_SIZE - 1));
  // Record the recursion depth
  hash[0] = depth;
  // Increment the real size in the chain level
  chain[depth].size += size;
  return success;
}
static void dump_chain(hex_hash_link_t* chain) {
  fprintf(stderr, "\033[1;35mCHAIN DUMP\033[0m\n");
  for (int i = 0; i < MAX_RECURSION; i++) {
    // if (chain[i].index) {
    fprintf(stderr, "Level %i length=%d size=%ld\n", i, chain[i].index,
            chain[i].size);
    // for (int j = 0; j < chain[i].index; j++) {
    //   uint8_t* hash = &chain[i].hashes[j * HASH_SIZE];
    //   fprintf(stderr, "%d-", hash[0]);
    //   for (int i = 1; i < HASH_SIZE; i++) {
    //     fprintf(stderr, "%02x", hash[i]);
    //   }
    //   fprintf(stderr, "\n");
    // }
    // }
  }
}
static int import(FILE* input, FILE* output, const char* base_path) {
  hex_storage_t storage = {// uv_loop_t* loop;
                           uv_default_loop(),
                           // uv_buf_t base_path;
                           base_path};
  uint8_t buffer[BLOCK_SIZE];
  hex_hash_link_t chain[MAX_RECURSION];
  memset(chain, 0, sizeof(chain));
  while (!feof(input)) {
    size_t n = fread(buffer, 1, BLOCK_SIZE, input);
    if (!n) {
      continue;
    }
    // Store the data chunk.
    if (!store(&storage, chain, 0, uv_buf_init((char*)buffer, n), n)) {
      return -1;
    }
    for (int i = 0;; i++) {
      // If the level is full, we need to store it as a block and push to the
      // next level.
      if (chain[i].index == BLOCK_SIZE / HASH_SIZE) {
        if (!store(&storage, chain, i + 1,
                   uv_buf_init((char*)chain[i].hashes, sizeof(chain[i].hashes)),
                   chain[i].size)) {
          return -1;
        }
        chain[i].size = 0;
        chain[i].index = 0;
      } else {
        break;
      }
    }
  }
  if (hex_verbose) {
    dump_chain(chain);
  }
  uint8_t recursion = 0;
  // Quickly scan for levels of recursion used.
  for (int i = 0; i < MAX_RECURSION; i++) {
    if (chain[i].size > 0) {
      recursion = i;
    }
  }
  // After reaching EOS, we need to flush all partial blocks.
  for (int i = 0; i <= recursion; i++) {
    if (chain[i].index > (i < recursion ? 0 : 1)) {
      if (!store(&storage, chain, i + 1,
                 uv_buf_init((char*)chain[i].hashes, HASH_SIZE * chain[i].index),
                 chain[i].size)) {
        return -1;
      }
      chain[i].size = 0;
      chain[i].index = 0;
    }
  }
  if (hex_verbose) {
    dump_chain(chain);
  }
  uint8_t* final_hash = NULL;
  size_t final_size = 0;
  for (int i = 0; i < MAX_RECURSION; i++) {
    if (chain[i].size > 0) {
      final_hash = &chain[i].hashes[0];
      final_size = chain[i].size;
    }
  }
  if (!hex_quiet) {
    fprintf(stderr, "Imported \033[1;32m%zu\033[0m bytes\n", final_size);
  }
  if (final_hash) {
    for (int i = 0; i < HASH_SIZE; i++) {
      fprintf(output, "%02x", final_hash[i]);
    }
  }
  fprintf(output, "\n");
  return 0;
}
int main(int argc, char** argv) {
  argv = uv_setup_args(argc, argv);
  const char* name = "\033[1;32mimport\033[0m";
  const char* description =
      "Import file into object store and emit final hash.";
  FILE* output = NULL;
  FILE* input = NULL;
  char input_path[PATH_MAX + 1];
  char output_path[PATH_MAX + 1];
  char objects_dir[PATH_MAX + 1];
  objects_dir[0] = 0;
  int c;
  bool usage = false;
  while ((c = getopt(argc, argv, "hqvs:i:o:")) != -1) {
    switch (c) {
      case 'h':
        usage = true;
        break;
      case 'v':
        hex_verbose = true;
        break;
      case 'q':
        hex_quiet = true;
        break;
      case 's':
        strncpy(objects_dir, optarg, PATH_MAX);
        break;
      case 'i':
        snprintf(input_path, PATH_MAX, "'%s'", optarg);
        input = fopen(optarg, "r");
        if (!input) {
          fprintf(stderr, "%s: '%s'\n", strerror(errno), optarg);
          return -1;
        }
        break;
      case 'o':
        snprintf(output_path, PATH_MAX, "'%s'", optarg);
        output = fopen(optarg, "w");
        if (!output) {
          fprintf(stderr, "%s: '%s'\n", strerror(errno), optarg);
          return -1;
        }
        break;
      case '?':
        return 1;
      default:
        abort();
    }
  }
  for (int i = optind; i < argc; i++) {
    if (!input) {
      input = fopen(argv[i], "r");
      if (!input) {
        fprintf(stderr, "%s: '%s'\n", strerror(errno), argv[i]);
        return -1;
      }
      snprintf(input_path, PATH_MAX, "'%s'", argv[i]);
    } else if (!output) {
      output = fopen(argv[i], "w");
      if (!output) {
        fprintf(stderr, "%s: '%s'\n", strerror(errno), argv[i]);
        return -1;
      }
      snprintf(output_path, PATH_MAX, "'%s'", argv[i]);
    } else if (!*objects_dir) {
      strncpy(objects_dir, argv[i], PATH_MAX);
    } else {
      fprintf(stderr, "Unexpected argument: '%s'\n", argv[i]);
      usage = true;
    }
  }
  if (!input) {
    if (isatty(0)) {
      usage = true;
      fprintf(stderr, "Refusing to read from stdin TTY\n");
    } else {
      snprintf(input_path, PATH_MAX, "(stdin)");
      input = stdin;
    }
  }
  if (!output) {
    // if (isatty(1)) {
    //   fprintf(stderr, "Refusing to write to stdout TTY\n");
    //   usage = true;
    // } else {
    snprintf(output_path, PATH_MAX, "(stdout)");
    output = stdout;
    // }
  }
  if (usage) {
    fprintf(stderr, "hex.rocks - %s\n", description);
    fprintf(stderr,
            "Usage: %s\n -s path/to/objects\n -i input\n -o output\n "
            "-h\n -v\n -q\n",
            argv[0]);
    return -1;
  }
  if (!hex_quiet) {
    char hostname[PATH_MAX + 1];
    gethostname(hostname, PATH_MAX);
    char message[PATH_MAX * 5 + 1];
    if (*objects_dir) {
      snprintf(message, PATH_MAX * 5,
               "\033[0;37m%s\033[0m: hex.rocks %s from \033[1;35m%s\033[0m to "
               "\033[1;36m%s\033[0m using '\033[1;33m%s\033[0m'.\n",
               hostname, name, input_path, output_path, objects_dir);
    } else {
      snprintf(message, PATH_MAX * 5,
               "\033[0;37m%s\033[0m: hex.rocks %s from \033[1;35m%s\033[0m to "
               "\033[1;36m%s\033[0m.\n",
               hostname, name, input_path, output_path);
    }
    fprintf(stderr, "%s", message);
  }
  if (!*objects_dir) {
    snprintf(objects_dir, PATH_MAX, "%s/.hex", getenv("HOME"));
  }
  if (hex_verbose)
    fprintf(stderr, "Objects: '%s'\n", objects_dir);
  int ret = import(input, output, objects_dir);
  if (output != stdout)
    fclose(output);
  if (input != stdin)
    fclose(input);
  return ret;
}

Input data is split into 64K chunks and each chunk is hashed and stored in the object store.

Sparse data support?

Data can be encoded using the caify protocol, which supports basic compression on large sparse FS images.

The format allows recording up to 256 repetitions of a hash, which means a large file full of zeroes uses 32 bytes per 16MB of blank space. A 1 terabyte blank file will use 2MB of index. Do we want to allow general purpose compression instead? Without compression or repetition, a 1T blank file uses 512MB of uncompressed index, but it's highly repetitive.
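
As a sanity check on those numbers, here is a minimal sketch assuming 64K blocks, 32-byte hash entries, and a hypothetical 256-repetition run-length field; it reproduces the 512MB and 2MB figures above.

#include <stdint.h>
#include <stdio.h>

// Back-of-the-envelope check of the index sizes claimed above, assuming
// 64K blocks, 32-byte hash entries, and up to 256 repetitions per entry.
int main(void) {
  const uint64_t block = 64 * 1024, hash = 32, reps = 256;
  const uint64_t file = 1ULL << 40;  // 1 terabyte of zeroes

  uint64_t plain = file / block * hash;         // one entry per block
  uint64_t rle = file / (block * reps) * hash;  // one entry per 256 blocks

  printf("without repetition: %llu MB of index\n",
         (unsigned long long)(plain >> 20));
  printf("with repetition:    %llu MB of index\n",
         (unsigned long long)(rle >> 20));
  return 0;
}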

Let's drop compression for now until a real use case arises.

Recursive hashes

Ideally the hash is self-describing so that large files can recursively be encoded into a single hash. We can encode the level of recursion in the hash. This has several benefits:

  • we can quickly find a rough estimate of the content size based on the hash alone (0, 64K, 128M, 256G, 512T, 1E, ...).
  • This may be too rough; it would be great if we knew the size to within a power of two. We could still determine recursion based on this with some conventions. A single byte that's the log2 of the size holds maximum bounds up to 5.021681388309345e+58 Exabytes!
  • These sizes are much finer grained for estimates (0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1K, 2K, 4K, ...)
  • the levels of recursion can be found by dividing the log2 of the size by 11, since each level compresses the data by a factor of 2^11 (32 bytes per hash, 64K per block); see the sketch after this list.
  • the 31 byte Gimli hash is still quite collision resistant and its hex format fits in a DNS subdomain.
  • The recursion also helps with giant sparse spaces.
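
A quick illustration of that 2^11 factor, assuming 64K blocks and 32-byte hashes as above: each level of recursion multiplies the maximum addressable size by 2048, which reproduces the 0, 64K, 128M, 256G, 512T, 1E progression.

#include <stdint.h>
#include <stdio.h>

// With 64K blocks and 32-byte hashes, one block holds 2048 child hashes,
// so every level of recursion multiplies the maximum size by 2^11.
int main(void) {
  for (int depth = 0; depth <= 4; depth++) {
    uint64_t max_size = 64 * 1024;    // depth 0: one raw 64K block
    for (int i = 0; i < depth; i++) {
      max_size *= 64 * 1024 / 32;     // 2048 hashes per block
    }
    printf("depth %d -> up to %llu bytes\n", depth,
           (unsigned long long)max_size);
  }
  return 0;
}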

Smaller blocks

64K is nice for filling out blocks with 16-bit lengths, but it spills over if there is any overhead. Also, the levels of recursion line up on 11-bit boundaries, but we're used to 10-bit boundaries because of the 1024-based K/M/G prefixes.

What if we use 32K blocks? The number of blocks for any given piece of data is only doubled, but the chance of fitting a message (including headers, signature, etc.) into a 16-bit length is much higher.

Also, our recursion boundaries are nicer since they line up:

  • up to 1MB - single level (most small files)
  • up to 1GB - double level (most medium files and many large files)
  • up to 1TB - triple level (large files)
  • up to 1PB - quad level (entire hard-drive archives)
  • ...

Movies, which tend to straddle the 1GB size, will split into two different slots, which helps with testing.

Revisiting sparse files

Using the uncompressed but recursive format, we need the following space for various sizes of sparse files:

  1. 64K of zeroes -> 32 byte hash (up to 64K empty)
  2. 64K of 2048 hashes (up to 128M empty)
  3. 64K of 2048 hashes (up to 256G empty)
  4. 64K of 2048 hashes (up to 512T empty)

So we would need 4 * 64K blocks (256K) + the 32 byte final hash to store a 1T empty file.

If we use the smaller block size, it's not quite as good, but still fine:

  1. 32K of zeroes -> 32 byte hash (up to 32K empty)
  2. 32K of 1024 hashes (up to 32M empty)
  3. 32K of 1024 hashes (up to 32G empty)
  4. 32K of 1024 hashes (up to 32T empty)

So we end up needing 4 * 32K blocks (128K) for the 1T empty file! Half the space and the same level of recursion.
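
Here is a small sketch of the reasoning above. It assumes that every block at a given level of an all-zeros file is identical and therefore stored only once, and it reproduces the rough 256K versus 128K comparison for both block sizes.

#include <stdint.h>
#include <stdio.h>

// Rough upper bound on the space for an all-zeros file: since every block at
// a given level is identical, each level stores at most one block.
static void sparse_cost(uint64_t block, uint64_t hash, uint64_t file_size) {
  uint64_t covered = block;  // bytes covered by one hash at the current level
  int levels = 1;            // the raw zero block itself
  while (covered < file_size) {
    covered *= block / hash;  // each level fans out by (block / hash) children
    levels++;
  }
  printf("%lluK blocks: %d blocks stored, roughly %lluK total\n",
         (unsigned long long)(block / 1024), levels,
         (unsigned long long)(levels * block / 1024));
}

int main(void) {
  uint64_t tb = 1ULL << 40;       // the 1T empty file from the text
  sparse_cost(64 * 1024, 32, tb);  // 64K blocks
  sparse_cost(32 * 1024, 32, tb);  // 32K blocks
  return 0;
}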

I think it's safe to say that recursion handles the large sparse space problem well, and the smaller block size actually helps in most cases.

Final design

  • 32 byte hashes, but the first byte is reserved for a size byte. The byte is defined as byte = Math.ceil(Math.log2(size)), which inverted implies size > Math.pow(2, byte - 1) && size <= Math.pow(2, byte) (except for zero, where it's all zeroes).
  • the remaining 31 bytes can be a Gimli permutation (hash) for now; we may add more metadata in the future.
  • the size byte gets smaller as you go down in recursion. This way a sub-chunk of a large file is also a valid standalone hash of that chunk.
  • if sizeByte <= 15 there is no recursion.
  • if sizeByte <= 25 there is 1 level of recursion.
  • if sizeByte <= 35 there are 2 levels of recursion.
  • ...

In the recursive cases, the contents are interpreted as tightly packed hashes that need to be recursively expanded.
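
Here is a minimal sketch of that convention, assuming 32K blocks and 32-byte hashes (so each level of recursion adds 10 bits to the size byte). The helper names are illustrative only, not part of the gist's code.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Size byte as defined above: byte = ceil(log2(size)), with 0 meaning size <= 1.
static uint8_t size_byte(uint64_t size) {
  if (size <= 1) return 0;
  return (uint8_t)ceil(log2((double)size));
}

// Recursion depth implied by the size byte, assuming 32K blocks: no recursion
// up to byte 15, then one extra level for every additional 10 bits.
static int recursion_levels(uint8_t byte) {
  if (byte <= 15) return 0;      // fits in a single 32K block
  return (byte - 15 + 9) / 10;   // <= 25 -> 1 level, <= 35 -> 2 levels, ...
}

int main(void) {
  uint64_t sizes[] = {100, 32 * 1024, 32 * 1024 + 1, 1ULL << 30, 1ULL << 40};
  for (int i = 0; i < 5; i++) {
    int b = size_byte(sizes[i]);
    printf("size=%llu byte=%d levels=%d\n", (unsigned long long)sizes[i], b,
           recursion_levels((uint8_t)b));
  }
  return 0;
}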

This design allows fairly efficient random access of very large files.

For streaming videos, this should make seeking and reading the end bytes very low latency operations.

Example

Let's assume a much simpler system to make it easier to visualize: hashes are 4 bytes with the first byte being the size byte, and blocks are 16 bytes, or 4 hashes each. (This is obviously going to have hash collisions in practice, but otherwise it is sound.)

  • no recursion holds up to 16 bytes
  • 1 level recursion holds up to 64 bytes
  • 2 levels recursion holds up to 256 bytes
  • ...

Let's store the value [1,2,3,4,...,99,100]:

First, chunk the data for the first level of recursion...

[1...16] >- [4,...h1]
[17...32] >- [4,...h2]
[33...48] >- [4,...h3]
[49...64] >- [4,...h4]
[65...80] >- [4,...h5]
[81...96] >- [4,...h6]
[97...100] >- [2,...h7]

Notice that the last chunk is not full and we can see that in the size estimate byte. Let's move on to the next level of recursion...

[4,...h1,4,...h2,4,...h3,4,...h4] >- [6,...h8]
[4,...h5,4,...h6,2,...h7] >- [6,...h9]

Now things are getting interesting, but since there is more than one hash, we need another level of recursion...

[6,...h8,6,...h9] >- [7,...h10]
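
For reference, a tiny sketch using the toy parameters above computes the same size bytes. Note that h9 covers 16 + 16 + 4 = 36 bytes, which is why its size byte comes out as 6 (36 is more than 2^5 but at most 2^6).

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Size bytes for the toy example (4-byte hashes, 16-byte blocks). Only the
// size bytes matter for the walkthrough; the hash parts are elided.
static int size_byte(uint64_t size) {
  return size <= 1 ? 0 : (int)ceil(log2((double)size));
}

int main(void) {
  // Level 1: seven 16-byte chunks of the 100-byte value (last one partial).
  uint64_t chunks[7] = {16, 16, 16, 16, 16, 16, 4};
  for (int i = 0; i < 7; i++)
    printf("h%d covers %llu bytes -> size byte %d\n", i + 1,
           (unsigned long long)chunks[i], size_byte(chunks[i]));
  // Level 2: each block holds 4 hashes, so h8 covers h1..h4 and h9 covers h5..h7.
  printf("h8 covers 64 bytes -> size byte %d\n", size_byte(64));
  printf("h9 covers 36 bytes -> size byte %d\n", size_byte(36));
  // Level 3: the root h10 covers the full 100 bytes.
  printf("h10 covers 100 bytes -> size byte %d\n", size_byte(100));
  return 0;
}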

Decoding this, we start with [7,...h10] and know that the full value is somewhere between 64 and 128 bytes in size. (The real value was 100 bytes.)

To expand, we simply go in reverse: whenever the size byte is larger than 4, we know we need to recurse more. What's more, we can statically calculate the offsets in the final file to perform random access.

Suppose we want the byte at offset 90, which should contain 91. First we load the root hash.

[7,...h10] -> [6,...h8,6,...h9]

This gives us two hashes. We know there is another level of recursion left, so they hold up to 64 bytes each. Since we want byte offset 90, we load the second hash only.

[6,...h9] -> [4,...h5,4,...h6,2,...h7]

We see these are the last level of recursion and hold up to 16 bytes each. They start at offsets 64, 80, and 96, so we grab the middle one.

[4,...h6] -> [81...96]

We now have the raw bytes starting at offset 80; we want offset 90, so we read at offset 10 and get 91!

We only needed 3 loads to grab an arbitrary byte. In a real data set, 3 levels of blocks would cover a file up to 32G in size, and we would only need to follow two 32-byte hashes (one per index level) to read a random byte. The real API will likely read ranges.
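
The offset arithmetic behind that walkthrough is simple division and remainder. This sketch, again with the toy parameters and purely illustrative names, picks the child hash to follow at each level for offset 90.

#include <stdint.h>
#include <stdio.h>

// Toy parameters from the example: 16-byte blocks, 4-byte hashes,
// so each index block holds 4 child hashes.
#define TOY_BLOCK_SIZE 16u
#define TOY_FANOUT 4u

// Bytes of original data covered by one hash at a given recursion depth.
static uint64_t span_at_depth(unsigned depth) {
  uint64_t span = TOY_BLOCK_SIZE;
  while (depth--) span *= TOY_FANOUT;
  return span;
}

int main(void) {
  uint64_t offset = 90;  // we want the byte that should contain 91
  for (unsigned depth = 2; depth > 0; depth--) {
    uint64_t span = span_at_depth(depth - 1);  // span of each child hash
    printf("depth %u: follow child %llu, offset %llu remains\n", depth,
           (unsigned long long)(offset / span),
           (unsigned long long)(offset % span));
    offset %= span;
  }
  printf("read byte %llu of the leaf block\n", (unsigned long long)offset);
  return 0;
}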

Getting exact size

Given the current design, we don't know the exact size of the value without actually loading the tail blocks. There is some extra precision in the size byte since its range is ridiculously large; maybe we could get enough precision by scaling the log so we don't need the extra lookup.

This is problematic. We would need to change the formula to byte = Math.ceil(Math.log2(size) * Math.pow(2, 15)) to get enough precision for 32K blocks, but that's already a 3-byte number for the non-recursive case! It might be better to just always fetch the final block(s) and count the length.

We could still shift the estimate to something reasonable. To start, what would a size byte of 255 currently mean?

It's up to size <= Math.pow(2, 255) or 5.021681388309345e+58 Exabytes!

If we change it to byte = Math.ceil(Math.log2(size) * 4) and size <= Math.floor(Math.pow(2, byte / 4)), the max size quickly drops down to about 13.5 Exabytes.

What kind of extra precision does this give us? What are the new breakpoints?

byte == 0 -> size <= 1
byte == 4 -> size <= 2
byte == 7 -> size <= 3
byte == 8 -> size <= 4
byte == 10 -> size <= 5
byte == 11 -> size <= 6
byte == 12 -> size <= 8
byte == 13 -> size <= 9
byte == 14 -> size <= 11
byte == 15 -> size <= 13
byte == 16 -> size <= 16
byte == 17 -> size <= 19
byte == 18 -> size <= 22
byte == 19 -> size <= 26
byte == 20 -> size <= 32
byte == 21 -> size <= 38
byte == 22 -> size <= 45
byte == 23 -> size <= 53
byte == 24 -> size <= 64
byte == 25 -> size <= 76
byte == 26 -> size <= 90
byte == 27 -> size <= 107
byte == 28 -> size <= 128
byte == 30 -> size <= 181
byte == 40 -> size <= 1K
byte == 50 -> size <= 5.66K
byte == 60 -> size <= 32K

This gives us a lot more precision, almost 60 unique sizes for non-recursive sizes alone! But it's ugly math. Is it worth it?

If we meet in the middle and divide by 2, we get:

byte == 0 -> size <= 1
byte == 2 -> size <= 2
byte == 4 -> size <= 4
byte == 5 -> size <= 5
byte == 6 -> size <= 8
byte == 7 -> size <= 11
byte == 8 -> size <= 16
byte == 9 -> size <= 22
byte == 10 -> size <= 32
byte == 11 -> size <= 45
byte == 12 -> size <= 64
byte == 13 -> size <= 90
byte == 14 -> size <= 128
byte == 15 -> size <= 181
byte == 16 -> size <= 256
byte == 17 -> size <= 362
byte == 18 -> size <= 512
byte == 19 -> size <= 724
byte == 20 -> size <= 1024
byte == 21 -> size <= 1448
byte == 22 -> size <= 2048
byte == 23 -> size <= 2896
byte == 24 -> size <= 4096
byte == 25 -> size <= 5792
byte == 26 -> size <= 8192
byte == 27 -> size <= 11585
byte == 28 -> size <= 16384
byte == 29 -> size <= 23170
byte == 30 -> size <= 32768

We get 30 size steps per recursion level, twice the 15 we get normally, but half the 60 in the divide-by-4 case. The math is still ugly, but it is more precise.
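
A small sketch can generate these breakpoint tables for any scale factor k, where byte = Math.ceil(Math.log2(size) * k) and size <= Math.floor(Math.pow(2, byte / k)). The prose tables above sample only some byte values; this prints every one, so adjacent bytes can repeat the same bound.

#include <math.h>
#include <stdio.h>

// Prints byte -> maximum size for the scaled size-byte formula described above.
static void breakpoints(double k, int max_byte) {
  for (int byte = 0; byte <= max_byte; byte++) {
    printf("byte == %d -> size <= %.0f\n", byte, floor(pow(2.0, byte / k)));
  }
}

int main(void) {
  breakpoints(1.0, 15);  // simple scheme: powers of two up to 32K
  breakpoints(2.0, 30);  // "meet in the middle": 30 steps per recursion level
  breakpoints(4.0, 60);  // divide-by-4 case: almost 60 steps
  return 0;
}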

I think I'll keep it the simple way unless there arises a reason to change it. This should look like something you've seen before:

byte == 0 -> size <= 1
byte == 1 -> size <= 2
byte == 2 -> size <= 4
byte == 3 -> size <= 8
byte == 4 -> size <= 16
byte == 5 -> size <= 32
byte == 6 -> size <= 64
byte == 7 -> size <= 128
byte == 8 -> size <= 256
byte == 9 -> size <= 512
byte == 10 -> size <= 1024
byte == 11 -> size <= 2048
byte == 12 -> size <= 4096
byte == 13 -> size <= 8192
byte == 14 -> size <= 16384
byte == 15 -> size <= 32768

Just record depth

Actually, recording the estimated size is neat, but it forces us to always use full blocks for everything except leaves, or we can't read the trees back. Let's simply store the recursion depth.
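
To make the depth convention concrete, here is a rough sketch of reading data back: hash[0] is the depth recorded by store() in the C file above, and the remaining 31 bytes address a block. The loader callback is a hypothetical stand-in, since the gist only shows the write side of the storage API.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define HASH_SIZE 32
#define BLOCK_SIZE (64 * 1024)

// Hypothetical loader: fills `block` with the object addressed by the 31-byte
// hash and returns its length (the gist's read-side API is not shown).
typedef size_t (*load_fn)(const uint8_t* hash, uint8_t* block, size_t max);

// Expands a depth-prefixed hash entry: depth 0 blocks are raw data, deeper
// blocks are tightly packed child entries that are expanded recursively.
// Note: recursion keeps one block buffer per level on the stack.
bool expand(const uint8_t* entry, load_fn load,
            void (*emit)(const uint8_t* data, size_t len)) {
  uint8_t block[BLOCK_SIZE];
  size_t len = load(entry + 1, block, sizeof(block));
  if (entry[0] == 0) {  // depth 0: raw data block
    emit(block, len);
    return true;
  }
  for (size_t i = 0; i + HASH_SIZE <= len; i += HASH_SIZE) {
    if (!expand(block + i, load, emit)) {  // each child carries its own depth
      return false;
    }
  }
  return true;
}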
