Skip to content

Instantly share code, notes, and snippets.

@ypsu
Last active July 13, 2020 19:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ypsu/8e658a8721d3032dee44b9017681b0c6 to your computer and use it in GitHub Desktop.
Save ypsu/8e658a8721d3032dee44b9017681b0c6 to your computer and use it in GitHub Desktop.
// this is another mmap based topfew solution.
// context: https://www.tbray.org/ongoing/When/202x/2020/07/01/More-Topfew-Fun.
// other than the static hashmap, it doesn't use dynamic memory.
// rather than dynamically allocating strings for the hash keys,
// it just keeps pointers to the mmap'd data.
// it reparses the fields whenever it wants to compare two hash entries.
// compile like this: gcc -O2 topfew.c
// the following should be cmdline arguments
// but for the demo's simplicity let's just hardcode them:
// size of the hashtable in powers of two.
// the code doesn't support resizing, make sure the input fits.
enum { hashpower = 25 };
// input parameters and limits.
const char filename[] = "access_log";
char fieldspec[] = "1,3,5";
enum { topcnt = 10 };
const char separator = ' ';
#define _GNU_SOURCE
#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#ifdef NDEBUG
#error "don't define NDEBUG, we need the assert."
#endif
#define check assert
// the hashtable is a open addressed, linearly probed.
struct hashbucket {
// the full hash value of this entry to speed up comparisons.
uint64_t hash;
// offset into g.data where the first field starts.
// start from here to this to get the full key.
int64_t offset;
// number of times this key occured.
// if this counter is 0, then this bucket is empty.
int64_t cnt;
};
enum { hashmask = (1ll << hashpower) - 1 };
// all global data lives here.
struct {
// an array of 2^hashpower buckets.
struct hashbucket *buckets;
int64_t bucketsused;
// a bitmask of the fields we need.
// e.g. 1,3,5 -> 2^0 + 2^2 + 2^4 = 1 + 4 + 16 = 21.
int64_t fieldmask;
// same as fieldmask but with the trailing zeroes removed.
// use this mask when you start from g.data[hashbucket.offset].
int64_t fieldmask1;
// pointer to the input file's mmap'd data.
const char *data;
// size of the input file.
int64_t datasize;
} g;
int main(void) {
// initialize the hash table.
check(0 < hashpower && hashpower < 60);
g.buckets = calloc(1ll << hashpower, sizeof(g.buckets[0]));
check(g.buckets != NULL);
// parse fieldspec into g.fieldmask.
char *token = strtok(fieldspec, ",");
do {
int field = -1;
check(sscanf(token, "%d", &field) == 1);
check(0 < field && field < 63);
g.fieldmask |= 1 << (field - 1);
} while ((token = strtok(NULL, ",")) != NULL);
g.fieldmask1 = g.fieldmask;
while ((g.fieldmask1 & 1) == 0) g.fieldmask1 >>= 1;
// mmap the input file.
int fd = open(filename, O_RDONLY);
check(fd != -1 && "couldn't open input file");
g.datasize = lseek(fd, 0, SEEK_END);
check(g.datasize > 0 && "input must be a non-empty, seekable file");
g.data = mmap(NULL, g.datasize, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
check(g.data != NULL);
// now go through the data and put the keys into the hash table.
// the body of this main loop is basically a state machine.
// calculate the hash value as read through the data,
// don't create separate strings from the keys.
// the hash function is based on djb2 with a little bit altered constant:
// h = (h * 129) ^ c.
const uint64_t inith = 5381;
// the running hash value.
uint64_t h = inith;
// the current field g.data[i] is at.
int field = 0;
// the offset where the current line's key started.
int64_t offset = (g.fieldmask & 1) == 1 ? 0 : -1;
for (int64_t i = 0; i < g.datasize; i++) {
if (g.data[i] == separator) {
field++;
if (((1ll << field) & g.fieldmask) != 0 && offset == -1) offset = i + 1;
continue;
}
if (g.data[i] != '\n') {
if (field > 63) continue;
if (((1ll << field) & g.fieldmask) == 0) continue;
h = (h * 129) ^ (uint8_t)g.data[i];
continue;
}
// we reached the end of the line.
// we add or update the current key in the hash table.
check(g.data[i] == '\n');
// ignore the line if it didn't contain enough fields.
if ((g.fieldmask >> (field + 1)) != 0) {
h = inith;
field = 0;
offset = (g.fieldmask & 1) == 1 ? i + 1 : -1;
continue;
}
check(offset != -1);
uint64_t bucketidx = (h - 1) & hashmask;
// find the first bucket that is either empty or is the same key.
while (true) {
bucketidx = (bucketidx + 1) & hashmask;
if (g.buckets[bucketidx].cnt == 0) break;
if (g.buckets[bucketidx].hash != h) continue;
// now we need to do a full a key comparison.
// this involves reparsing the fields of the lines.
int64_t a = g.buckets[bucketidx].offset;
int64_t b = offset;
int f = 0;
bool equal = true;
while (equal && (g.fieldmask1 >> f) != 0) {
bool aspace = g.data[a] == separator || g.data[a] == '\n';
bool bspace = g.data[b] == separator || g.data[b] == '\n';
if (aspace || bspace) {
if (!aspace || !bspace) {
equal = false;
break;
}
a++;
b++;
f++;
if (((g.fieldmask1 >> f) & 1) == 0) {
// this is unimportant field.
// skip until the next separator.
while (g.data[a] != separator && g.data[a] != '\n') a++;
while (g.data[b] != separator && g.data[b] != '\n') b++;
}
continue;
}
equal = (g.data[a++] == g.data[b++]);
}
if (equal) break;
};
if (g.buckets[bucketidx].cnt == 0) {
if (g.bucketsused++ > hashmask / 2) {
fprintf(stderr, "table too small, increase the hashpower parameter!");
exit(1);
}
}
g.buckets[bucketidx].hash = h;
g.buckets[bucketidx].offset = offset;
g.buckets[bucketidx].cnt++;
h = inith;
field = 0;
offset = (g.fieldmask & 1) == 1 ? i + 1 : -1;
}
// go through the buckets and find the topcnt most frequent buckets.
struct {
int64_t cnt;
int64_t offset;
} top[topcnt] = {};
int lastidx = topcnt - 1;
for (int64_t i = 0; i <= hashmask; i++) {
int64_t cnt = g.buckets[i].cnt;
if (cnt <= top[lastidx].cnt) continue;
int insertpos = 0;
while (top[insertpos].cnt > cnt) insertpos++;
check(insertpos < topcnt);
if (insertpos + 1 < topcnt) {
int sz = (topcnt - insertpos - 1) * sizeof(top[0]);
memmove(top + insertpos + 1, top + insertpos, sz);
}
top[insertpos].cnt = cnt;
top[insertpos].offset = g.buckets[i].offset;
}
// and now just print the result.
for (int i = 0; i < topcnt; i++) {
if (top[i].cnt == 0) continue;
printf("%lld", (long long)top[i].cnt);
int64_t o = top[i].offset;
int f = 0;
bool hadspace = false;
for (o = top[i].offset; (g.fieldmask1 >> f) != 0; o++) {
if (g.data[o] == separator || g.data[o] == '\n') {
f++;
hadspace = false;
continue;
}
if (((g.fieldmask1 >> f) & 1) == 0) continue;
if (!hadspace) {
hadspace = true;
putchar(separator);
}
putchar(g.data[o]);
}
putchar('\n');
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment