Last active
July 13, 2020 19:41
-
-
Save ypsu/8e658a8721d3032dee44b9017681b0c6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// this is another mmap based topfew solution. | |
// context: https://www.tbray.org/ongoing/When/202x/2020/07/01/More-Topfew-Fun. | |
// other than the static hashmap, it doesn't use dynamic memory. | |
// rather than dynamically allocating strings for the hash keys, | |
// it just keeps pointers to the mmap'd data. | |
// it reparses the fields whenever it wants to compare two hash entries. | |
// compile like this: gcc -O2 topfew.c | |
// the following should be cmdline arguments | |
// but for the demo's simplicity let's just hardcode them: | |
// size of the hashtable in powers of two. | |
// the code doesn't support resizing, make sure the input fits. | |
enum { hashpower = 25 }; | |
// input parameters and limits. | |
const char filename[] = "access_log"; | |
char fieldspec[] = "1,3,5"; | |
enum { topcnt = 10 }; | |
const char separator = ' '; | |
#define _GNU_SOURCE | |
#include <assert.h> | |
#include <fcntl.h> | |
#include <stdbool.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <sys/mman.h> | |
#include <unistd.h> | |
#ifdef NDEBUG | |
#error "don't define NDEBUG, we need the assert." | |
#endif | |
#define check assert | |
// the hashtable is a open addressed, linearly probed. | |
struct hashbucket { | |
// the full hash value of this entry to speed up comparisons. | |
uint64_t hash; | |
// offset into g.data where the first field starts. | |
// start from here to this to get the full key. | |
int64_t offset; | |
// number of times this key occured. | |
// if this counter is 0, then this bucket is empty. | |
int64_t cnt; | |
}; | |
enum { hashmask = (1ll << hashpower) - 1 }; | |
// all global data lives here. | |
struct { | |
// an array of 2^hashpower buckets. | |
struct hashbucket *buckets; | |
int64_t bucketsused; | |
// a bitmask of the fields we need. | |
// e.g. 1,3,5 -> 2^0 + 2^2 + 2^4 = 1 + 4 + 16 = 21. | |
int64_t fieldmask; | |
// same as fieldmask but with the trailing zeroes removed. | |
// use this mask when you start from g.data[hashbucket.offset]. | |
int64_t fieldmask1; | |
// pointer to the input file's mmap'd data. | |
const char *data; | |
// size of the input file. | |
int64_t datasize; | |
} g; | |
int main(void) { | |
// initialize the hash table. | |
check(0 < hashpower && hashpower < 60); | |
g.buckets = calloc(1ll << hashpower, sizeof(g.buckets[0])); | |
check(g.buckets != NULL); | |
// parse fieldspec into g.fieldmask. | |
char *token = strtok(fieldspec, ","); | |
do { | |
int field = -1; | |
check(sscanf(token, "%d", &field) == 1); | |
check(0 < field && field < 63); | |
g.fieldmask |= 1 << (field - 1); | |
} while ((token = strtok(NULL, ",")) != NULL); | |
g.fieldmask1 = g.fieldmask; | |
while ((g.fieldmask1 & 1) == 0) g.fieldmask1 >>= 1; | |
// mmap the input file. | |
int fd = open(filename, O_RDONLY); | |
check(fd != -1 && "couldn't open input file"); | |
g.datasize = lseek(fd, 0, SEEK_END); | |
check(g.datasize > 0 && "input must be a non-empty, seekable file"); | |
g.data = mmap(NULL, g.datasize, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0); | |
check(g.data != NULL); | |
// now go through the data and put the keys into the hash table. | |
// the body of this main loop is basically a state machine. | |
// calculate the hash value as read through the data, | |
// don't create separate strings from the keys. | |
// the hash function is based on djb2 with a little bit altered constant: | |
// h = (h * 129) ^ c. | |
const uint64_t inith = 5381; | |
// the running hash value. | |
uint64_t h = inith; | |
// the current field g.data[i] is at. | |
int field = 0; | |
// the offset where the current line's key started. | |
int64_t offset = (g.fieldmask & 1) == 1 ? 0 : -1; | |
for (int64_t i = 0; i < g.datasize; i++) { | |
if (g.data[i] == separator) { | |
field++; | |
if (((1ll << field) & g.fieldmask) != 0 && offset == -1) offset = i + 1; | |
continue; | |
} | |
if (g.data[i] != '\n') { | |
if (field > 63) continue; | |
if (((1ll << field) & g.fieldmask) == 0) continue; | |
h = (h * 129) ^ (uint8_t)g.data[i]; | |
continue; | |
} | |
// we reached the end of the line. | |
// we add or update the current key in the hash table. | |
check(g.data[i] == '\n'); | |
// ignore the line if it didn't contain enough fields. | |
if ((g.fieldmask >> (field + 1)) != 0) { | |
h = inith; | |
field = 0; | |
offset = (g.fieldmask & 1) == 1 ? i + 1 : -1; | |
continue; | |
} | |
check(offset != -1); | |
uint64_t bucketidx = (h - 1) & hashmask; | |
// find the first bucket that is either empty or is the same key. | |
while (true) { | |
bucketidx = (bucketidx + 1) & hashmask; | |
if (g.buckets[bucketidx].cnt == 0) break; | |
if (g.buckets[bucketidx].hash != h) continue; | |
// now we need to do a full a key comparison. | |
// this involves reparsing the fields of the lines. | |
int64_t a = g.buckets[bucketidx].offset; | |
int64_t b = offset; | |
int f = 0; | |
bool equal = true; | |
while (equal && (g.fieldmask1 >> f) != 0) { | |
bool aspace = g.data[a] == separator || g.data[a] == '\n'; | |
bool bspace = g.data[b] == separator || g.data[b] == '\n'; | |
if (aspace || bspace) { | |
if (!aspace || !bspace) { | |
equal = false; | |
break; | |
} | |
a++; | |
b++; | |
f++; | |
if (((g.fieldmask1 >> f) & 1) == 0) { | |
// this is unimportant field. | |
// skip until the next separator. | |
while (g.data[a] != separator && g.data[a] != '\n') a++; | |
while (g.data[b] != separator && g.data[b] != '\n') b++; | |
} | |
continue; | |
} | |
equal = (g.data[a++] == g.data[b++]); | |
} | |
if (equal) break; | |
}; | |
if (g.buckets[bucketidx].cnt == 0) { | |
if (g.bucketsused++ > hashmask / 2) { | |
fprintf(stderr, "table too small, increase the hashpower parameter!"); | |
exit(1); | |
} | |
} | |
g.buckets[bucketidx].hash = h; | |
g.buckets[bucketidx].offset = offset; | |
g.buckets[bucketidx].cnt++; | |
h = inith; | |
field = 0; | |
offset = (g.fieldmask & 1) == 1 ? i + 1 : -1; | |
} | |
// go through the buckets and find the topcnt most frequent buckets. | |
struct { | |
int64_t cnt; | |
int64_t offset; | |
} top[topcnt] = {}; | |
int lastidx = topcnt - 1; | |
for (int64_t i = 0; i <= hashmask; i++) { | |
int64_t cnt = g.buckets[i].cnt; | |
if (cnt <= top[lastidx].cnt) continue; | |
int insertpos = 0; | |
while (top[insertpos].cnt > cnt) insertpos++; | |
check(insertpos < topcnt); | |
if (insertpos + 1 < topcnt) { | |
int sz = (topcnt - insertpos - 1) * sizeof(top[0]); | |
memmove(top + insertpos + 1, top + insertpos, sz); | |
} | |
top[insertpos].cnt = cnt; | |
top[insertpos].offset = g.buckets[i].offset; | |
} | |
// and now just print the result. | |
for (int i = 0; i < topcnt; i++) { | |
if (top[i].cnt == 0) continue; | |
printf("%lld", (long long)top[i].cnt); | |
int64_t o = top[i].offset; | |
int f = 0; | |
bool hadspace = false; | |
for (o = top[i].offset; (g.fieldmask1 >> f) != 0; o++) { | |
if (g.data[o] == separator || g.data[o] == '\n') { | |
f++; | |
hadspace = false; | |
continue; | |
} | |
if (((g.fieldmask1 >> f) & 1) == 0) continue; | |
if (!hadspace) { | |
hadspace = true; | |
putchar(separator); | |
} | |
putchar(g.data[o]); | |
} | |
putchar('\n'); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment