activity stream @vocab extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Find the @vocab element in an activity stream
// Ref: https://mccue.dev/pages/12-3-22-practical-advent
#include <string.h> | |
// Read cursor over a length-delimited byte buffer (no NUL terminator is
// needed: reads are bounded by len).
struct buf {
    char *buf;  // start of the input bytes
    int len;    // number of valid bytes in buf
    int off;    // index of the next byte to read, 0 <= off <= len
};
// One lexical token.
//   TOK_EOF   end of input
//   TOK_OBJ   '{'
//   TOK_ARR   '['
//   TOK_END   '}' or ']'
//   TOK_STR   string
//   TOK_ATOM  number, true, false or null
// beg/end are meaningful for TOK_STR and TOK_ATOM only: [beg, end) is the
// byte range of the token text in the buffer (for strings, the bytes between
// the quotes).
struct tok {
    enum {TOK_EOF, TOK_OBJ, TOK_ARR, TOK_END, TOK_STR, TOK_ATOM} type;
    int beg, end;
};
// Return the next byte, or -1 on EOF. | |
static int get(struct buf *b) | |
{ | |
return b->off<b->len ? b->buf[b->off++]&255 : -1; | |
} | |
static struct tok next(struct buf *b) | |
{ | |
struct tok tok = {TOK_EOF, 0, 0}; | |
for (;;) { | |
int c = get(b); | |
if (c < 0) { | |
return tok; | |
} else if (c > ' ') { | |
switch (c) { | |
case ',': | |
case ':': continue; | |
case '{': tok.type = TOK_OBJ; | |
return tok; | |
case '[': tok.type = TOK_ARR; | |
return tok; | |
case ']': | |
case '}': tok.type = TOK_END; | |
return tok; | |
case '"': tok.type = TOK_STR; | |
tok.beg = b->off; | |
for (int skip = 0;;) { | |
c = get(b); | |
switch (c) { | |
case -1: tok.type = TOK_EOF; | |
return tok; | |
case '\\': skip = 1; | |
break; | |
case '"' : if (skip) { | |
default : skip = 0; | |
break; | |
} | |
tok.end = b->off - 1; | |
return tok; // TODO: decode in place | |
} | |
} | |
default : tok.type = TOK_ATOM; // number, bool, null | |
tok.beg = b->off - 1; | |
for (;;) { | |
switch (get(b)) { | |
default : b->off--; // unget | |
// fallthrough | |
case -1: tok.end = b->off; | |
return tok; | |
case '+': case '-': case '.': case '0': case '1': | |
case '2': case '3': case '4': case '5': case '6': | |
case '7': case '8': case '9': case 'a': case 'e': | |
case 'f': case 'l': case 'n': case 'r': case 's': | |
case 't': case 'u': break; | |
} | |
} | |
} | |
} | |
} | |
} | |
// Iterate through the current object for a specific key. | |
static int findkey(struct buf *b, char *key, int len) | |
{ | |
for (int depth = 0;;) { | |
struct tok tok = next(b); | |
if (depth) { | |
// Skip tokens until nesting ends | |
switch (tok.type) { | |
case TOK_EOF : return 0; | |
case TOK_OBJ : | |
case TOK_ARR : depth++; | |
break; | |
case TOK_END : depth--; | |
break; | |
case TOK_ATOM: | |
case TOK_STR : break; | |
} | |
continue; | |
} | |
if (tok.type != TOK_STR) { | |
return 0; | |
} | |
int n = tok.end - tok.beg; | |
if (!depth && n==len && !memcmp(b->buf+tok.beg, key, len)) { | |
return 1; | |
} | |
tok = next(b); | |
switch (tok.type) { | |
case TOK_EOF : return 0; | |
case TOK_OBJ : | |
case TOK_ARR : depth++; | |
break; | |
case TOK_END : return 0; | |
case TOK_ATOM: | |
case TOK_STR : break; | |
} | |
} | |
} | |
// Find the @vocab in the JSON buffer, setting the length and returning | |
// the string's address. | |
static char *vocab(char *buf, int *len) | |
{ | |
struct buf b[1] = {{buf, *len, 0}}; | |
*len = 0; | |
struct tok tok = next(b); | |
if (tok.type != TOK_OBJ) { | |
return 0; | |
} | |
if (!findkey(b, "@context", 8)) { | |
return 0; | |
} | |
tok = next(b); | |
switch (tok.type) { | |
case TOK_OBJ: if (findkey(b, "@vocab", 6)) { | |
case TOK_ARR: tok = next(b); | |
case TOK_STR: break; | |
} // fallthrough | |
default : return 0; | |
} | |
if (tok.type != TOK_STR) { | |
return 0; | |
} | |
*len = tok.end - tok.beg; | |
return buf + tok.beg; | |
} | |
#ifndef DEBUG | |
// Test
// $ cc -o vocab vocab.c
// $ ./vocab <input.json
#include <stdio.h> | |
// Read JSON from stdin, print its @vocab string followed by a newline.
//
// Only the first sizeof(buf) bytes of input are examined; anything beyond
// that is ignored. Exit status is 1 when no @vocab string is found or when a
// stream error was recorded on stdin or stdout, and 0 otherwise.
int main(void)
{
    char buf[1<<12];
    // fread returns at most sizeof(buf), so the value always fits in an int.
    int len = (int)fread(buf, 1, sizeof(buf), stdin);

    char *v = vocab(buf, &len);
    if (!v) {
        return 1;
    }

    fwrite(v, len, 1, stdout);
    putchar('\n');
    fflush(stdout);  // flush first so write errors are visible to ferror
    return ferror(stdout) || ferror(stdin);
}
#else // defined(DEBUG) | |
#include <stdio.h> | |
// Useful for debugging the lexer | |
int main(void) | |
{ | |
char buf[1<<12]; | |
int len = fread(buf, 1, sizeof(buf), stdin); | |
struct buf b[1] = {{buf, len, 0}}; | |
for (;;) { | |
struct tok tok = next(b); | |
switch (tok.type) { | |
case TOK_EOF : puts("EOF"); return 0; | |
case TOK_OBJ : puts("OBJ"); break; | |
case TOK_ARR : puts("ARR"); break; | |
case TOK_END : puts("END"); break; | |
case TOK_STR : printf("STR\t\"%.*s\"\n", tok.end-tok.beg, buf+tok.beg); | |
break; | |
case TOK_ATOM: printf("ATOM\t%.*s\n", tok.end-tok.beg, buf+tok.beg); | |
} | |
} | |
} | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment