Skip to content

Instantly share code, notes, and snippets.

@skeeto
Last active December 4, 2022 19:00
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save skeeto/1fc12a2c973317df24daf842cd86cc33 to your computer and use it in GitHub Desktop.
activity stream @vocab extractor
// Find the @vocab element in an activity stream
// Ref: https://mccue.dev/pages/12-3-22-practical-advent
#include <string.h>
// Input bytes together with a read position.
struct buf {
    char *buf;  // first byte of the input
    int len;    // number of bytes available at buf
    int off;    // index of the next byte to be read; reading stops at len
};
// One token produced by the lexer.
struct tok {
    enum {
        TOK_EOF,   // input exhausted (also reported for an unterminated string)
        TOK_OBJ,   // '{'
        TOK_ARR,   // '['
        TOK_END,   // '}' or ']'
        TOK_STR,   // quoted string
        TOK_ATOM   // number, bool, null
    } type;
    // Byte offsets into the input. For TOK_STR they bound the bytes between
    // the quotes; for TOK_ATOM they bound the atom itself. In both cases
    // end is one past the last byte. Structural tokens leave them at 0.
    int beg;
    int end;
};
// Consume one byte from the buffer and return it as a value in 0..255.
// Once the read offset has reached len, return -1 and consume nothing.
static int get(struct buf *b)
{
    if (b->off >= b->len) {
        return -1;
    }
    int byte = b->buf[b->off] & 255;  // mask so a negative char can't look like EOF
    b->off++;
    return byte;
}
// Lex and return the next token, advancing b->off past it.
//
// Bytes at or below ' ' (whitespace and control characters) and the ','
// and ':' separators are skipped. The returned type is:
//   TOK_OBJ / TOK_ARR  for '{' / '['
//   TOK_END            for '}' or ']' (the two closers are not told apart)
//   TOK_STR            for a quoted string; beg is the offset of the first
//                      byte after the opening quote and end the offset of
//                      the closing quote. Escapes are left undecoded.
//   TOK_ATOM           for any other run (number, true, false, null);
//                      beg is its first byte, end one past its last
//   TOK_EOF            when the input is exhausted, including when it ends
//                      inside a string
static struct tok next(struct buf *b)
{
    struct tok tok = {TOK_EOF, 0, 0};
    for (;;) {
        int c = get(b);
        if (c < 0) {
            return tok;
        }
        if (c <= ' ') {
            continue;  // whitespace / control byte
        }

        switch (c) {
        case ',':
        case ':':
            continue;  // separators are not reported as tokens

        case '{':
            tok.type = TOK_OBJ;
            return tok;

        case '[':
            tok.type = TOK_ARR;
            return tok;

        case ']':
        case '}':
            tok.type = TOK_END;
            return tok;

        case '"':
            tok.type = TOK_STR;
            tok.beg = b->off;
            for (int skip = 0;;) {
                c = get(b);
                if (c < 0) {
                    tok.type = TOK_EOF;  // unterminated string
                    return tok;
                }
                if (skip) {
                    // This byte is the target of a preceding backslash.
                    // That includes a second backslash: it is consumed as
                    // data and must NOT start a new escape, otherwise the
                    // closing quote of "...\\" would be swallowed.
                    skip = 0;
                } else if (c == '\\') {
                    skip = 1;
                } else if (c == '"') {
                    tok.end = b->off - 1;
                    return tok;  // TODO: decode in place
                }
            }

        default:
            tok.type = TOK_ATOM;  // number, bool, null
            tok.beg = b->off - 1;
            for (;;) {
                switch (get(b)) {
                // Bytes that may continue a number or true/false/null.
                // 'E' is included because JSON allows an upper-case
                // exponent marker (e.g. 1E5).
                case '+': case '-': case '.': case '0': case '1':
                case '2': case '3': case '4': case '5': case '6':
                case '7': case '8': case '9': case 'E': case 'a':
                case 'e': case 'f': case 'l': case 'n': case 'r':
                case 's': case 't': case 'u':
                    break;
                case -1:
                    tok.end = b->off;
                    return tok;
                default:
                    b->off--;  // unget the byte that ended the atom
                    tok.end = b->off;
                    return tok;
                }
            }
        }
    }
}
// Iterate through the current object for a specific key.
static int findkey(struct buf *b, char *key, int len)
{
for (int depth = 0;;) {
struct tok tok = next(b);
if (depth) {
// Skip tokens until nesting ends
switch (tok.type) {
case TOK_EOF : return 0;
case TOK_OBJ :
case TOK_ARR : depth++;
break;
case TOK_END : depth--;
break;
case TOK_ATOM:
case TOK_STR : break;
}
continue;
}
if (tok.type != TOK_STR) {
return 0;
}
int n = tok.end - tok.beg;
if (!depth && n==len && !memcmp(b->buf+tok.beg, key, len)) {
return 1;
}
tok = next(b);
switch (tok.type) {
case TOK_EOF : return 0;
case TOK_OBJ :
case TOK_ARR : depth++;
break;
case TOK_END : return 0;
case TOK_ATOM:
case TOK_STR : break;
}
}
}
// Find the @vocab in the JSON buffer, setting the length and returning
// the string's address.
static char *vocab(char *buf, int *len)
{
struct buf b[1] = {{buf, *len, 0}};
*len = 0;
struct tok tok = next(b);
if (tok.type != TOK_OBJ) {
return 0;
}
if (!findkey(b, "@context", 8)) {
return 0;
}
tok = next(b);
switch (tok.type) {
case TOK_OBJ: if (findkey(b, "@vocab", 6)) {
case TOK_ARR: tok = next(b);
case TOK_STR: break;
} // fallthrough
default : return 0;
}
if (tok.type != TOK_STR) {
return 0;
}
*len = tok.end - tok.beg;
return buf + tok.beg;
}
#ifndef DEBUG
// Test
// $ cc -o vocab vocab.c
// $ ./vocab <input.json
#include <stdio.h>
// Read up to 4096 bytes of JSON from stdin and print the vocabulary string
// followed by a newline. Exit status is 1 when no vocabulary string is
// found; otherwise it is nonzero only if a stream error was recorded.
int main(void)
{
    char input[1<<12];
    int n = fread(input, 1, sizeof(input), stdin);

    char *found = vocab(input, &n);
    if (found == 0) {
        return 1;
    }

    fwrite(found, n, 1, stdout);
    putchar('\n');
    fflush(stdout);

    int failed = ferror(stdout) || ferror(stdin);
    return failed;
}
#else // defined(DEBUG)
#include <stdio.h>
// Useful for debugging the lexer
int main(void)
{
char buf[1<<12];
int len = fread(buf, 1, sizeof(buf), stdin);
struct buf b[1] = {{buf, len, 0}};
for (;;) {
struct tok tok = next(b);
switch (tok.type) {
case TOK_EOF : puts("EOF"); return 0;
case TOK_OBJ : puts("OBJ"); break;
case TOK_ARR : puts("ARR"); break;
case TOK_END : puts("END"); break;
case TOK_STR : printf("STR\t\"%.*s\"\n", tok.end-tok.beg, buf+tok.beg);
break;
case TOK_ATOM: printf("ATOM\t%.*s\n", tok.end-tok.beg, buf+tok.beg);
}
}
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment