skeeto/vocab.c

## vocab.c
// Find the @vocab element in an activity stream
// Ref: https://mccue.dev/pages/12-3-22-practical-advent
#include <string.h>

// Input cursor: a byte buffer together with a read position.
struct buf {
    char *buf;  // input bytes; reads are bounded by len, not by a terminator
    int len;    // number of valid bytes in buf
    int off;    // index of the next byte to be read
};

// One lexed token. For TOK_STR and TOK_ATOM, [beg, end) are byte offsets
// into the input buffer; for the other token types both are left at 0.
struct tok {
    enum {
        TOK_EOF,   // end of input (also reported for an unterminated string)
        TOK_OBJ,   // '{'
        TOK_ARR,   // '['
        TOK_END,   // ']' or '}'
        TOK_STR,   // quoted string, offsets exclude the quotes
        TOK_ATOM   // number, bool, null
    } type;
    int beg;
    int end;
};

// Consume one byte from b and return it as a value in 0..255.
// Returns -1, without advancing, once the read position reaches b->len.
static int get(struct buf *b)
{
    if (b->off >= b->len) {
        return -1;
    }
    int byte = b->buf[b->off] & 0xFF;  // mask so a signed char never yields a negative
    b->off++;
    return byte;
}

// Lex one token from b, advancing b->off past it.
//
// Bytes <= ' ' (whitespace and control bytes) and the ',' and ':'
// separators are skipped, so callers only ever see brackets and values.
// For TOK_STR, [beg, end) spans the raw bytes between the quotes; escape
// sequences are left undecoded. For TOK_ATOM, [beg, end) spans the literal.
// Returns TOK_EOF at end of input and also when a string is unterminated.
static struct tok next(struct buf *b)
{
    struct tok tok = {TOK_EOF, 0, 0};
    for (;;) {
        int c = get(b);
        if (c < 0) {
            return tok;
        }
        if (c <= ' ') {
            continue;
        }

        switch (c) {
        case ',':
        case ':':
            continue;  // separators carry no information for this lexer
        case '{':
            tok.type = TOK_OBJ;
            return tok;
        case '[':
            tok.type = TOK_ARR;
            return tok;
        case ']':
        case '}':
            tok.type = TOK_END;
            return tok;
        case '"':
            tok.beg = b->off;
            for (int escaped = 0;;) {
                c = get(b);
                if (c < 0) {
                    return tok;  // unterminated string: type is still TOK_EOF
                }
                if (escaped) {
                    // The byte after a backslash is consumed verbatim. This
                    // includes a second backslash, which must NOT escape
                    // whatever follows it (e.g. the closing quote in "a\\").
                    escaped = 0;
                } else if (c == '\\') {
                    escaped = 1;
                } else if (c == '"') {
                    tok.type = TOK_STR;
                    tok.end = b->off - 1;  // offset of the closing quote
                    return tok;  // TODO: decode in place
                }
            }
        default:
            tok.type = TOK_ATOM;  // number, bool, null
            tok.beg = b->off - 1;  // first byte was already consumed
            for (;;) {
                switch (get(b)) {
                case '+': case '-': case '.': case '0': case '1':
                case '2': case '3': case '4': case '5': case '6':
                case '7': case '8': case '9': case 'a': case 'e':
                case 'f': case 'l': case 'n': case 'r': case 's':
                case 't': case 'u':
                    break;  // still inside the atom; keep scanning
                default:
                    b->off--;  // unget the byte that ended the atom
                    // fallthrough
                case -1:
                    tok.end = b->off;
                    return tok;
                }
            }
        }
    }
}

// Scan the members of the current object for a specific key.
//
// Expects b to be positioned just after the object's opening '{'. Keys are
// compared byte-for-byte against the raw (undecoded) string text. Values
// that are objects or arrays are skipped in their entirety.
//
// key: bytes of the wanted key (need not be NUL-terminated)
// len: number of bytes in key
//
// Returns 1 with b positioned just after the matching key, so the following
// token is that key's value. Returns 0 when the object ends, the input runs
// out, or a non-string token appears where a key is expected.
static int findkey(struct buf *b, const char *key, int len)
{
    int depth = 0;
    for (;;) {
        struct tok tok = next(b);

        if (depth > 0) {
            // Inside a nested value: skip tokens until nesting ends
            switch (tok.type) {
            case TOK_EOF:
                return 0;
            case TOK_OBJ:
            case TOK_ARR:
                depth++;
                break;
            case TOK_END:
                depth--;
                break;
            default:
                break;
            }
            continue;
        }

        // At the object's own level a key must come next
        if (tok.type != TOK_STR) {
            return 0;
        }
        if (tok.end - tok.beg == len && !memcmp(b->buf + tok.beg, key, len)) {
            return 1;
        }

        // Not the wanted key: step over its value
        tok = next(b);
        switch (tok.type) {
        case TOK_OBJ:
        case TOK_ARR:
            depth++;
            break;
        case TOK_ATOM:
        case TOK_STR:
            break;
        default:  // TOK_EOF or TOK_END: no value where one was expected
            return 0;
        }
    }
}

// Find the @vocab in the JSON buffer, setting the length and returning
// the string's address.
static char *vocab(char *buf, int *len)
{
    struct buf b[1] = {{buf, *len, 0}};
    *len = 0;

    struct tok tok = next(b);
    if (tok.type != TOK_OBJ) {
        return 0;
    }

    if (!findkey(b, "@context", 8)) {
        return 0;
    }

    tok = next(b);
    switch (tok.type) {
    case TOK_OBJ: if (findkey(b, "@vocab", 6)) {
    case TOK_ARR:     tok = next(b);
    case TOK_STR:     break;
                  } // fallthrough
    default     : return 0;
    }

    if (tok.type != TOK_STR) {
        return 0;
    }
    *len = tok.end - tok.beg;
    return buf + tok.beg;
}


#ifndef DEBUG
// Test
//   $ cc -o vocab vocab.c
//   $ ./vocab <input.json
#include <stdio.h>

// Read up to 4096 bytes of JSON from stdin and print the vocabulary string
// followed by a newline. Exits with status 1 if no vocabulary is found or
// if either stream reports an error, 0 otherwise.
int main(void)
{
    char input[1<<12];
    int n = fread(input, 1, sizeof(input), stdin);

    char *found = vocab(input, &n);
    if (found == 0) {
        return 1;
    }

    fwrite(found, n, 1, stdout);
    putchar('\n');
    fflush(stdout);  // flush first so a write failure shows up in ferror()

    if (ferror(stdout)) {
        return 1;
    }
    return ferror(stdin) != 0;
}

#else // defined(DEBUG)
#include <stdio.h>
// Useful for debugging the lexer

int main(void)
{
    char buf[1<<12];
    int len = fread(buf, 1, sizeof(buf), stdin);
    struct buf b[1] = {{buf, len, 0}};
    for (;;) {
        struct tok tok = next(b);
        switch (tok.type) {
        case TOK_EOF : puts("EOF"); return 0;
        case TOK_OBJ : puts("OBJ"); break;
        case TOK_ARR : puts("ARR"); break;
        case TOK_END : puts("END"); break;
        case TOK_STR : printf("STR\t\"%.*s\"\n", tok.end-tok.beg, buf+tok.beg);
                       break;
        case TOK_ATOM: printf("ATOM\t%.*s\n", tok.end-tok.beg, buf+tok.beg);
        }
    }
}
#endif
	// Find the @vocab element in an activity stream
	// Ref: https://mccue.dev/pages/12-3-22-practical-advent
	#include <string.h>

	// Input cursor: a byte buffer together with a read position.
	struct buf {
	    char *buf;  // input bytes; reads are bounded by len, not by a terminator
	    int len;    // number of valid bytes in buf
	    int off;    // index of the next byte to be read
	};

	// One lexed token. For TOK_STR and TOK_ATOM, [beg, end) are byte offsets
	// into the input buffer; for the other token types both are left at 0.
	struct tok {
	    enum {
	        TOK_EOF,   // end of input (also reported for an unterminated string)
	        TOK_OBJ,   // '{'
	        TOK_ARR,   // '['
	        TOK_END,   // ']' or '}'
	        TOK_STR,   // quoted string, offsets exclude the quotes
	        TOK_ATOM   // number, bool, null
	    } type;
	    int beg;
	    int end;
	};

	// Consume one byte from b and return it as a value in 0..255.
	// Returns -1, without advancing, once the read position reaches b->len.
	static int get(struct buf *b)
	{
	    if (b->off >= b->len) {
	        return -1;
	    }
	    int byte = b->buf[b->off] & 0xFF;  // mask so a signed char never yields a negative
	    b->off++;
	    return byte;
	}

	// Lex one token from b, advancing b->off past it.
	//
	// Bytes <= ' ' (whitespace and control bytes) and the ',' and ':'
	// separators are skipped, so callers only ever see brackets and values.
	// For TOK_STR, [beg, end) spans the raw bytes between the quotes; escape
	// sequences are left undecoded. For TOK_ATOM, [beg, end) spans the literal.
	// Returns TOK_EOF at end of input and also when a string is unterminated.
	static struct tok next(struct buf *b)
	{
	    struct tok tok = {TOK_EOF, 0, 0};
	    for (;;) {
	        int c = get(b);
	        if (c < 0) {
	            return tok;
	        }
	        if (c <= ' ') {
	            continue;
	        }

	        switch (c) {
	        case ',':
	        case ':':
	            continue;  // separators carry no information for this lexer
	        case '{':
	            tok.type = TOK_OBJ;
	            return tok;
	        case '[':
	            tok.type = TOK_ARR;
	            return tok;
	        case ']':
	        case '}':
	            tok.type = TOK_END;
	            return tok;
	        case '"':
	            tok.beg = b->off;
	            for (int escaped = 0;;) {
	                c = get(b);
	                if (c < 0) {
	                    return tok;  // unterminated string: type is still TOK_EOF
	                }
	                if (escaped) {
	                    // The byte after a backslash is consumed verbatim. This
	                    // includes a second backslash, which must NOT escape
	                    // whatever follows it (e.g. the closing quote in "a\\").
	                    escaped = 0;
	                } else if (c == '\\') {
	                    escaped = 1;
	                } else if (c == '"') {
	                    tok.type = TOK_STR;
	                    tok.end = b->off - 1;  // offset of the closing quote
	                    return tok;  // TODO: decode in place
	                }
	            }
	        default:
	            tok.type = TOK_ATOM;  // number, bool, null
	            tok.beg = b->off - 1;  // first byte was already consumed
	            for (;;) {
	                switch (get(b)) {
	                case '+': case '-': case '.': case '0': case '1':
	                case '2': case '3': case '4': case '5': case '6':
	                case '7': case '8': case '9': case 'a': case 'e':
	                case 'f': case 'l': case 'n': case 'r': case 's':
	                case 't': case 'u':
	                    break;  // still inside the atom; keep scanning
	                default:
	                    b->off--;  // unget the byte that ended the atom
	                    // fallthrough
	                case -1:
	                    tok.end = b->off;
	                    return tok;
	                }
	            }
	        }
	    }
	}

	// Iterate through the current object for a specific key.
	static int findkey(struct buf b, char key, int len)
	{
	for (int depth = 0;;) {
	struct tok tok = next(b);

	if (depth) {
	// Skip tokens until nesting ends
	switch (tok.type) {
	case TOK_EOF : return 0;
	case TOK_OBJ :
	case TOK_ARR : depth++;
	break;
	case TOK_END : depth--;
	break;
	case TOK_ATOM:
	case TOK_STR : break;
	}
	continue;
	}

	if (tok.type != TOK_STR) {
	return 0;
	}
	int n = tok.end - tok.beg;
	if (!depth && n==len && !memcmp(b->buf+tok.beg, key, len)) {
	return 1;
	}

	tok = next(b);
	switch (tok.type) {
	case TOK_EOF : return 0;
	case TOK_OBJ :
	case TOK_ARR : depth++;
	break;
	case TOK_END : return 0;
	case TOK_ATOM:
	case TOK_STR : break;
	}
	}
	}

	// Find the @vocab in the JSON buffer, setting the length and returning
	// the string's address.
	static char vocab(char buf, int *len)
	{
	struct buf b[1] = {{buf, *len, 0}};
	*len = 0;

	struct tok tok = next(b);
	if (tok.type != TOK_OBJ) {
	return 0;
	}

	if (!findkey(b, "@context", 8)) {
	return 0;
	}

	tok = next(b);
	switch (tok.type) {
	case TOK_OBJ: if (findkey(b, "@vocab", 6)) {
	case TOK_ARR: tok = next(b);
	case TOK_STR: break;
	} // fallthrough
	default : return 0;
	}

	if (tok.type != TOK_STR) {
	return 0;
	}
	*len = tok.end - tok.beg;
	return buf + tok.beg;
	}


	#ifndef DEBUG
	// Test
	// $ cc -o vocab vocab.c
	// $ ./vocab <input.json
	#include <stdio.h>

	// Read up to 4096 bytes of JSON from stdin and print the vocabulary string
	// followed by a newline. Exits with status 1 if no vocabulary is found or
	// if either stream reports an error, 0 otherwise.
	int main(void)
	{
	    char buf[1<<12];
	    int len = fread(buf, 1, sizeof(buf), stdin);
	    char *v = vocab(buf, &len);
	    if (!v) {
	        return 1;
	    }
	    fwrite(v, len, 1, stdout);
	    putchar('\n');
	    fflush(stdout);  // flush first so a write failure shows up in ferror()
	    return ferror(stdout) || ferror(stdin);
	}

	#else // defined(DEBUG)
	#include <stdio.h>
	// Useful for debugging the lexer

	int main(void)
	{
	char buf[1<<12];
	int len = fread(buf, 1, sizeof(buf), stdin);
	struct buf b[1] = {{buf, len, 0}};
	for (;;) {
	struct tok tok = next(b);
	switch (tok.type) {
	case TOK_EOF : puts("EOF"); return 0;
	case TOK_OBJ : puts("OBJ"); break;
	case TOK_ARR : puts("ARR"); break;
	case TOK_END : puts("END"); break;
	case TOK_STR : printf("STR\t\"%.*s\"\n", tok.end-tok.beg, buf+tok.beg);
	break;
	case TOK_ATOM: printf("ATOM\t%.*s\n", tok.end-tok.beg, buf+tok.beg);
	}
	}
	}
	#endif