Skip to content

Instantly share code, notes, and snippets.

@lelanthran
Last active January 12, 2024 15:22
Show Gist options
  • Save lelanthran/896a2d1e228d345ecea66a5b279aea24 to your computer and use it in GitHub Desktop.
Save lelanthran/896a2d1e228d345ecea66a5b279aea24 to your computer and use it in GitHub Desktop.
A simplistic tool to search HTML files.
/* ********************************************************
* Copyright ©2024 Rundata Systems. All rights reserved.
* This project is licensed under the GPLv3 License. You
* can find a copy of this license at:
* https://www.gnu.org/licenses/gpl-3.0.en.html
*/
#warning TODO: Split this into multiple files
#warning INCOMPLETE: Implement searching using compiled query
/* ********************************************************
* I call this program from shell scripts that scrape web-pages.
*
* It pairs quite nicely with curl: fetch a URL with curl and
* search it with htmlq.
*/
/* ********************************************************
* Compiled with:
* gcc -W -Wall -Wextra -g htmlq.c -o htmlq
*
* The easiest way to execute the compile command above is by
* copying and pasting it into the command-line.
*/
// Standard headers
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
/* Print a "file:line in func(): " prefix followed by a printf-style message
 * to the stream `f`.
 *
 * The expansion is a single `do { } while (0)` statement WITHOUT a trailing
 * semicolon, so the caller's own `;` completes it and the macro can be used
 * safely as the body of an unbraced if/else. Note that `f` is evaluated
 * twice. */
#define FPRINTF(f,...) do {\
   fprintf (f, "%s:%i in %s(): ", __FILE__, __LINE__, __func__);\
   fprintf (f, __VA_ARGS__);\
} while (0)
#define VERSION "0.0.1"
/* ********************************************************
* util.c module
*/
/* Append printf-style formatted text to the heap-allocated string `*dst`.
 *
 * dst:   address of a malloc'd string (or of a NULL pointer, which is treated
 *        as an empty string). On success `*dst` is replaced by the grown
 *        buffer; on failure `*dst` is left untouched and still valid.
 * fmts:  printf-style format string, followed by its arguments.
 *
 * Returns true on success, false on a formatting or allocation failure.
 */
static bool tprintf (char **dst, const char *fmts, ...)
{
   if (!dst || !fmts)
      return false;

   va_list ap, ap_copy;
   va_start (ap, fmts);

   // First pass only measures; it consumes the copy, not `ap` itself.
   va_copy (ap_copy, ap);
   int newlen = vsnprintf (NULL, 0, fmts, ap_copy);
   va_end (ap_copy);

   bool ok = false;
   if (newlen < 0) {
      // A negative result signals an encoding/format error.
      FPRINTF (stderr, "Failed to format string\n");
   } else {
      size_t curlen = *dst ? strlen (*dst) : 0;
      char *tmp = realloc (*dst, curlen + (size_t)newlen + 1);
      if (!tmp) {
         FPRINTF (stderr, "OOM error reallocating formatted string\n");
      } else {
         *dst = tmp;
         // Bounded write: exactly the measured length plus the terminator.
         vsnprintf (&tmp[curlen], (size_t)newlen + 1, fmts, ap);
         ok = true;
      }
   }

   // Every va_start must be matched by va_end on all paths.
   va_end (ap);
   return ok;
}
/* NULL-tolerant string duplication.
 *
 * Returns a freshly malloc'd copy of `src` which the caller must free, or
 * NULL when `src` is NULL or the allocation fails (the latter is logged).
 */
static char *sstrdup (const char *src)
{
   if (src == NULL) {
      return NULL;
   }

   const size_t size = strlen (src) + 1;   // includes the terminator
   char *copy = malloc (size);
   if (copy == NULL) {
      FPRINTF (stderr, "OOM error allocating new string from [%s]\n", src);
      return NULL;
   }

   memcpy (copy, src, size);
   return copy;
}
/* Advance `*lhs` and `*rhs` in lock-step over their common prefix.
 *
 * On return both pointers have been moved past every leading character the
 * two strings share.
 *
 * Returns 0 when either argument (or the string it points at) is NULL, or
 * when the strings turned out to be identical; otherwise returns the number
 * of matching characters that were skipped before the first difference.
 */
size_t strdiff (const char **lhs, const char **rhs)
{
   // Guard up front: the final comparison below dereferences both pointers.
   if (!lhs || !rhs || !*lhs || !*rhs)
      return 0;

   size_t count = 0;
   // Equal and non-NUL on the left implies non-NUL on the right as well.
   while (**lhs && **lhs == **rhs) {
      (*lhs)++;
      (*rhs)++;
      count++;
   }

   // Both at the terminator: the strings were identical.
   if (**lhs == **rhs) {
      return 0;
   }
   return count;
}
/* Case-insensitive string equality test.
 *
 * Returns true only when both strings are non-NULL, have the same length and
 * match character-for-character after tolower(); false otherwise.
 */
static bool sstricmp (const char *lhs, const char *rhs)
{
   if (!lhs || !rhs)
      return false;

   for (;; lhs++, rhs++) {
      // <ctype.h> functions require a value representable as unsigned char;
      // passing a negative plain char (e.g. a UTF-8 byte) is undefined.
      if (tolower ((unsigned char)*lhs) != tolower ((unsigned char)*rhs))
         return false;
      // Both characters matched; if this was the terminator we are done.
      if (*lhs == '\0')
         return true;
   }
}
/* Returns true when `c` is a whitespace character according to isspace().
 *
 * Callers pass plain `char` values, which are negative for high-bit bytes
 * (e.g. UTF-8 sequences) on platforms where char is signed; the value is
 * therefore converted to unsigned char before reaching <ctype.h>.
 */
static bool is_space (int c)
{
   return isspace ((unsigned char)c) != 0;
}
/* Returns true when `c` is neither whitespace nor the NUL terminator.
 *
 * The value is converted to unsigned char before calling isspace() because
 * callers pass plain `char`, which may be negative for high-bit bytes.
 */
static bool not_space (int c)
{
   return c != 0 && !isspace ((unsigned char)c);
}
/* Read an entire file into a NUL-terminated heap buffer.
 *
 * fname: path of the file to read.
 * len:   optional; when non-NULL receives the number of bytes read.
 *
 * Returns the buffer (caller frees) or NULL on any failure, after logging
 * the reason to stderr. The file must be seekable because its size is
 * determined with fseek()/ftell().
 */
static char *file_slurp (const char *fname, size_t *len)
{
   bool error = true;
   char *ret = NULL;
   FILE *inf = NULL;
   long flen = 0;
   size_t nbytes = 0;

   if (!fname) {
      FPRINTF (stderr, "No filename specified\n");
      goto cleanup;
   }

   // Binary mode: the byte count reported by ftell() must match what fread()
   // delivers, which text mode does not guarantee on every platform.
   if (!(inf = fopen (fname, "rb"))) {
      FPRINTF (stderr, "Failed to open [%s] for reading: %m\n", fname);
      goto cleanup;
   }

   if ((fseek (inf, 0, SEEK_END)) != 0) {
      FPRINTF (stderr, "Error setting file position: %m\n");
      goto cleanup;
   }

   // ftell() returns -1 on failure; never use that as a size.
   if ((flen = ftell (inf)) < 0) {
      FPRINTF (stderr, "Error reading file position: %m\n");
      goto cleanup;
   }

   // One extra zeroed byte keeps the buffer NUL-terminated.
   if (!(ret = calloc ((size_t)flen + 1, 1))) {
      FPRINTF (stderr, "OOM error allocating buffer for file\n");
      goto cleanup;
   }

   if ((fseek (inf, 0, SEEK_SET)) != 0) {
      FPRINTF (stderr, "Error setting file position: %m\n");
      goto cleanup;
   }

   nbytes = fread (ret, 1, (size_t)flen, inf);
   if (nbytes != (size_t)flen) {
      FPRINTF (stderr, "Unexpected number of bytes read in [%zu vs %li]: %m\n",
               nbytes, flen);
      goto cleanup;
   }

   if (len) {
      *len = nbytes;
   }

   error = false;

cleanup:
   if (inf) {
      fclose (inf);
   }
   if (error) {
      free (ret);
      ret = NULL;
   }
   return ret;
}
/* ********************************************************
* list.c module
* A datatype for arrays of *things* ... with a deallocate
* function compatible with `free`.
*
* The list is strictly ordered. The order stored and returned is
* exactly the same as the order of `append` calls.
*/
/* Growable, ordered array of untyped item pointers. */
typedef struct {
/* Stored item pointers, in append order; reallocated by list_append(). */
void **items;
/* Number of entries currently held in `items`. */
size_t nitems;
/* Optional per-item destructor; when non-NULL, list_free() calls it on
 * every stored item before releasing the array. */
void (*dealloc) (void *);
} list_t;
/* Destroy a list and reset the caller's pointer to NULL.
 *
 * When the list was created with a destructor, it is invoked on every
 * stored item before the item array and the list itself are released.
 * Passing NULL, or the address of a NULL pointer, is a no-op.
 */
static void list_free (list_t **list)
{
   if (list == NULL || *list == NULL) {
      return;
   }

   list_t *victim = *list;
   if (victim->dealloc != NULL) {
      for (size_t n = 0; n < victim->nitems; n++) {
         victim->dealloc (victim->items[n]);
      }
   }

   free (victim->items);
   free (victim);
   *list = NULL;
}
/* Create an empty list.
 *
 * dealloc: destructor applied to each item by list_free(); may be NULL when
 *          the list does not own its items.
 *
 * Returns the new list, or NULL (after logging) when allocation fails.
 */
static list_t *list_new (void (*dealloc) (void *))
{
   list_t *fresh = calloc (1, sizeof *fresh);
   if (fresh != NULL) {
      fresh->dealloc = dealloc;
      return fresh;
   }

   FPRINTF (stderr, "OOM error allocating list_t\n");
   return NULL;
}
/* Append `item` to the end of `list`.
 *
 * Returns `item` on success and NULL on failure. Because NULL is the failure
 * indicator, a NULL item is rejected up front and the list is left
 * unchanged; this matters when the item comes straight from a constructor
 * that may itself have failed (e.g. `list_append (l, sstrdup (s))`).
 * On allocation failure the list is also left unchanged. The list does not
 * take ownership of `item` unless the call succeeds.
 */
static const void *list_append (list_t *list, const void *item)
{
   if (!list || !item)
      return NULL;

   void **tmp = realloc (list->items, (list->nitems + 1) * sizeof *list->items);
   if (!tmp) {
      FPRINTF (stderr, "OOM error reallocating list_t array\n");
      return NULL;
   }

   list->items = tmp;
   list->items[list->nitems++] = (void *)item;
   return item;
}
/* ********************************************************
* node.c module
* A datatype for trees of nodes. There is no difference
* between a `tree` and a `node` - they're the same thing.
*
* A node stores an item from an HTML page, and is created from
* parsing literal HTML passed to the `_new()` function.
*
* Children of a node have a strict ordering, and this is the
* order that recursion on the structure is guaranteed.
*/
/* Kind of item a node_t holds. */
enum node_type_t {
/* An element: node_new() gives it attribute and child lists. */
node_type_NODE,
/* Text content: the text is kept in the node's `tag` field and no
 * attribute/child lists are allocated for it. */
node_type_STRING,
};
typedef struct node_t node_t;
/* One node of the parsed document tree. */
struct node_t {
/* Whether this is an element or a piece of text. */
enum node_type_t type;
/* Heap copy of the tag name (elements) or of the text itself (strings). */
char *tag;
/* Node this one was attached to by node_add_child(); NULL if unattached. */
node_t *parent;
list_t *keys; // char *, keys[i] maps to values [i]
list_t *values; // char *, values[i] maps to keys [i]
list_t *children; // node_t *, Children, stored in order of appearance.
};
/* Build a "[node] --> [parent] --> [grandparent] ..." description of `node`
 * and all of its ancestors.
 *
 * dst:  address of a heap string (or NULL pointer) that is freed and
 *       replaced by the new description on success.
 * node: innermost node of the chain.
 *
 * Returns the new string (same value as `*dst`), or NULL after logging when
 * allocation fails, in which case `*dst` is not modified.
 */
char *node_ancestry (char **dst, node_t *node)
{
   // Pass 1: size the buffer generously for every link in the chain.
   size_t capacity = 1;
   for (node_t *cur = node; cur != NULL; cur = cur->parent) {
      capacity += strlen (cur->tag) + strlen ("] --> [") + 3;
   }

   char *path = calloc (capacity, 1);
   if (path == NULL) {
      FPRINTF (stderr, "OOM error allocating space for ancestry\n");
      return NULL;
   }

   // Pass 2: emit each link, prefixing all but the first with the arrow.
   char *cursor = path;
   const char *sep = "";
   for (node_t *cur = node; cur != NULL; cur = cur->parent) {
      cursor += sprintf (cursor, "%s[%s]", sep, cur->tag);
      sep = " --> ";
   }

   free (*dst);
   *dst = path;
   return path;
}
/* node_visit() callback that writes one node to a stream.
 *
 * node:  node to print.
 * depth: nesting level, used to indent element lines.
 * outf:  destination FILE *; stdout is used when NULL.
 *
 * Text nodes are written as their text followed by a space. Elements are
 * written as an opening tag with all key='value' attributes, then a newline.
 */
static void print_node (node_t *node, size_t depth, void *outf)
{
   FILE *f = outf ? outf : stdout;

   if (node->type == node_type_STRING) {
      fprintf (f, "%s ", node->tag);
      return;
   }

   // Kept as a file-visible macro, exactly as before.
#define INDENT for (size_t i=0; i<depth; i++) { fprintf (f, " "); }
   INDENT;

   fprintf (f, "<%s ", node->tag);
   const size_t nattrs = node->keys->nitems;
   for (size_t a = 0; a < nattrs; a++) {
      const char *key = node->keys->items[a];
      const char *val = node->values->items[a];
      fprintf (f, "%s='%s' ", key, val);
   }
   fprintf (f, ">\n");
}
/* Release a node together with its tag, attribute lists and children.
 * The children list destroys each child according to its own destructor.
 * A NULL node is ignored.
 */
static void node_del (node_t *node)
{
   if (node != NULL) {
      free (node->tag);
      list_free (&node->keys);
      list_free (&node->values);
      list_free (&node->children);
      free (node);
   }
}
/* Attach `child` as the last child of `parent`.
 *
 * A NULL parent is accepted and treated as "nothing to attach to" (returns
 * true), which lets root nodes go through the same code path. Returns false
 * when `child` is NULL, when `parent` is not an element, or when the child
 * could not be stored. `child->parent` is only set once the child really is
 * in the parent's list, so a failed call leaves the child unattached.
 */
static bool node_add_child (node_t *parent, node_t *child)
{
   if (!parent)
      return true;
   if (!child)
      return false;

   if (parent->type != node_type_NODE) {
      FPRINTF (stderr, "Attempt to append child onto a data element is invalid\n");
      return false;
   }

   if (!parent->children || !(list_append (parent->children, child))) {
      FPRINTF (stderr, "Failed to attach child to parent\n");
      return false;
   }

   child->parent = parent;
   return true;
}
/* Parse one attribute token and store it on `node`.
 *
 * attr: text of the form `key`, `key=value`, `key='value'` or `key="value"`.
 *       It is modified in place (the '=' and the quotes are overwritten
 *       with NUL bytes).
 *
 * A key without '=' is stored with an empty value. Copies of key and value
 * are appended to node->keys and node->values at the same index.
 *
 * Returns true on success. On failure nothing is added: the two lists are
 * kept the same length so that keys[i] always pairs with values[i].
 */
static bool node_add_attr (node_t *node, char *attr)
{
   if (!node || !attr || !node->keys || !node->values) {
      FPRINTF (stderr, "Invalid node or attribute\n");
      return false;
   }

   const char *key = attr;
   const char *value = "";

   char *sep = strchr (attr, '=');
   if (sep) {
      *sep++ = 0;                       // terminate the key
      if (*sep == '\'' || *sep == '"') {
         const char quote = *sep;
         *sep++ = 0;                    // drop the opening quote
         // Drop the closing quote only if it is really there.
         size_t vlen = strlen (sep);
         if (vlen > 0 && sep[vlen - 1] == quote)
            sep[vlen - 1] = 0;
      }
      value = sep;
   }

   char *kdup = sstrdup (key);
   char *vdup = sstrdup (value);
   if (!kdup || !vdup)
      goto fail;

   if (!(list_append (node->keys, kdup)))
      goto fail;

   if (!(list_append (node->values, vdup))) {
      // Undo the key so both lists stay index-aligned; kdup is ours again.
      node->keys->nitems--;
      goto fail;
   }

   return true;

fail:
   FPRINTF (stderr, "Failed to store [%s=%s] for node [%s]\n",
            key, value, node->tag);
   free (kdup);
   free (vdup);
   return false;
}
/* Create a node and, when `parent` is non-NULL, attach it to that parent.
 *
 * parent: element to attach to, or NULL for a root node.
 * type:   node_type_NODE for an element, node_type_STRING for text.
 * data:   tag name (elements) or text (strings); copied.
 *
 * Returns the new node, or NULL on failure. The node is attached to the
 * parent only after it has been fully constructed, so a failure part-way
 * through never leaves the parent holding a pointer to freed memory.
 */
node_t *node_new (node_t *parent, enum node_type_t type, const char *data)
{
   const char *label = data ? data : "(null)";   // never hand NULL to %s

   node_t *ret = calloc (1, sizeof *ret);
   if (!ret) {
      FPRINTF (stderr, "OOM error allocating node [%s]\n", label);
      return NULL;
   }

   ret->type = type;
   if (!(ret->tag = sstrdup (data))) {
      FPRINTF (stderr, "Failed to allocate tag [%s]\n", label);
      goto fail;
   }

   // Only elements carry attributes and children.
   if (type == node_type_NODE) {
      ret->keys = list_new (free);
      ret->values = list_new (free);
      ret->children = list_new ((void (*) (void *))node_del);
      if (!ret->keys || !ret->values || !ret->children) {
         FPRINTF (stderr, "Failed to allocate fields [keys:values:children] "
                          "[%p:%p:%p]\n",
                  (void *)ret->keys, (void *)ret->values, (void *)ret->children);
         goto fail;
      }
   }

   // Attach last: from here on nothing else can fail.
   if (!(node_add_child (parent, ret))) {
      FPRINTF (stderr, "Failed to add child to parent [%s]\n", label);
      goto fail;
   }

   return ret;

fail:
   node_del (ret);
   return NULL;
}
static void node_visit (struct node_t *node, size_t depth, void *parg,
void (*fptr) (struct node_t *, size_t, void *))
{
if (!node)
return;
fptr (node, depth, parg);
if (node->type == node_type_STRING)
return;
for (size_t i=0; i < node->children->nitems; i++) {
node_visit (node->children->items[i], depth + 1, parg, fptr);
}
}
/* *************************************************************************
* token.c module
* The token datatype, and some operations for it.
*/
enum token_type_t {
token_END,
token_TAGOPEN,
token_TAGCLOSE,
token_IGNOPEN,
token_IGNCLOSE,
token_TEXT,
token_KP,
token_GT,
token_SELFCLOSING,
};
/* Return the enumerator name of `type` as a string (e.g. "token_TEXT").
 *
 * For a value that is not a known token type the result is a message held
 * in a static buffer, which is overwritten by the next such call.
 */
static const char *token_type_name (enum token_type_t type)
{
#define TOKEN_CASE(x) case x: return #x
   switch (type) {
      TOKEN_CASE (token_END);
      TOKEN_CASE (token_TAGOPEN);
      TOKEN_CASE (token_TAGCLOSE);
      TOKEN_CASE (token_IGNOPEN);
      TOKEN_CASE (token_IGNCLOSE);
      TOKEN_CASE (token_TEXT);
      TOKEN_CASE (token_KP);
      TOKEN_CASE (token_GT);
      TOKEN_CASE (token_SELFCLOSING);
      default:
         break;
   }
#undef TOKEN_CASE

   static char unknown[55];
   snprintf (unknown, sizeof unknown, "Unknown token type %i\n", type);
   return unknown;
}
/* One lexical token of the HTML input. */
struct token_t {
/* What kind of token this is. */
enum token_type_t type;
/* Heap-allocated, NUL-terminated copy of the token text; freed by
 * token_del(). */
char *text;
};
/* Create a token of the given type whose text is a copy of [start, end).
 *
 * When either pointer is NULL, or the range is empty or inverted, the token
 * text is the empty string. The returned token always has non-NULL text.
 *
 * Returns the token (release with token_del()), or NULL after logging when
 * allocation fails.
 */
static struct token_t *token_new (enum token_type_t type, char *start, char *end)
{
   struct token_t *ret = calloc (1, sizeof *ret);
   if (!ret) {
      FPRINTF (stderr, "OOM error allocating token_t\n");
      return NULL;
   }
   ret->type = type;

   // An inverted range would otherwise turn into an enormous size_t.
   size_t slen = (start && end && end > start) ? (size_t)(end - start) : 0;

   // calloc zero-fills, so the copy below is always NUL-terminated.
   if (!(ret->text = calloc (slen + 1, 1))) {
      FPRINTF (stderr, "OOM error allocating token->text\n");
      free (ret);
      return NULL;
   }
   if (slen > 0) {
      memcpy (ret->text, start, slen);
   }
   return ret;
}
/* Free a token and its text. A NULL token is ignored. */
static void token_del (struct token_t *token)
{
   if (token != NULL) {
      free (token->text);
      free (token);
   }
}
/* *************************************************************************
* The actual tokeniser.
*/
/* Reclassify tags whose content must be skipped.
 *
 * When the token text matches SCRIPT or STYLE (case-insensitively), an
 * opening-tag token becomes token_IGNOPEN and a closing-tag token becomes
 * token_IGNCLOSE. Other tokens, and NULL, are left alone.
 */
static void token_set_ignoretag (struct token_t *token)
{
   static const char *items[] = {
      "SCRIPT", "STYLE",
   };
   static const size_t nitems = sizeof items/sizeof items[0];

   if (token == NULL) {
      return;
   }

   bool ignored = false;
   for (size_t n = 0; !ignored && n < nitems; n++) {
      ignored = sstricmp (items[n], token->text);
   }
   if (!ignored) {
      return;
   }

   switch (token->type) {
      case token_TAGOPEN:  token->type = token_IGNOPEN;  break;
      case token_TAGCLOSE: token->type = token_IGNCLOSE; break;
      default:             break;
   }
}
/* Read a tag name starting at the '<' found at in[*idx].
 *
 * Produces a token_TAGOPEN or token_TAGCLOSE token holding just the tag
 * name, reclassified by token_set_ignoretag() where applicable. For an
 * opening tag *idx is left on the character that ended the name, so the
 * attributes and '>' are read as separate tokens. For a closing tag the
 * input buffer is modified and *idx is moved past the end of the tag.
 *
 * Returns NULL when nothing follows the '<' (empty tag) or when the token
 * cannot be allocated.
 */
static struct token_t *read_token_tag (char *in, size_t *idx)
{
// Step over the '<'.
(*idx)++;
char *start = &in[*idx];;
// The name runs up to whitespace, '>' or the end of input.
while ((not_space(in[*idx])) && in[*idx] != '>') {
(*idx)++;
}
if ((&in[*idx] - start) < 1) {
FPRINTF (stderr, "Empty tag found, aborting\n");
return NULL;
}
// A '/' directly before the stop position (as in "<tag/>") is not part of
// the name; back up so it is tokenised separately.
if (in[(*idx) - 1] == '/') {
(*idx)--;
}
enum token_type_t tagtype = token_TAGOPEN;
if (start[0] == '/') {
tagtype = token_TAGCLOSE;
// Skip the leading '/' of "</name".
start++;
// Overwrite any whitespace after the name with NUL bytes ...
while ((is_space (in[*idx]))) {
in[*idx] = 0;
(*idx)++;
}
// ... and then the next character too (presumably the closing '>'),
// stepping past it so the whole closing tag is consumed.
in[*idx] = 0;
(*idx)++;
}
// For closing tags the copied range includes the NUL bytes written above,
// which terminate the name early inside the token text.
struct token_t *ret = token_new (tagtype, start, &in[*idx]);
token_set_ignoretag (ret);
return ret;
}
/* Fast-forward over a quoted string.
 *
 * On entry in[*idx] is the opening quote character. The scan stops at the
 * next occurrence of that same character; a backslash makes the character
 * after it literal, so an escaped quote does not end the string.
 *
 * Returns true with *idx positioned just past the closing quote, or false
 * when the input ends before the closing quote is found.
 */
static bool read_ffwd_quote (char *in, size_t *idx)
{
   const int quotec = in[*idx];

   for (;;) {
      (*idx)++;
      int c = in[*idx];

      if (c == 0)
         return false;

      if (c == '\\') {
         // Step onto the escaped character only; the loop's own increment
         // moves past it. (Advancing twice here would skip the character
         // AFTER the escape unexamined, missing a closing quote there.)
         (*idx)++;
         if (in[*idx] == 0)
            return false;
         continue;
      }

      if (c == quotec) {
         (*idx)++;
         return true;
      }
   }
}
/* Consume exactly one character at in[*idx] and return it as a token of the
 * requested type. Returns NULL if the token cannot be allocated. */
static struct token_t *read_token_char (enum token_type_t type,
                                        char *in, size_t *idx)
{
   char *first = &in[*idx];
   *idx += 1;
   return token_new (type, first, first + 1);
}
static struct token_t *read_token_text (char *in, size_t *idx)
{
static const char *delims = "<=>";
char *start = &in[*idx];
while ((not_space (in[*idx])) && !(strchr (delims, in[*idx]))) {
(*idx)++;
}
if ((is_space (in[*idx]))) {
return token_new (token_TEXT, start, &in[*idx]);
}
if (in[*idx] == '=') {
(*idx)++;
if (in[*idx] == '"' || in[*idx] == '\'') {
if (!(read_ffwd_quote (in, idx))) {
return NULL;
}
return token_new (token_KP, start, &in[*idx]);
}
while ((not_space (in[*idx]))
&& in[*idx] != '>'
&& in[*idx] != '/') {
(*idx)++;
}
return token_new (token_TEXT, start, &in[*idx]);
}
return token_new (token_TEXT, start, &in[*idx]);
}
/* Handle a '/' at in[*idx].
 *
 * If, after optional whitespace, the next character is '>', everything from
 * the '/' through the '>' is consumed and returned as token_SELFCLOSING.
 * Otherwise the '/' is ordinary content and read_token_text() takes over
 * from the same position.
 */
static struct token_t *read_token_selfclose (char *in, size_t *idx)
{
   const size_t slash = *idx;

   size_t probe = slash + 1;
   while (is_space (in[probe])) {
      probe++;
   }

   if (in[probe] != '>') {
      return read_token_text (in, idx);
   }

   *idx = probe + 1;   // just past the '>'
   return token_new (token_SELFCLOSING, &in[slash], &in[*idx]);
}
/* Parse a quoted string in place.
 *
 * On entry in[*idx] is the opening quote character. The matching closing
 * quote is overwritten with NUL so that the content becomes a C string
 * inside `in`; a backslash makes the following character literal. Every
 * content character is examined, starting with the first, so empty strings
 * ("") are handled and the scan never runs past the terminator.
 *
 * Returns a pointer to the first content character, with *idx positioned
 * just past the (former) closing quote, or NULL after logging when the
 * string is unterminated.
 */
static char *parse_string (char *in, size_t *idx)
{
   const char qc = in[*idx];
   (*idx)++;
   char *start = &in[*idx];

   for (;;) {
      char c = in[*idx];

      if (c == 0) {
         FPRINTF (stderr, "Unterminated string near [%s]\n", start);
         return NULL;
      }
      if (c == qc) {
         break;
      }
      if (c == '\\') {
         // Skip the escaped character, but never the terminator.
         (*idx)++;
         if (in[*idx] == 0) {
            FPRINTF (stderr, "Unterminated string near [%s]\n", start);
            return NULL;
         }
      }
      (*idx)++;
   }

   in[*idx] = 0;
   (*idx)++;
   return start;
}
/* Read the quoted string starting at in[*idx] and return its content
 * (without the quotes) as a token_TEXT.
 *
 * Returns NULL when the string is unterminated or allocation fails.
 */
static struct token_t *read_token_string (char *in, size_t *idx)
{
   char *s = parse_string (in, idx);
   if (!s) {
      // parse_string() already reported the problem; strlen(NULL) would crash.
      return NULL;
   }
   return token_new (token_TEXT, s, s + strlen (s));
}
/* Produce the next token from `in`, starting at *idx and advancing it.
 *
 * Leading whitespace is skipped; the first remaining character selects the
 * reader that handles the token. Returns a token_END token at end of input
 * and NULL when the selected reader fails.
 */
static struct token_t *next_token (char *in, size_t *idx)
{
   for (; is_space (in[*idx]); (*idx)++) {
      /* swallow whitespace */
   }

   const char lead = in[*idx];

   if (lead == 0) {
      return token_new (token_END, NULL, NULL);
   }
   if (lead == '<') {
      return read_token_tag (in, idx);
   }
   if (lead == '>') {
      return read_token_char (token_GT, in, idx);
   }
   if (lead == '/') {
      return read_token_selfclose (in, idx);
   }
   if (lead == '`' || lead == '\'' || lead == '"') {
      return read_token_string (in, idx);
   }
   return read_token_text (in, idx);
}
// This is a very quick-n-dirty tokeniser/parser combination. Designing
// a nice one is involved, as each character has to be examined in order
// to keep track of character positions within a line and line positions
// within the input.
//
// While a "nice" tokeniser/parser is easy, it's also tedious. The trade-
// off is, unfortunately, poor error reporting when parsing (because
// no character position or line number information is available).
//
// In the absence of line number and character positions, we can still
// report "error near 'foo bar blaz ...'" and let the user perform a
// search for that string.
//
/* Result of one node_read_recurse() call. */
enum recurse_action_t {
/* The node was read without errors being counted. */
recurse_action_SUCCESS = 1,
/* At least one error was counted while reading the node. */
recurse_action_ERROR,
/* The '>' of a tag whose name starts with '!' was reached. */
recurse_action_SKIP,
};
/* Recursive-descent reader: consume tokens from `input` and build the
 * subtree below `parent`.
 *
 * rootname: name of the input, passed down unchanged to nested calls.
 * preamble: true while reading the attributes of `parent` (between the tag
 *           name and its '>'); false while reading its content.
 * parent:   node that receives attributes/children.
 * input:    NUL-terminated buffer being tokenised (modified in place).
 * index:    current position in `input`, advanced as tokens are consumed.
 * nerrors:  optional; the local error count is added to it when the token
 *           loop ends normally.
 *
 * Returns recurse_action_ERROR if an error was counted,
 * recurse_action_SKIP when the '>' of a '!'-tag (doctype, comment) was
 * reached, and recurse_action_SUCCESS otherwise.
 */
static enum recurse_action_t node_read_recurse (const char *rootname, bool preamble,
                                                node_t *parent, char *input,
                                                size_t *index, int *nerrors)
{
   int nerrs = 0;
   struct token_t *token = NULL;
   enum recurse_action_t action;
   char *ancestors = NULL;

   /* Token types and the action taken for each:
    *
    * END:         Dummy token returned on end-of-input. The loop condition
    *              below stops on it, so the token_END cases in the switches
    *              are never reached; they only keep the switches exhaustive.
    * TAGOPEN:     Tag name ('<' and '>' stripped).
    *                 preamble:  error.
    *                 !preamble: create a new node and recurse into it.
    * TAGCLOSE:    Closing tag name ("</" and '>' stripped).
    *                 preamble:  error.
    *                 !preamble: warn when it does not match parent->tag;
    *                            either way this node is finished.
    * IGNOPEN:     Opening tag of an ignored element.
    *                 preamble:  error.
    *                 !preamble: read and discard tokens until IGNCLOSE.
    * IGNCLOSE:    Closing tag of an ignored element: always an error here.
    * TEXT, KP:    Content, or a keypair (k=v, k='v' or k="v").
    *                 preamble:  stored as an attribute of parent.
    *                 !preamble: stored as a text child of parent.
    * GT:          The '>' character.
    *                 preamble:  for a '!'-tag tell the caller to SKIP,
    *                            otherwise switch to reading content.
    *                 !preamble: error.
    * SELFCLOSING: The "/>" sequence.
    *                 preamble:  this node is finished.
    *                 !preamble: error.
    *
    * When next_token() returns NULL the loop simply ends and no error is
    * counted.
    */
   while ((token = next_token (input, index))
         && token->type != token_END) {

      node_t *newnode = NULL;

      FPRINTF (stdout, "<%s> (%i) %s[%s]\n",
               parent->tag, preamble, token_type_name (token->type), token->text);

      if (preamble) {
         switch (token->type) {
            case token_END:
               FPRINTF (stderr, "%s: Encountered EOF within attrs of [%s]\n",
                        node_ancestry(&ancestors, parent), parent->tag);
               nerrs++;
               goto cleanup;

            case token_TAGOPEN:  // Fallthrough
            case token_IGNOPEN:
               FPRINTF (stderr, "%s: Encountered tag [%s] within attrs of [%s]\n",
                        node_ancestry (&ancestors, parent),
                        token->text, parent->tag);
               nerrs++;
               goto cleanup;

            case token_TAGCLOSE: // Fallthrough
            case token_IGNCLOSE:
               FPRINTF (stderr, "%s: Encountered '</%s>' within attrs of [%s]\n",
                        node_ancestry (&ancestors, parent),
                        token->text, parent->tag);
               nerrs++;
               goto cleanup;

            case token_TEXT:     // Fallthrough
            case token_KP:
               if (!(node_add_attr (parent, token->text))) {
                  FPRINTF (stderr, "%s: Failed to add attr [%s] to [%s]\n",
                           node_ancestry (&ancestors, parent),
                           token->text, parent->tag);
                  nerrs++;
                  goto cleanup;
               }
               break;

            case token_GT:
               if (parent->tag[0] == '!') {
                  // Negative count is the in-band signal for SKIP.
                  nerrs = -1;
                  goto cleanup;
               } else {
                  preamble = false;
               }
               break;

            case token_SELFCLOSING:
               goto cleanup;
         }
      } else {
         switch (token->type) {
            case token_END:
               if ((strcmp (parent->tag, "root")) != 0) {
                  FPRINTF (stderr, "%s: Encountered EOF while processing [%s]\n",
                           node_ancestry (&ancestors, parent),
                           parent->tag);
                  nerrs++;
               }
               goto cleanup;

            case token_TAGOPEN:
               if (!(newnode = node_new (parent, node_type_NODE, token->text))) {
                  FPRINTF (stderr, "%s: Failed to create new node [%s]\n",
                           node_ancestry (&ancestors, parent),
                           token->text);
                  nerrs++;
                  goto cleanup;
               }
               action = node_read_recurse (rootname, true,
                                           newnode, input, index, nerrors);
               if (action == recurse_action_ERROR) {
                  FPRINTF (stderr, "%s: Failed to read node [%s]\n",
                           node_ancestry (&ancestors, parent),
                           newnode->tag);
                  nerrs++;
                  goto cleanup;
               }
               if (action == recurse_action_SKIP) {
                  // Nothing to do: the '!'-tag node stays attached as read.
               }
               break;

            case token_TAGCLOSE:
               if ((strcmp (token->text, parent->tag)) != 0) {
                  // Expected is the open element; got is what was read.
                  FPRINTF (stderr, "WARNING: %s: Expected </%s>, got </%s>\n",
                           node_ancestry (&ancestors, parent),
                           parent->tag, token->text);
               }
               goto cleanup;

            case token_IGNOPEN:
               token_del (token);
               while ((token = (next_token (input, index)))) {
                  if (token->type == token_END) {
                     FPRINTF (stderr, "%s: Unexpected end of input\n",
                              node_ancestry (&ancestors, parent));
                     nerrs++;
                     goto cleanup;
                  }
                  FPRINTF (stdout, "TAG_IGNOPEN: <%s> (%i) %s[%s]\n",
                           parent->tag,
                           preamble,
                           token_type_name (token->type), token->text);
                  if (token->type == token_IGNCLOSE) {
                     break;
                  }
                  token_del (token);
                  token = NULL;
               }
               if (!token) {
                  FPRINTF (stderr, "%s: Unexpected end of input looking for "
                                   "ignoretag",
                           node_ancestry (&ancestors, parent));
                  nerrs++;
                  goto cleanup;
               }
               break;

            case token_IGNCLOSE:
               FPRINTF (stderr, "%s: unexpected </%s>\n",
                        node_ancestry (&ancestors, parent), token->text);
               nerrs++;
               goto cleanup;

            case token_TEXT:     // Fallthrough
            case token_KP:
               if (!(newnode = node_new (parent, node_type_STRING, token->text))) {
                  FPRINTF (stderr, "%s: Failed to create new text [%s]\n",
                           node_ancestry (&ancestors, parent),
                           token->text);
                  nerrs++;
                  goto cleanup;
               }
               break;

            case token_GT:       // Fallthrough
            case token_SELFCLOSING:
               FPRINTF (stderr, "%s: Unexpected [%s]\n",
                        node_ancestry (&ancestors, parent),
                        token->text);
               nerrs++;
               goto cleanup;
         }
      }
      token_del (token);
   }

   if (nerrors) {
      *nerrors = *nerrors + nerrs;
   }

cleanup:
   free (ancestors);
   // Either the token that ended the loop or the one live at the goto.
   token_del (token);
   if (nerrs > 0) {
      return recurse_action_ERROR;
   }
   if (nerrs < 0) {
      return recurse_action_SKIP;
   }
   return recurse_action_SUCCESS;
}
/* Parse the HTML file `fname` into a tree.
 *
 * fname:   path of the file to read; also used as the root node's tag.
 * nerrors: optional; receives the number of errors counted.
 *
 * On success the tree is dumped to stdout via print_node() and its root is
 * returned (release with node_del()). On any error NULL is returned and a
 * non-zero count is reported through `nerrors`.
 */
static node_t *node_read (const char *fname, int *nerrors)
{
   int nerrs = 0;
   node_t *root = NULL;
   size_t input_len = 0;
   size_t index = 0;
   enum recurse_action_t action = recurse_action_SKIP;

   char *input = file_slurp (fname, &input_len);
   if (!input) {
      FPRINTF (stderr, "Failed to read file into memory, aborting file read\n");
      nerrs++;
      goto cleanup;
   }

   if (!(root = node_new (NULL, node_type_NODE, fname))) {
      FPRINTF (stderr, "Failed to create a root node for [%s]\n", fname);
      // Count it, so a NULL return is never paired with a zero error count.
      nerrs++;
      goto cleanup;
   }

   // Keep reading for as long as the reader asks for a skip.
   while ((action = node_read_recurse (fname, false, root, input, &index, &nerrs))
         == recurse_action_SKIP) {
      ;
   }

   if (action == recurse_action_ERROR) {
      FPRINTF (stderr, "Failure parsing file [%s]\n", fname);
      nerrs++;
      goto cleanup;
   }

   node_visit (root, 0, stdout, print_node);

cleanup:
   if (nerrs) {
      node_del (root);
      root = NULL;
   }
   if (nerrors) {
      *nerrors = nerrs;
   }
   free (input);
   return root;
}
/* ********************************************************
* selector.c module
* Selector functions.
*
* The goal is to locate, in the input HTML, specific data, and extract
* that data in plain text form.
*
* While it would be nice to eventually support any expression supported
* by `querySelectorAll()` due to existing webdev familiarity, that's a
* large (and probably painful) undertaking.
*
* Selectors should look like this:
* "div > id^='-test' ~ .myClass"
*
*/
/* Kind of element in a parsed selector list. The reader functions return
 * these too: selector_parse() keeps reading while the value is positive. */
enum selector_type_t {
/* End of the query string was reached. */
selector_END = -2,
/* Something went wrong while reading. */
selector_ERROR = -1,
/* Attribute name to match (see struct selector_t). */
selector_OPERAND = 1,
/* Single-character operator. */
selector_OPERATOR,
/* Text to match an operand against. */
selector_STRING,
};
/* Return the enumerator name of `type` as a string (e.g. "selector_END").
 *
 * For an unrecognised value the result is a message held in a static
 * buffer, which is overwritten by the next such call.
 */
const char *selector_type_name (enum selector_type_t type)
{
#define SEL_CASE(x) case x: return #x
   switch (type) {
      SEL_CASE (selector_END);
      SEL_CASE (selector_ERROR);
      SEL_CASE (selector_OPERAND);
      SEL_CASE (selector_OPERATOR);
      SEL_CASE (selector_STRING);
      default:
         break;
   }
#undef SEL_CASE

   static char unknown[55];
   snprintf (unknown, sizeof unknown, "Unknown selector type: %i", type);
   return unknown;
}
/* One element of a parsed selector expression. */
struct selector_t {
/* **********************************************
* Which kind of element this is; it also selects
* the member of `u` that is valid. selector_END
* and selector_ERROR elements carry no payload.
*/
enum selector_type_t type;
union {
/* **********************************************
* Attribute we want to match. There are some
* that are reserved for internal use and cannot
* be used by the HTML we are processing.
* 1. \x02tagname
* 2. \x02content
* In general any string starting with \x02 is
* reserved for internal use. HTML pages using
* tags, classnames or attribute names that
* start with \x02 will have incorrect search
* results.
* Heap-allocated; freed by selector_del().
*/
char *_operand;
/* **********************************************
* Operators:
* , Logical OR
* ~ Any sibling
* + Adjacent sibling
* > Direct child
* $ Wildcard for end of string
* ^ Wildcard for beginning of string
* * Wildcard for 'anywhere in string'
* = Match exact
* \x03 Descendant
*/
int _operator;
/* **********************************************
* String that an operand gets matched against.
* Heap-allocated; freed by selector_del().
*/
char *_string;
} u;
};
/* Append a one-line "[type:payload]\n" description of `s` to the heap
 * string `*dst` (see tprintf()). END and ERROR elements show "..." as
 * payload. A NULL selector is reported on stderr and nothing is appended.
 */
static void selector_dump (struct selector_t *s, char **dst)
{
   if (s == NULL) {
      FPRINTF (stderr, "Got NULL selector_t object\n");
      return;
   }

   tprintf (dst, "[%s:", selector_type_name (s->type));

   if (s->type == selector_OPERAND) {
      tprintf (dst, "%s", s->u._operand);
   } else if (s->type == selector_STRING) {
      tprintf (dst, "%s", s->u._string);
   } else if (s->type == selector_OPERATOR) {
      tprintf (dst, "%c", s->u._operator);
   } else if (s->type == selector_END || s->type == selector_ERROR) {
      tprintf (dst, "...");
   }

   tprintf (dst, "]\n");
}
/* Free a selector element, including the string owned by OPERAND and
 * STRING elements. A NULL pointer is ignored. */
static void selector_del (struct selector_t *sel)
{
   if (sel == NULL) {
      return;
   }

   switch (sel->type) {
      case selector_OPERAND:
         free (sel->u._operand);
         break;
      case selector_STRING:
         free (sel->u._string);
         break;
      default:
         break;
   }

   free (sel);
}
static struct selector_t *selector_new (enum selector_type_t type,
int operator,
const char *operand)
{
struct selector_t *ret = calloc (1, sizeof *ret);
if (!ret) {
FPRINTF (stderr, "OOM error allocating selector_t object [%s]\n", operand);
return NULL;
}
ret->type = type;
switch (ret->type) {
case selector_OPERATOR:
ret->u._operator = operator;
break;
case selector_STRING:
case selector_OPERAND:
if (!(ret->u._string = sstrdup (operand))) {
FPRINTF (stderr, "OOM error allocating operand field [%s]\n", operand);
free (ret);
return NULL;
}
break;
case selector_ERROR:
case selector_END:
break;
}
return ret;
}
/* Cut a delimiter-terminated piece out of `src`, in place.
 *
 * Starting at src[*index], the piece extends up to (not including) the
 * first character found in `delims`, or to the end of the string. The
 * character at that position is overwritten with NUL and *index is advanced
 * by the length of the piece, i.e. it ends up ON that NUL byte.
 *
 * Returns a pointer to the start of the piece inside `src`.
 */
static char *delimited_string (char *src, size_t *index, const char *delims)
{
   char *piece = &src[*index];
   const size_t span = strcspn (piece, delims);

   piece[span] = 0;
   *index += span;
   return piece;
}
#define SOPERATORS ".#[,~>+$*^="
#define SDELIMS "'\" \t\n\r" SOPERATORS
/* Append the three elements of an exact attribute match to `dst`:
 * OPERAND(attr), OPERATOR('='), STRING(<text read from s at *index>).
 *
 * attr is copied before the match text is read, so it may point into `s`.
 *
 * Returns selector_OPERAND (a positive value, so the caller keeps reading)
 * on success and selector_ERROR on failure. Elements that could not be
 * handed over to `dst` are freed here; any that were already appended
 * remain owned by `dst`.
 */
static enum selector_type_t _selector_read_attr (list_t *dst,
                                                 const char *attr,
                                                 char *s, size_t *index)
{
   struct selector_t *parts[3] = { NULL, NULL, NULL };
   const size_t nparts = sizeof parts / sizeof parts[0];
   size_t nstored = 0;

   // Build in order; stop consuming input as soon as something fails.
   parts[0] = selector_new (selector_OPERAND, 0, attr);
   if (parts[0]) {
      parts[1] = selector_new (selector_OPERATOR, '=', NULL);
   }
   if (parts[1]) {
      parts[2] = selector_new (selector_STRING, 0,
                               delimited_string (s, index, SDELIMS));
   }

   if (parts[2]) {
      while (nstored < nparts && list_append (dst, parts[nstored])) {
         nstored++;
      }
   }

   if (nstored == nparts) {
      return selector_OPERAND;
   }

   // Whatever did not make it into the list is still ours to free.
   for (size_t i = nstored; i < nparts; i++) {
      selector_del (parts[i]);
   }
   FPRINTF (stderr, "Failed to read direct attr ref [%s]\n", attr);
   return selector_ERROR;
}
/* Read a class selector such as ".myClassName": step over the '.' and
 * record the name that follows as an exact match on the "class" attribute.
 */
static enum selector_type_t selector_read_class (list_t *dst,
                                                 char *s, size_t *index)
{
   *index += 1;   // the '.'
   return _selector_read_attr (dst, "class", s, index);
}
/* Read an id selector such as "#myId": step over the '#' and record the
 * name that follows as an exact match on the "id" attribute.
 */
static enum selector_type_t selector_read_id (list_t *dst,
                                              char *s, size_t *index)
{
   *index += 1;   // the '#'
   return _selector_read_attr (dst, "id", s, index);
}
/* Read an attribute selector such as "[someAttr=somevalue]": step over the
 * '[', cut out the attribute name that follows, and pass it on as the
 * attribute of an exact-match triple.
 */
static enum selector_type_t selector_read_attrname (list_t *dst,
                                                    char *s, size_t *index)
{
   *index += 1;   // the '['
   const char *name = delimited_string (s, index, SDELIMS);
   return _selector_read_attr (dst, name, s, index);
}
/* Read the single operator character at s[*index] (e.g. the '>' of
 * "s1 > s2"), append it to `dst` as an OPERATOR element and step past it.
 * Returns true on success, false (without advancing) on failure.
 */
static enum selector_type_t selector_read_op (list_t *dst,
                                              char *s, size_t *index)
{
   const int op = s[*index];
   struct selector_t *sel = selector_new (selector_OPERATOR, op, NULL);

   if (list_append (dst, sel) == NULL) {
      FPRINTF (stderr, "Failed to create operator %c\n", op);
      return false;
   }

   *index += 1;
   return true;
}
/* Read the match-modifier character at s[*index] (e.g. the '^' of
 * "[someAttr^=someValue]"), append it to `dst` as an OPERATOR element and
 * step past that one character.
 * Returns true on success, false (without advancing) on failure.
 */
static enum selector_type_t selector_read_match (list_t *dst,
                                                 char *s, size_t *index)
{
   char *at = &s[*index];

   if (!list_append (dst, selector_new (selector_OPERATOR, *at, NULL))) {
      FPRINTF (stderr, "Failed to create operator %c\n", *at);
      return false;
   }

   ++*index;
   return true;
}
/* Read the quoted string starting at s[*index] and append its content to
 * `dst` as a STRING element.
 * Returns true on success, false on failure.
 */
static enum selector_type_t selector_read_string (list_t *dst,
                                                  char *s, size_t *index)
{
   struct selector_t *sel =
      selector_new (selector_STRING, 0, parse_string (s, index));

   if (list_append (dst, sel) != NULL) {
      return true;
   }

   FPRINTF (stderr, "Failed to read string into selector_t list\n");
   return false;
}
/* Read a bare tag name: the text at s[*index] is recorded as an exact match
 * on the reserved internal attribute "\x02tagname".
 */
static enum selector_type_t selector_read_tag (list_t *dst,
                                               char *s, size_t *index)
{
   static const char tagname_attr[] = "\x02tagname";
   return _selector_read_attr (dst, tagname_attr, s, index);
}
/* **************************************************************
* Reads exactly one selector_t object, and inserts it into the
* provided list.
*/
/* Read the next piece of the selector query `sq` at *index and append the
 * resulting element(s) to `dst`.
 *
 * Leading whitespace is skipped; the first remaining character decides
 * which reader is used. At the end of the query an END element is appended
 * and selector_END is returned (selector_ERROR if that append fails).
 * Otherwise the chosen reader's result is returned unchanged.
 */
static enum selector_type_t _selector_read_next (list_t *dst,
                                                 char *sq, size_t *index)
{
   for (; is_space (sq[*index]); (*index)++) {
      /* swallow whitespace */
   }

   const char lead = sq[*index];

   // Reached end of input?
   if (lead == 0) {
      struct selector_t *end = selector_new (selector_END, 0, NULL);
      if (list_append (dst, end)) {
         return selector_END;
      }
      return selector_ERROR;
   }

   // `lead` is non-NUL here, so strchr cannot match a terminator.
   if (lead == '.') {
      return selector_read_class (dst, sq, index);
   }
   if (lead == '#') {
      return selector_read_id (dst, sq, index);
   }
   if (lead == '[') {
      return selector_read_attrname (dst, sq, index);
   }
   if (strchr (",~>=+", lead) != NULL) {
      return selector_read_op (dst, sq, index);
   }
   if (strchr ("$*^", lead) != NULL) {
      return selector_read_match (dst, sq, index);
   }
   if (lead == '"' || lead == '\'') {
      return selector_read_string (dst, sq, index);
   }
   return selector_read_tag (dst, sq, index);
}
/* Parse the selector query `sq` into a list of selector_t elements.
 *
 * The query is parsed from a private copy, so `sq` itself is not modified.
 * Returns the list (release with list_free()), or NULL after logging when
 * copying, allocation or parsing fails.
 */
static list_t *selector_parse (const char *sq)
{
   char *copy = sstrdup (sq);
   if (copy == NULL) {
      FPRINTF (stderr, "Failed to create a copy of the input\n");
      return NULL;
   }

   list_t *ops = list_new ((void (*) (void *))selector_del);
   if (ops == NULL) {
      FPRINTF (stderr, "Failed to create list of selector operations for [%s]\n",
               copy);
   } else {
      size_t index = 0;
      enum selector_type_t type;

      // Positive results mean "more to read"; END and ERROR are negative.
      do {
         type = _selector_read_next (ops, copy, &index);
      } while (type > 0);

      if (type != selector_END) {
         FPRINTF (stderr, "Encountered errors while processing [%s]\n", copy);
         list_free (&ops);   // also resets ops to NULL
      }
   }

   free (copy);
   return ops;
}
/* **************************************************************
* The main program module that contains `int main()`.
*/
/* Placeholder for running one query against a parsed tree: searching is not
 * implemented yet, so both arguments are ignored and 1 is always returned.
 * The caller reports a non-zero result as an error count.
 */
static int process_query (const char *query, const node_t *tree)
{
   (void)query;
   (void)tree;
   return 1;
}
/* Parse one input file and run every query against it.
 *
 * fname:   path of the HTML file.
 * queries: list of query strings (char *).
 *
 * Returns the number of errors counted while reading the file. Problems
 * reported by individual queries are logged and otherwise ignored.
 */
static int process_file (const char *fname, list_t *queries)
{
   int nerrs = 0;
   node_t *tree = node_read (fname, &nerrs);

   if (tree == NULL) {
      FPRINTF (stderr, "Failed to parse input from [%s]\n", fname);
      nerrs++;
   } else {
      // Parallelisation would also be easy here; this is a better spot
      // for it than the per-file loop in `main()`.
      for (size_t q = 0; q < queries->nitems; q++) {
         const char *query = queries->items[q];
         const int rc = process_query (query, tree);
         if (rc == 0) {
            continue;
         }
         FPRINTF (stderr, "%i %s found while processing [%s:%s]. Ignoring\n",
                  rc, rc > 1 ? "errors" : "error", fname, query);
      }
   }

   node_del (tree);
   return nerrs;
}
/* Print the usage/help text to stdout, one line per table entry.
 *
 * The text documents the options exactly as main() parses them: queries are
 * given with -q, and -v prints the version banner without exiting.
 */
static void print_helpmsg (void)
{
   static const char *helpmsg[] = {
      "NAME",
      " HtmlQuery: a program to perform queries and minimal transformations",
      " on HTML input.",
      "",
      "SYNOPSIS",
      " htmlq [[-q <query-string>] ...] [-f <filename>] ...",
      " htmlq [[-q <query-string>] ...]",
      " htmlq -h",
      "",
      "DESCRIPTION",
      " Zero or more query expressions must be specified with [-q query-string].",
      " When no query expressions are found, nothing is returned. Zero or more",
      " input files can be specified with [-f <filename>]. When no input files",
      " are specified, input is read from stdin.",
      "",
      " HtmlQuery returns the number of errors encountered to the caller.",
      "",
      "OPTIONS",
      " -f <filename> Path to the input file. This option can be repeated as",
      " many times as necessary to process multiple files at the",
      " same time.",
      " -q <query-string> Specify the query-string to match. This option can be",
      " repeated to specify multiple query-strings. A match is",
      " made when *any* query-string matches.",
      " -h Print this message and exit with a zero exit code.",
      " -v Print the program version and continue.",
      "",
      "BUGS",
      " Very likely. Send bug reports to lee@rundata.co.za, with the title set",
      " to 'bug-report: htmlq' or similar.",
      "",
   };
   static const size_t nlines = sizeof helpmsg / sizeof helpmsg[0];

   for (size_t i = 0; i < nlines; i++) {
      printf ("%s\n", helpmsg[i]);
   }
}
/* Store a copy of an option's argument in `dst`.
 * Returns false, after logging, when the argument is missing or cannot be
 * stored. `flag` is the option as typed (e.g. "-f") and is used only in
 * messages. */
static bool store_option_arg (list_t *dst, const char *flag, const char *arg)
{
   if (!arg) {
      FPRINTF (stderr, "Option '%s' requires an argument\n", flag);
      return false;
   }

   char *copy = sstrdup (arg);
   if (!copy || !(list_append (dst, copy))) {
      free (copy);   // not owned by the list when the append failed
      FPRINTF (stderr, "OOM error storing option [%s %s]\n", flag, arg);
      return false;
   }
   return true;
}

/* Program entry point.
 *
 * Options are read until the first argument that does not start with '-':
 *   -1 / -2   run the built-in tokeniser / selector self-test and exit
 *   -f FILE   add an input file (repeatable)
 *   -q QUERY  add a query string (repeatable)
 *   -h        print the help text and exit successfully
 *   -v        print the version banner and continue
 *
 * Returns the total number of errors counted while processing the files
 * (capped so that it cannot wrap around to 0 as a process exit status), or
 * EXIT_FAILURE when option handling fails.
 */
int main (int argc, char **argv)
{
   int ret = EXIT_FAILURE;
   list_t *files = list_new (free);
   list_t *queries = list_new (free);
   if (!files || !queries) {
      FPRINTF (stderr, "Failed to allocate arrays [files:queries] [%p:%p]\n",
               (void *)files, (void *)queries);
      goto cleanup;
   }

   int tokeniser_test (void);
   int selector_test (void);

   // Parse all the options
   for (int argv_index=1; argv_index<argc && argv[argv_index]; argv_index++) {
      const char *opt = argv[argv_index];
      if (opt[0] != '-') {
         break;
      }
      switch (opt[1]) {
         case '1':
            ret = tokeniser_test ();
            goto cleanup;

         case '2':
            ret = selector_test ();
            goto cleanup;

         case 'f':
            // argv[argc] is NULL, so looking one ahead is always valid.
            if (!(store_option_arg (files, opt, argv[argv_index + 1]))) {
               goto cleanup;
            }
            argv_index++;
            break;

         case 'q':
            if (!(store_option_arg (queries, opt, argv[argv_index + 1]))) {
               goto cleanup;
            }
            argv_index++;
            break;

         case 'h':
            print_helpmsg ();
            ret = EXIT_SUCCESS;
            goto cleanup;

         case 'v':
            printf ("Starting htmlq %s\n", VERSION);
            break;

         default:
            FPRINTF (stderr, "Unrecognised option flag '%s'\n", opt);
            goto cleanup;
      }
   }

   ret = 0;
   for (size_t i=0; i<files->nitems; i++) {
      // Can process this in parallel, but why bother?
      int nerrs = process_file ((const char *)files->items[i], queries);
      if (nerrs) {
         const char *s_err = "error";
         if (nerrs > 1) {
            s_err = "errors";
         }
         FPRINTF (stderr, "%i %s processing file [%s]. Ignoring\n",
                  nerrs, s_err, (const char *)files->items[i]);
         ret += nerrs;
      }
   }

   // Commonly only the low 8 bits of the status reach the caller; without
   // the cap, 256 errors would be reported as success.
   if (ret > 255) {
      ret = 255;
   }

cleanup:
   list_free (&files);
   list_free (&queries);
   return ret;
}
int tokeniser_test (void)
{
char *input = sstrdup (
"<!DOCTYPE attr1 attr2=\"value2\">"
"<html >"
" <body attr3='value3'>"
" <!-- this is a comment"
" -->"
" <tag1 attr4='value4'/>"
" <tag2 attr5='value5' />"
" <tag3 attr6=value6/>"
" <tag4 > some text goes here <tag5> more text </tag5> </tag4>"
" <tag6 attr7> </tag6>"
" <tag7 attr8=value8> </tag7>"
" <script type=module src=where.js>"
" let tmp = \" </script> \";"
" let tmp = '\"</script>\"';"
" </script>"
" <tag8 attr9=value-10 > </tag8>"
" <tag9/>"
" <p>"
" </body>"
" <!--another comment-->"
"</html >"
);
static const char *expected =
"T token_TAGOPEN[!DOCTYPE]\n"
"T token_TEXT[attr1]\n"
"T token_KP[attr2=\"value2\"]\n"
"T token_GT[>]\n"
"T token_TAGOPEN[html]\n"
"T token_GT[>]\n"
"T token_TAGOPEN[body]\n"
"T token_KP[attr3='value3']\n"
"T token_GT[>]\n"
"T token_TAGOPEN[!--]\n"
"T token_TEXT[this]\n"
"T token_TEXT[is]\n"
"T token_TEXT[a]\n"
"T token_TEXT[comment]\n"
"T token_TEXT[--]\n"
"T token_GT[>]\n"
"T token_TAGOPEN[tag1]\n"
"T token_KP[attr4='value4']\n"
"T token_SELFCLOSING[/>]\n"
"T token_TAGOPEN[tag2]\n"
"T token_KP[attr5='value5']\n"
"T token_SELFCLOSING[/>]\n"
"T token_TAGOPEN[tag3]\n"
"T token_TEXT[attr6=value6]\n"
"T token_SELFCLOSING[/>]\n"
"T token_TAGOPEN[tag4]\n"
"T token_GT[>]\n"
"T token_TEXT[some]\n"
"T token_TEXT[text]\n"
"T token_TEXT[goes]\n"
"T token_TEXT[here]\n"
"T token_TAGOPEN[tag5]\n"
"T token_GT[>]\n"
"T token_TEXT[more]\n"
"T token_TEXT[text]\n"
"T token_TAGCLOSE[tag5]\n"
"T token_TAGCLOSE[tag4]\n"
"T token_TAGOPEN[tag6]\n"
"T token_TEXT[attr7]\n"
"T token_GT[>]\n"
"T token_TAGCLOSE[tag6]\n"
"T token_TAGOPEN[tag7]\n"
"T token_TEXT[attr8=value8]\n"
"T token_GT[>]\n"
"T token_TAGCLOSE[tag7]\n"
"T token_IGNOPEN[script]\n"
"T token_TEXT[type=module]\n"
"T token_TEXT[src=where.js]\n"
"T token_GT[>]\n"
"T token_TEXT[let]\n"
"T token_TEXT[tmp]\n"
"T token_TEXT[=]\n"
"T token_TEXT[ </script> ]\n"
"T token_TEXT[;]\n"
"T token_TEXT[let]\n"
"T token_TEXT[tmp]\n"
"T token_TEXT[=]\n"
"T token_TEXT[\"</script>\"]\n"
"T token_TEXT[;]\n"
"T token_IGNCLOSE[script]\n"
"T token_TAGOPEN[tag8]\n"
"T token_TEXT[attr9=value-10]\n"
"T token_GT[>]\n"
"T token_TAGCLOSE[tag8]\n"
"T token_TAGOPEN[tag9]\n"
"T token_SELFCLOSING[/>]\n"
"T token_TAGOPEN[p]\n"
"T token_GT[>]\n"
"T token_TAGCLOSE[body]\n"
"T token_TAGOPEN[!--another]\n"
"T token_TEXT[comment--]\n"
"T token_GT[>]\n"
"T token_TAGCLOSE[html]\n"
"Found 73 tokens\n";
char *output = NULL;
struct token_t *token = NULL;
size_t index = 0;
size_t ntokens = 0;
while ((token = next_token (input, &index))
&& token->type != token_END) {
ntokens++;
tprintf (&output, "T %s[%s]\n", token_type_name (token->type), token->text);
token_del (token);
}
tprintf (&output, "Found %zu tokens\n", ntokens);
int ret = EXIT_FAILURE;
if (token && token->type == token_END) {
FPRINTF (stderr, "TEST: Success\n");
ret = EXIT_SUCCESS;
} else {
FPRINTF (stderr, "TEST: Failure\n");
}
token_del (token);
free (input);
if ((strcmp (expected, output)) != 0) {
FPRINTF (stderr, "Unexpected output. Expected:\n%s\nGot:\n%s\n",
expected, output);
}
free (output);
return ret;
}
/* ********************************************************
 * Self-test for the selector parser.
 *
 * Parses each query-string in `tests` with selector_parse(), appends
 * the selector_dump() text of every resulting selector to one output
 * string, and compares that string against `expected` using
 * strdiff().
 *
 * Returns EXIT_SUCCESS only when every query parsed and strdiff()
 * reported no difference; EXIT_FAILURE otherwise.
 */
int selector_test (void)
{
   static const char *tests[] = {
      "one",
      ".two",
      "#three",
      "four > five",
      "#six ~ .seven",
   };

   static const char *expected =
      "[selector_OPERAND:\x02tagname]\n"
      "[selector_OPERATOR:=]\n"
      "[selector_STRING:one]\n"
      "[selector_END:...]\n"
      "[selector_OPERAND:class]\n"
      "[selector_OPERATOR:=]\n"
      "[selector_STRING:two]\n"
      "[selector_END:...]\n"
      "[selector_OPERAND:id]\n"
      "[selector_OPERATOR:=]\n"
      "[selector_STRING:three]\n"
      "[selector_END:...]\n"
      "[selector_OPERAND:\x02tagname]\n"
      "[selector_OPERATOR:=]\n"
      "[selector_STRING:four]\n"
      "[selector_END:...]\n"
      "[selector_OPERAND:id]\n"
      "[selector_OPERATOR:=]\n"
      "[selector_STRING:six]\n"
      "[selector_END:...]\n";

   static const size_t ntests = sizeof tests / sizeof tests[0];

   int ret = EXIT_FAILURE;
   char *output = NULL;
   list_t *selectors = NULL;

   for (size_t i=0; i<ntests; i++) {
      // Release the list from the previous iteration before replacing it
      list_free (&selectors);
      selectors = selector_parse (tests[i]);
      if (!selectors) {
         FPRINTF (stderr, "Failed to create list for selector objects\n");
         goto cleanup;
      }

      for (size_t j=0; j<selectors->nitems; j++) {
         selector_dump (selectors->items[j], &output);
      }
   }

   // Nothing was dumped (or every append failed): there is no string to
   // compare, and NULL must not reach strdiff() or "%s".
   if (!output) {
      FPRINTF (stderr, "No output was produced for the selector tests\n");
      goto cleanup;
   }

   // strdiff() takes the addresses of both pointers, so give it local
   // cursors rather than the address of the static `expected` pointer.
   // NOTE(review): a non-zero return is treated as "mismatch found";
   // confirm against strdiff()'s definition.
   const char *want = expected;
   const char *got = output;
   size_t processed = strdiff (&want, &got);
   if (processed) {
      FPRINTF (stderr, "processed before error: %zu characters\n", processed);
      FPRINTF (stderr, "Unexpected output. Expected:\n%s\nGot:\n%s\n",
               want, got);
      goto cleanup;
   }

   ret = EXIT_SUCCESS;

cleanup:
   free (output);
   list_free (&selectors);
   return ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment