lelanthran/htmlq.c

## htmlq.c
/* ********************************************************
 * Copyright ©2024 Rundata Systems.  All rights reserved.
 * This project is licensed under the GPLv3 License. You
 * can find a copy of this license at:
 *    https://www.gnu.org/licenses/gpl-3.0.en.html
 */

#warning TODO: Split this into multiple files
#warning INCOMPLETE: Implement searching using compiled query

/* ********************************************************
 * I call this program from shell scripts that scrape web-pages.
 *
 * It pairs quite nicely with curl: fetch a URL with curl and
 * search it with htmlq.
 */


/* ********************************************************
 * Compiled with:
 *       gcc -W -Wall -Wextra -g htmlq.c -o htmlq
 *
 * The easiest way to execute the compile command above is by
 * copying and pasting it into the command-line.
 */

// Standard headers
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>


/* Diagnostic printf wrapper: writes "<file>:<line> in <function>(): " to
 * stream `f`, followed by the caller's printf-style format and arguments.
 *
 * NOTE(review): the expansion ends in `while (0);`. Because of that
 * trailing semicolon, `FPRINTF (...);` expands to two statements, so it
 * cannot be the sole body of an unbraced `if` that has an `else`.
 */
#define FPRINTF(f,...)    do {\
   fprintf (f, "%s:%i in %s(): ", __FILE__, __LINE__, __func__);\
   fprintf (f, __VA_ARGS__);\
} while (0);

// Version string of this program.
#define VERSION "0.0.1"


/* ********************************************************
 * util.c module
 */
/* Append printf-style formatted text to the heap-allocated string `*dst`.
 *
 * `*dst` may be NULL, in which case a new string is allocated. On success
 * `*dst` points to the (possibly moved) buffer, which the caller must
 * eventually `free`, and true is returned. On failure (bad arguments,
 * formatting error or out of memory) `*dst` is left untouched and false
 * is returned.
 */
static bool tprintf (char **dst, const char *fmts, ...)
{
   if (!dst || !fmts)
      return false;

   bool ok = false;
   va_list ap, ap_copy;
   va_start (ap, fmts);

   // The first pass only measures; it needs its own copy of the arguments
   // because a va_list cannot be reused after being consumed.
   va_copy (ap_copy, ap);
   int addlen = vsnprintf (NULL, 0, fmts, ap_copy);
   va_end (ap_copy);

   if (addlen < 0) {
      FPRINTF (stderr, "Error formatting string\n");
   } else {
      size_t curlen = *dst ? strlen (*dst) : 0;
      size_t nbytes = (size_t)addlen + 1;
      char *tmp = realloc (*dst, curlen + nbytes);
      if (!tmp) {
         FPRINTF (stderr, "OOM error reallocating formatted string\n");
      } else {
         *dst = tmp;
         // Bounded write: never more than the space just reserved.
         vsnprintf (&tmp[curlen], nbytes, fmts, ap);
         ok = true;
      }
   }

   va_end (ap);
   return ok;
}

/* NULL-tolerant string duplicate.
 *
 * Returns a freshly malloc'ed copy of `src` that the caller must `free`,
 * or NULL when `src` is NULL or the allocation fails (the latter is
 * reported on stderr).
 */
static char *sstrdup (const char *src)
{
   if (src == NULL) {
      return NULL;
   }

   const size_t size = strlen (src) + 1;   // include the terminator
   char *copy = malloc (size);
   if (copy == NULL) {
      FPRINTF (stderr, "OOM error allocating new string from [%s]\n", src);
      return NULL;
   }

   memcpy (copy, src, size);
   return copy;
}

/* Advance `*lhs` and `*rhs` past their common prefix.
 *
 * On return both pointers have been moved forward to the first position
 * at which the strings differ (or to the terminator of whichever string
 * ended first).
 *
 * Returns 0 when the two strings are identical, otherwise the number of
 * leading characters they share (which is also 0 when they differ at the
 * very first character). Returns 0 without touching anything if any of
 * the four pointers involved is NULL.
 */
size_t strdiff (const char **lhs, const char **rhs)
{
   // Guard once, up front: the comparison after the loop dereferences
   // both pointers, so it must never run with a NULL.
   if (!lhs || !rhs || !*lhs || !*rhs)
      return 0;

   size_t count = 0;
   while (**lhs && **rhs && **lhs == **rhs) {
      (*lhs)++;
      (*rhs)++;
      count++;
   }

   // Equal characters at the stopping point can only mean both strings
   // ended together, i.e. they are identical.
   if (**lhs == **rhs) {
      return 0;
   }
   return count;
}


/* Case-insensitive string equality test.
 *
 * Returns true only when both strings are non-NULL, have the same length
 * and match character-for-character after `tolower`. Note that, unlike
 * `strcmp`, a true result means "equal".
 */
static bool sstricmp (const char *lhs, const char *rhs)
{
   if (!lhs || !rhs)
      return false;

   size_t len1 = strlen (lhs),
          len2 = strlen (rhs);
   if (len1 != len2)
      return false;

   for (size_t i=0; i<len1; i++) {
      // <ctype.h> functions require a value representable as unsigned
      // char (or EOF); a negative plain char would be undefined behaviour.
      if (tolower ((unsigned char)lhs[i]) != tolower ((unsigned char)rhs[i]))
         return false;
   }

   return true;
}

/* Returns true when `c` is a whitespace character according to `isspace`.
 *
 * Callers pass plain `char` values, which may be negative for bytes above
 * 0x7f; the value is converted to unsigned char first because `isspace`
 * is undefined for negative arguments other than EOF.
 */
static bool is_space (int c)
{
   return isspace ((unsigned char)c) != 0;
}

/* Returns true when `c` is neither whitespace nor the NUL terminator.
 *
 * This is not simply `!is_space (c)`: NUL also yields false, so scanning
 * loops written as `while (not_space (...))` stop at end-of-string.
 * The argument is converted to unsigned char before `isspace` because
 * callers pass plain `char`, which may be negative.
 */
static bool not_space (int c)
{
   return !(c == 0 || isspace ((unsigned char)c));
}

/* Read the entire file `fname` into memory.
 *
 * Returns a heap buffer holding the file contents followed by a NUL
 * terminator (the buffer is one byte larger than the file and
 * zero-filled before reading); the caller must `free` it. When `len` is
 * non-NULL it receives the number of bytes read. Returns NULL, after
 * printing a diagnostic to stderr, on any failure.
 */
static char *file_slurp (const char *fname, size_t *len)
{
   bool error = true;
   char *ret = NULL;
   FILE *inf = NULL;

   if (!(inf = fopen (fname, "r"))) {
      FPRINTF (stderr, "Failed to open [%s] for reading: %m\n", fname);
      goto cleanup;
   }

   // Determine the file length by seeking to the end.
   if ((fseek (inf, 0, SEEK_END)) != 0) {
      FPRINTF (stderr, "Error setting file position: %m\n");
      goto cleanup;
   }
   long flen = ftell (inf);
   if (flen < 0) {
      // ftell reports failure as -1; used as a size that would turn into
      // a zero-byte allocation followed by an enormous read.
      FPRINTF (stderr, "Error determining file length: %m\n");
      goto cleanup;
   }
   if (!(ret = calloc ((size_t)flen + 1, 1))) {
      FPRINTF (stderr, "OOM error allocating buffer for file\n");
      goto cleanup;
   }
   if ((fseek (inf, 0, SEEK_SET)) != 0) {
      FPRINTF (stderr, "Error setting file position: %m\n");
      goto cleanup;
   }

   size_t nbytes = fread (ret, 1, (size_t)flen, inf);
   if (nbytes != (size_t)flen) {
      FPRINTF (stderr, "Unexpected number of bytes read in [%zu vs %li]: %m\n",
               nbytes, flen);
      goto cleanup;
   }

   if (len) {
      *len = nbytes;
   }

   error = false;
cleanup:
   if (inf) {
      fclose (inf);
   }

   if (error) {
      free (ret);
      ret = NULL;
   }

   return ret;
}


/* ********************************************************
 * list.c module
 * A datatype for arrays of *things* ... with a deallocate
 * function compatible with `free`.
 *
 * The list is strictly ordered. The order stored and returned is
 * exactly the same as the order of `append` calls.
 */
// Growable, append-only array of opaque pointers.
typedef struct {
   void **items;              // The stored pointers, in append order.
   size_t nitems;             // Number of entries in `items`.
   void (*dealloc) (void *);  // Called on each item by list_free(); may be NULL.
} list_t;

/* Destroy the list at `*list` and reset the caller's pointer to NULL.
 *
 * When the list was created with a deallocator, it is invoked on every
 * stored item first. A NULL `list` or NULL `*list` is silently accepted.
 */
static void list_free (list_t **list)
{
   list_t *victim = list ? *list : NULL;
   if (victim == NULL) {
      return;
   }

   if (victim->dealloc != NULL) {
      for (size_t idx = 0; idx < victim->nitems; idx++) {
         victim->dealloc (victim->items[idx]);
      }
   }

   free (victim->items);
   free (victim);
   *list = NULL;
}

/* Allocate an empty list.
 *
 * `dealloc` (may be NULL) is remembered and later applied to each item
 * by list_free(). Returns NULL, after reporting on stderr, when out of
 * memory.
 */
static list_t *list_new (void (*dealloc) (void *))
{
   list_t *fresh = calloc (1, sizeof *fresh);
   if (fresh != NULL) {
      fresh->dealloc = dealloc;
      return fresh;
   }

   FPRINTF (stderr, "OOM error allocating list_t\n");
   return NULL;
}

/* Append `item` to the end of `list`.
 *
 * Returns `item` on success and NULL on failure. Because NULL is the
 * failure indicator, a NULL `item` can never be stored: it is rejected
 * up front and the list is left unchanged, so that a failed upstream
 * allocation passed straight in (e.g. `list_append (l, sstrdup (s))`)
 * does not leave a NULL entry behind. A NULL `list` is rejected too.
 *
 * The list does not take ownership of `item` when NULL is returned.
 */
static const void *list_append (list_t *list, const void *item)
{
   if (!list || !item)
      return NULL;

   void **tmp = realloc (list->items, (list->nitems + 1) * sizeof *list->items);
   if (!tmp) {
      FPRINTF (stderr, "OOM error reallocating list_t array\n");
      return NULL;
   }
   list->items = tmp;
   list->items[list->nitems++] = (void *)item;
   return item;
}


/* ********************************************************
 * node.c module
 * A datatype for trees of nodes. There is no difference
 * between a `tree` and a `node` - they're the same thing.
 *
 * A node stores an item from an HTML page, and is created from
 * parsing literal HTML passed to the `_new()` function.
 *
 * Children of a node are strictly ordered, and recursion over the
 * structure is guaranteed to visit them in that order.
 */
// Discriminates the two kinds of node stored in the tree.
enum node_type_t {
   node_type_NODE,      // Has attribute and child lists (see node_new()).
   node_type_STRING,    // Carries only `tag`; its lists are left NULL.
};

// A single node of the parsed tree; also serves as the tree itself.
typedef struct node_t node_t;
struct node_t {
   enum node_type_t type;  // Which kind of node this is.
   char *tag;        // Owned copy of the string given to node_new().
   node_t *parent;   // Set by node_add_child(); NULL for a parentless node.
   list_t *keys;     // char *, keys[i] maps to values [i]
   list_t *values;   // char *, values[i] maps to keys [i]
   list_t *children; // node_t *, Children, stored in order of appearance.
};

/* Build a human-readable ancestry chain for `node`, starting at the node
 * itself and walking up through its parents:
 *    "[node] --> [parent] --> [grandparent]"
 *
 * On success the previous string in `*dst` is freed, `*dst` is replaced
 * with the new heap string and that string is also returned. On
 * allocation failure NULL is returned and `*dst` is left as it was.
 */
char *node_ancestry (char **dst, node_t *node)
{
   // First pass: compute a generous upper bound for the output size.
   size_t needed = 0;
   for (node_t *cur = node; cur != NULL; cur = cur->parent) {
      needed += strlen (cur->tag) + strlen ("] --> [") + 3;
   }
   needed++;

   char *chain = calloc (needed, 1);
   if (chain == NULL) {
      FPRINTF (stderr, "OOM error allocating space for ancestry\n");
      return NULL;
   }

   // Second pass: emit each link, prefixing all but the first with the
   // arrow separator.
   char *cursor = chain;
   const char *prefix = "";
   for (node_t *cur = node; cur != NULL; cur = cur->parent) {
      sprintf (cursor, "%s[%s]", prefix, cur->tag);
      cursor += strlen (cursor);
      prefix = " --> ";
   }

   free (*dst);
   *dst = chain;
   return chain;
}

/* node_visit() callback that prints one node.
 *
 * `outf` is the FILE * to print to (stdout when NULL). String nodes are
 * printed as their text followed by a space; other nodes are printed as
 * an indented opening tag listing every key='value' attribute pair.
 */
static void print_node (node_t *node, size_t depth, void *outf)
{
   FILE *f = outf ? outf : stdout;

   if (node->type == node_type_STRING) {
      fprintf (f, "%s ", node->tag);
      return;
   }
// Emits three spaces per level of `depth` onto `f`.
#define INDENT    for (size_t i=0; i<depth; i++) { fprintf (f, "   "); }

   INDENT;
   fprintf (f, "<%s ", node->tag);

   size_t attr = 0;
   while (attr < node->keys->nitems) {
      const char *key = node->keys->items[attr];
      const char *value = node->values->items[attr];
      fprintf (f, "%s='%s' ", key, value);
      attr++;
   }

   fprintf (f, ">\n");
}

/* Free `node` together with everything it owns: its tag and its three
 * lists. NULL is accepted and ignored.
 */
static void node_del (node_t *node)
{
   if (node != NULL) {
      free (node->tag);
      list_free (&node->keys);
      list_free (&node->values);
      list_free (&node->children);
      free (node);
   }
}

/* Attach `child` as the last child of `parent`.
 *
 * A NULL `parent` means "no parent" and is treated as success without
 * doing anything. Returns false when `child` is NULL, when `parent` is
 * not a node_type_NODE, or when the child list cannot grow.
 *
 * `child->parent` is only set once the append has succeeded, so a failed
 * call leaves `child` exactly as it was.
 */
static bool node_add_child (node_t *parent, node_t *child)
{
   if (!parent)
      return true;

   if (!child)
      return false;

   if (parent->type != node_type_NODE) {
      FPRINTF (stderr, "Attempt to append child onto a data element is invalid\n");
      return false;
   }

   if (!(list_append (parent->children, child))) {
      FPRINTF (stderr, "Failed to attach child to parent\n");
      return false;
   }
   child->parent = parent;

   return true;
}

/* Parse one attribute token and store it on `node`.
 *
 * `attr` has the form `key`, `key=value`, `key='value'` or `key="value"`.
 * It is modified in place: the '=' is overwritten with a terminator, as
 * are the surrounding quotes when present. An attribute without '=' is
 * stored with an empty value.
 *
 * Copies of the key and value are appended to `node->keys` and
 * `node->values` at the same index. Returns false on failure, in which
 * case nothing is leaked and the two lists stay index-aligned.
 */
static bool node_add_attr (node_t *node, char *attr)
{
   if (!node || !attr || !node->keys || !node->values) {
      FPRINTF (stderr, "Invalid arguments storing attribute\n");
      return false;
   }

   char *key = attr;
   const char *value = "";

   char *sep = strchr (key, '=');
   if (sep) {
      *sep++ = 0;

      char quote = sep[0];
      if (quote == '\'' || quote == '"') {
         *sep++ = 0;
         // Drop the closing quote only if it is really there; an empty
         // or unterminated value must not be indexed at [-1] or lose its
         // last character.
         size_t vlen = strlen (sep);
         if (vlen > 0 && sep[vlen - 1] == quote) {
            sep[vlen - 1] = 0;
         }
      }
      value = sep;
   }

   // Duplicate both halves before touching either list so that a failed
   // allocation cannot leave a key without its value.
   char *kdup = sstrdup (key);
   char *vdup = sstrdup (value);
   if (!kdup || !vdup)
      goto fail;

   if (!(list_append (node->keys, kdup)))
      goto fail;

   if (!(list_append (node->values, vdup))) {
      // Undo the key append to keep keys[i] <-> values[i] aligned.
      node->keys->nitems--;
      goto fail;
   }

   return true;

fail:
   FPRINTF (stderr, "Failed to store [%s=%s] for node [%s]\n",
            key, value, node->tag);
   free (kdup);
   free (vdup);
   return false;
}


/* Create a node of the given `type` whose tag is a copy of `data`, and
 * attach it as the last child of `parent` (when `parent` is non-NULL).
 *
 * For node_type_NODE the key, value and child lists are created as well;
 * node_type_STRING nodes carry only the tag.
 *
 * Returns the new node, which is owned by `parent` when one was given
 * (it is freed with the parent) and by the caller otherwise. Returns
 * NULL on any failure; in that case `parent` is left unmodified.
 */
node_t *node_new (node_t *parent, enum node_type_t type, const char *data)
{
   bool error = true;
   const char *label = data ? data : "(null)";   // for diagnostics only

   node_t *ret = calloc (1, sizeof *ret);
   if (!ret) {
      FPRINTF (stderr, "OOM error allocating node [%s]\n", label);
      goto cleanup;
   }

   ret->type = type;

   if (!(ret->tag = sstrdup (data))) {
      FPRINTF (stderr, "Failed to allocate tag [%s]\n", label);
      goto cleanup;
   }

   if (type == node_type_NODE) {
      ret->keys = list_new (free);
      ret->values = list_new (free);
      ret->children = list_new ((void (*) (void *))node_del);
      if (!ret->keys || !ret->values || !ret->children) {
         FPRINTF (stderr, "Failed to allocate fields [keys:values:children] "
               "[%p:%p:%p]\n", (void *)ret->keys, (void *)ret->values,
               (void *)ret->children);
         goto cleanup;
      }
   }

   // Attach last: once the parent holds a pointer to this node it must
   // never be freed here, otherwise the parent's child list would be left
   // with a dangling entry.
   if (!(node_add_child (parent, ret))) {
      FPRINTF (stderr, "Failed to add child to parent [%s]\n", label);
      goto cleanup;
   }

   error = false;

cleanup:
   if (error) {
      node_del (ret);
      ret = NULL;
   }
   return ret;
}

/* Pre-order traversal: call `fptr (node, depth, parg)` on `node`, then
 * recurse into each child in stored order with `depth + 1`. String
 * nodes are visited but not descended into. A NULL `node` is ignored.
 */
static void node_visit (struct node_t *node, size_t depth, void *parg,
                        void (*fptr) (struct node_t *, size_t, void *))
{
   if (node == NULL) {
      return;
   }

   fptr (node, depth, parg);

   if (node->type != node_type_STRING) {
      size_t child = 0;
      while (child < node->children->nitems) {
         node_visit (node->children->items[child], depth + 1, parg, fptr);
         child++;
      }
   }
}


/* *************************************************************************
 * token.c module
 * The token datatype, and some operations for it.
 */

// Kinds of token produced by the tokeniser (see next_token()).
enum token_type_t {
   token_END,           // End of input reached.
   token_TAGOPEN,       // Opening tag name.
   token_TAGCLOSE,      // Closing tag name.
   token_IGNOPEN,       // Opening tag matched by token_set_ignoretag().
   token_IGNCLOSE,      // Closing tag matched by token_set_ignoretag().
   token_TEXT,          // Plain text / unquoted content.
   token_KP,            // key='value' or key="value" pair.
   token_GT,            // The '>' character.
   token_SELFCLOSING,   // A '/' followed (after optional whitespace) by '>'.
};

/* Return the enumerator name of `type` as a string (e.g. "token_TEXT").
 *
 * For a value that is not a known enumerator, a message is formatted
 * into a static buffer and returned; that buffer is overwritten by the
 * next such call.
 */
static const char *token_type_name (enum token_type_t type)
{
#define TOKEN_CASE(x)      case x: return #x
   switch (type) {
      TOKEN_CASE (token_END);
      TOKEN_CASE (token_TAGOPEN);
      TOKEN_CASE (token_TAGCLOSE);
      TOKEN_CASE (token_IGNOPEN);
      TOKEN_CASE (token_IGNCLOSE);
      TOKEN_CASE (token_TEXT);
      TOKEN_CASE (token_KP);
      TOKEN_CASE (token_GT);
      TOKEN_CASE (token_SELFCLOSING);
   }
#undef TOKEN_CASE

   static char unknown[55];
   snprintf (unknown, sizeof unknown, "Unknown token type %i\n", type);
   return unknown;
}

// One token from the input; created by token_new(), freed by token_del().
struct token_t {
   enum token_type_t type; // What kind of token this is.
   char *text;             // Owned, NUL-terminated copy of the token text.
};

/* Create a token of the given `type` whose text is a copy of the bytes
 * in the half-open range [start, end).
 *
 * When either pointer is NULL, or the range is empty or inverted, the
 * token text is the empty string. The returned token always has a
 * non-NULL, NUL-terminated `text` and must be released with token_del().
 * Returns NULL when out of memory.
 */
static struct token_t *token_new (enum token_type_t type, char *start, char *end)
{
   struct token_t *ret = calloc (1, sizeof *ret);
   if (!ret) {
      FPRINTF (stderr, "OOM error allocating token_t\n");
      return NULL;
   }
   ret->type = type;

   // An inverted range would otherwise wrap to a huge unsigned length.
   size_t slen = (start && end && end > start) ? (size_t)(end - start) : 0;

   // calloc zero-fills, so the copy below is always NUL-terminated and a
   // zero-length range yields "".
   if (!(ret->text = calloc (slen + 1, 1))) {
      FPRINTF (stderr, "OOM error allocating token->text\n");
      free (ret);
      return NULL;
   }
   if (slen > 0) {
      memcpy (ret->text, start, slen);
   }
   return ret;
}

/* Release a token and its text. NULL is accepted and ignored. */
static void token_del (struct token_t *token)
{
   if (token != NULL) {
      free (token->text);
      free (token);
   }
}

/* *************************************************************************
 * The actual tokeniser.
 */

/* Reclassify tags whose content should be ignored.
 *
 * If the token text matches (case-insensitively) one of the listed tag
 * names, a token_TAGOPEN becomes token_IGNOPEN and a token_TAGCLOSE
 * becomes token_IGNCLOSE; any other token type is left alone. A NULL
 * token is ignored.
 */
static void token_set_ignoretag (struct token_t *token)
{
   static const char *ignored[] = {
      "SCRIPT", "STYLE",
   };
   static const size_t nignored = sizeof ignored/sizeof ignored[0];

   if (token == NULL) {
      return;
   }

   bool matched = false;
   for (size_t n = 0; n < nignored && !matched; n++) {
      matched = sstricmp (ignored[n], token->text);
   }
   if (!matched) {
      return;
   }

   switch (token->type) {
      case token_TAGOPEN:  token->type = token_IGNOPEN;  break;
      case token_TAGCLOSE: token->type = token_IGNCLOSE; break;
      default:                                           break;
   }
}

/* Read a tag name. On entry `in[*idx]` is the '<' that starts the tag.
 *
 * For an opening tag the token text is the tag name and `*idx` is left
 * on the character that ended the name (whitespace, '>', or the '/' of a
 * self-closing "/>" so that it is tokenised separately).
 *
 * For a closing tag ("</name>") the leading '/' is dropped from the
 * text, any whitespace after the name and the character following it
 * (normally '>') are overwritten with NUL in `in`, and `*idx` is left
 * just past that character.
 *
 * The token type is token_TAGOPEN / token_TAGCLOSE, or the IGN variants
 * when token_set_ignoretag() matches the name. Returns NULL for an empty
 * tag or on allocation failure.
 */
static struct token_t *read_token_tag (char *in, size_t *idx)
{
   (*idx)++;   // step over '<'
   char *start = &in[*idx];
   while ((not_space(in[*idx])) && in[*idx] != '>') {
      (*idx)++;
   }

   if ((&in[*idx] - start) < 1) {
      FPRINTF (stderr, "Empty tag found, aborting\n");
      return NULL;
   }

   // "<br/>": back up so the '/' is not part of the name.
   if (in[(*idx) - 1] == '/') {
      (*idx)--;
   }

   enum token_type_t tagtype = token_TAGOPEN;
   if (start[0] == '/') {
      tagtype = token_TAGCLOSE;
      start++;
      while ((is_space (in[*idx]))) {
         in[*idx] = 0;
         (*idx)++;
      }
      // Consume the closing character, but never step over the end of
      // input: a truncated "</name" must leave *idx on the terminator.
      if (in[*idx] != 0) {
         in[*idx] = 0;
         (*idx)++;
      }
   }

   struct token_t *ret = token_new (tagtype, start, &in[*idx]);
   token_set_ignoretag (ret);
   return ret;
}

/* Fast-forward over a quoted string.
 *
 * On entry `in[*idx]` is the opening quote character. A backslash
 * escapes the character that follows it, so an escaped quote does not
 * end the string.
 *
 * Returns true with `*idx` one past the matching closing quote, or false
 * if the end of input is reached first (in which case `*idx` is left on
 * the terminator).
 */
static bool read_ffwd_quote (char *in, size_t *idx)
{
   int quotec = in[*idx];
   while (1) {
      (*idx)++;
      int c = in[*idx];

      if (c == '\\') {
         // Step onto the escaped character; the increment at the top of
         // the loop then moves past it. Advancing any further here would
         // skip a character without looking at it - possibly the closing
         // quote or the terminator.
         (*idx)++;
         if (in[*idx] == 0)
            return false;
         continue;
      }

      if (c == 0)
         return false;

      if (c == quotec) {
         (*idx)++;
         return true;
      }
   }
   return false;
}

/* Produce a token of `type` whose text is the single character at
 * `in[*idx]`, and advance `*idx` past it.
 */
static struct token_t *read_token_char (enum token_type_t type,
                                        char *in, size_t *idx)
{
   char *first = &in[*idx];
   *idx += 1;
   return token_new (type, first, first + 1);
}

/* Read a run of text starting at `in[*idx]`.
 *
 * The run ends at whitespace, end of input, or one of '<', '=', '>'.
 *   - If it ended at '=' and a quote follows, the quoted value is
 *     included and a token_KP is returned (NULL if the quote is never
 *     closed).
 *   - If it ended at '=' without a quote, the unquoted value (up to
 *     whitespace, end of input, '>' or '/') is included and a token_TEXT
 *     is returned.
 *   - Otherwise a token_TEXT holding just the run is returned.
 * `*idx` is left on the first character after the token.
 */
static struct token_t *read_token_text (char *in, size_t *idx)
{
   static const char *delims = "<=>";

   char *const first = &in[*idx];

   // Consume the leading word.
   for (;;) {
      if (!not_space (in[*idx]))
         break;
      if (strchr (delims, in[*idx]) != NULL)
         break;
      (*idx)++;
   }

   // Anything other than '=' ends the token right here.
   if (in[*idx] != '=') {
      return token_new (token_TEXT, first, &in[*idx]);
   }

   (*idx)++;   // step over '='

   switch (in[*idx]) {
      case '"':
      case '\'':
         if (!(read_ffwd_quote (in, idx))) {
            return NULL;
         }
         return token_new (token_KP, first, &in[*idx]);

      default:
         break;
   }

   // Unquoted value.
   while ((not_space (in[*idx])) && in[*idx] != '>' && in[*idx] != '/') {
      (*idx)++;
   }
   return token_new (token_TEXT, first, &in[*idx]);
}

/* Handle a '/' at `in[*idx]`.
 *
 * If the next non-whitespace character is '>', the whole "/ ... >"
 * sequence becomes a token_SELFCLOSING and `*idx` moves past the '>'.
 * Otherwise the '/' is ordinary text and is handed to read_token_text()
 * with `*idx` unchanged.
 */
static struct token_t *read_token_selfclose (char *in, size_t *idx)
{
   const size_t slash = *idx;

   size_t closer = slash + 1;
   while ((is_space (in[closer]))) {
      closer++;
   }

   if (in[closer] != '>') {
      return read_token_text (in, idx);
   }

   *idx = closer + 1;
   return token_new (token_SELFCLOSING, &in[slash], &in[*idx]);
}

/* Parse a quoted string in place.
 *
 * On entry `in[*idx]` is the opening quote; whatever character that is
 * also serves as the closing quote. A backslash escapes the character
 * after it (the backslash itself is left in the result).
 *
 * On success the closing quote in `in` is overwritten with NUL, `*idx`
 * is moved just past it, and a pointer to the first character of the
 * string content (inside `in`, not a copy) is returned. Returns NULL,
 * after reporting on stderr, if the input ends before the closing quote.
 */
static char *parse_string (char *in, size_t *idx)
{
   (*idx)++;   // step over the opening quote
   char *start = &in[*idx];
   char qc = *(start - 1);

   // Every character of the content is examined, including the first:
   // an empty string ("") must stop at its closing quote immediately.
   for (;; (*idx)++) {
      int c = in[*idx];
      if (c == 0) {
         FPRINTF (stderr, "Unterminated string near [%s]\n", start);
         return NULL;
      }
      if (c == '\\') {
         // Skip the escaped character, unless it is the terminator.
         if (in[(*idx) + 1] == 0) {
            FPRINTF (stderr, "Unterminated string near [%s]\n", start);
            return NULL;
         }
         (*idx)++;
         continue;
      }
      if (c == qc) {
         break;
      }
   }
   in[*idx] = 0;
   (*idx)++;
   return start;
}

/* Read a quoted string starting at `in[*idx]` and return it (without the
 * quotes) as a token_TEXT. Returns NULL if the string is unterminated or
 * the token cannot be allocated.
 */
static struct token_t *read_token_string (char *in, size_t *idx)
{
   char *s = parse_string (in, idx);
   if (!s) {
      // parse_string has already reported the problem.
      return NULL;
   }
   char *e = &s[strlen(s)];
   return token_new (token_TEXT, s, e);
}

/* Return the next token from `in`, starting at `*idx` and advancing
 * `*idx` past whatever was consumed. Leading whitespace is skipped.
 * The caller owns the returned token; NULL means the specific reader
 * failed.
 */
static struct token_t *next_token (char *in, size_t *idx)
{
   // Swallow whitespace
   while (is_space (in[*idx])) {
      (*idx)++;
   }

   // Dispatch on the first significant character.
   const char lead = in[*idx];

   if (lead == 0) {
      return token_new (token_END, NULL, NULL);
   }
   if (lead == '<') {
      return read_token_tag (in, idx);
   }
   if (lead == '>') {
      return read_token_char (token_GT, in, idx);
   }
   if (lead == '/') {
      return read_token_selfclose (in, idx);
   }
   if (lead == '`' || lead == '\'' || lead == '"') {
      return read_token_string (in, idx);
   }
   return read_token_text (in, idx);
}


// This is a very quick-n-dirty tokeniser/parser combination. Designing
// a nice one is involved, as each character has to be examined in order
// to keep track of character positions within a line and line positions
// within the input.
//
// While a "nice" tokeniser/parser is easy, it's also tedious. The trade-
// off is, unfortunately, poor error reporting when parsing (because
// no character position or line number information is available).
//
// In the absence of line number and character positions, we can still
// report "error near 'foo bar blaz ...'" and let the user perform a
// search for that string.
//
// Result of node_read_recurse().
enum recurse_action_t {
   recurse_action_SUCCESS = 1,   // Finished without recording an error.
   recurse_action_ERROR,         // At least one error was recorded.
   recurse_action_SKIP,          // A tag whose name starts with '!' was closed by '>'.
};

/* Consume tokens from `input` (starting at `*index`) and build the
 * subtree below `parent`.
 *
 *    rootname   Passed through unchanged to recursive calls.
 *    preamble   True while reading the attributes of `parent`'s opening
 *               tag (i.e. before its '>'), false while reading content.
 *    parent     Node that receives attributes and children; not NULL.
 *    input      The NUL-terminated input; the tokeniser modifies it.
 *    index      Current read position, advanced as tokens are consumed.
 *    nerrors    Optional error accumulator.
 *
 * Returns recurse_action_ERROR when an error was recorded,
 * recurse_action_SKIP when `parent` is a '!' tag that was just closed,
 * and recurse_action_SUCCESS otherwise.
 *
 * NOTE(review): every path that increments the local error count jumps
 * straight to `cleanup`, so the `*nerrors` update after the loop only
 * ever adds zero; callers must rely on the return value.
 * NOTE(review): the loop condition stops on token_END, so the token_END
 * cases inside the switches are never reached.
 */
static enum recurse_action_t node_read_recurse (const char *rootname, bool preamble,
                                                node_t *parent, char *input,
                                                size_t *index, int *nerrors)
{
   int nerrs = 0;
   struct token_t *token = NULL;
   enum recurse_action_t action;
   char *ancestors = NULL;

   /* Parsing HTML into a tree is stupidly tedious. Here are every
    * type of token we will ever see:
    *    END:           Dummy token returned on end-of-input
    *    TAGOPEN:       Token of a tag. Strips out '<' and '>'
    *    TAGCLOSE:      Token of a /tag. Strips out "</" and '>'
    *    IGNOPEN:       Token of a tag we will ignore. Strips out '<' and '>'
    *    IGNCLOSE:      Token of a /tag we will ignore. Strips out "</" and '>'
    *    TEXT:          Normal content encountered
    *    KP:            keypair value (either k=v, k='v' or k="v")
    *    GT:            The '>' character
    *    SELFCLOSING:   The "/>" two-character string.
    *
    * Here's the actions we will take for each token encountered:
    *
    * END:
    *    We end processing immediately, signalling error if parent is not
    *    named "<root>"
    * TAGOPEN:
    *    If preamble, signal error and return
    *    If !preamble, create a newnode, call recurse(newnode)
    * TAGCLOSE:
    *    if preamble, signal error and return
    *    If !preamble
    *       If token->text != parent->tag
    *          warn, then return success
    *       else
    *          return success
    * IGNOPEN:
    *    If preamble, signal error and return
    *    If !preamble, read and discard tokens until IGNCLOSE
    * IGNCLOSE:
    *    signal error and return
    * TEXT and KP
    *    If preamble we attach token as attr to parent
    *    If !preamble we attach token as text/content to parent
    * GT:
    *    If preamble
    *       If token->text does not start with with a '!'
    *          we set preamble to false
    *       else
    *          we inform caller to SKIP
    *    If !preamble we signal error and return
    * SELFCLOSING:
    *    If preamble we return success
    *    if !preamble we signal error and return
    */
    while ((token = next_token (input, index))
         && token->type != token_END) {
      node_t *newnode = NULL;
      // Trace of every token seen, written to stdout.
      FPRINTF (stdout, "<%s> (%i) %s[%s]\n",
            parent->tag, preamble, token_type_name (token->type), token->text);

      if (preamble) {
         switch (token->type) {
            case token_END:
               FPRINTF (stderr, "%s: Encountered EOF within attrs of [%s]\n",
                        node_ancestry(&ancestors, parent), parent->tag);
               nerrs++;
               goto cleanup;

            case token_TAGOPEN:  // Fallthrough
            case token_IGNOPEN:
               FPRINTF (stderr, "%s: Encountered tag [%s] within attrs of [%s]\n",
                        node_ancestry (&ancestors, parent),
                        token->text, parent->tag);
               nerrs++;
               goto cleanup;

            case token_TAGCLOSE: // Fallthrough
            case token_IGNCLOSE:
               FPRINTF (stderr, "%s: Encountered '</%s>' within attrs of [%s]\n",
                        node_ancestry (&ancestors, parent),
                        token->text, parent->tag);
               nerrs++;
               goto cleanup;

            case token_TEXT:  // Fallthrough
            case token_KP:
               if (!(node_add_attr (parent, token->text))) {
                  FPRINTF (stderr, "%s: Failed to add attr [%s] to [%s]\n",
                           node_ancestry (&ancestors, parent),
                           token->text, parent->tag);
                  nerrs++;
                  goto cleanup;
               }
               break;

            case token_GT:
               if (parent->tag[0] == '!') {
                  // A negative count is the internal marker for SKIP.
                  nerrs = -1;
                  goto cleanup;
               } else {
                  preamble = false;
               }
               break;

            case token_SELFCLOSING:
               goto cleanup;
         }
      } else {
         switch (token->type) {
            case token_END:
               if ((strcmp (parent->tag, "root")) != 0) {
                  FPRINTF (stderr, "%s: Encountered EOF while processing [%s]\n",
                           node_ancestry (&ancestors, parent),
                           parent->tag);
                  nerrs++;
               }
               goto cleanup;

            case token_TAGOPEN:
               if (!(newnode = node_new (parent, node_type_NODE, token->text))) {
                  FPRINTF (stderr, "%s: Failed to create new node [%s]\n",
                        node_ancestry (&ancestors, parent),
                        token->text);
                  nerrs++;
                  goto cleanup;
               }
               action = node_read_recurse (rootname, true,
                                           newnode, input, index, nerrors);
               if (action == recurse_action_ERROR) {
                  FPRINTF (stderr, "%s: Failed to read node [%s]\n",
                        node_ancestry (&ancestors, parent),
                        newnode->tag);
                  nerrs++;
                  goto cleanup;
               }
               if (action == recurse_action_SKIP) {
                  // Nothing is done for a skipped child at present.
               }
               break;

            case token_TAGCLOSE:
               if ((strcmp (token->text, parent->tag)) != 0) {
                  // Expected is the tag currently open (parent); got is
                  // the close tag that was actually read.
                  FPRINTF (stderr, "WARNING: %s: Expected </%s>, got </%s>\n",
                           node_ancestry (&ancestors, parent),
                           parent->tag, token->text);
               }
               goto cleanup;

            case token_IGNOPEN:
               // Discard everything up to the matching ignore-close tag.
               token_del (token);
               while ((token = (next_token (input, index)))) {
                  if (token->type == token_END) {
                     FPRINTF (stderr, "%s: Unexpected end of input\n",
                           node_ancestry (&ancestors, parent));
                     nerrs++;
                     goto cleanup;
                  }

                  FPRINTF (stdout, "TAG_IGNOPEN: <%s> (%i) %s[%s]\n",
                        parent->tag,
                        preamble,
                        token_type_name (token->type), token->text);

                  if (token->type == token_IGNCLOSE) {
                     break;
                  }

                  token_del (token);
                  token = NULL;
               }
               if (!token) {
                  FPRINTF (stderr, "%s: Unexpected end of input looking for "
                                   "ignoretag\n",
                                   node_ancestry (&ancestors, parent));

                  nerrs++;
                  goto cleanup;
               }
               break;


            case token_IGNCLOSE:
                  FPRINTF (stderr, "%s: unexpected </%s>\n",
                           node_ancestry (&ancestors, parent), token->text);
                  nerrs++;
                  goto cleanup;

            case token_TEXT:  // Fallthrough
            case token_KP:
               if (!(newnode = node_new (parent, node_type_STRING, token->text))) {
                  FPRINTF (stderr, "%s: Failed to create new text [%s]\n",
                        node_ancestry (&ancestors, parent),
                        token->text);
                  nerrs++;
                  goto cleanup;
               }
               break;

            case token_GT: // Fallthrough
            case token_SELFCLOSING:
               FPRINTF (stderr, "%s: Unexpected [%s]\n",
                     node_ancestry (&ancestors, parent),
                     token->text);
               nerrs++;
               goto cleanup;
         }
      }
      token_del (token);
   }

   if (nerrors) {
      *nerrors = *nerrors + nerrs;
   }

cleanup:
   free (ancestors);
   token_del (token);
   if (nerrs > 0) {
      return recurse_action_ERROR;
   }
   if (nerrs < 0) {
      return recurse_action_SKIP;
   }

   return recurse_action_SUCCESS;
}


/* Read and parse the file `fname` into a tree.
 *
 * The root node's tag is a copy of `fname`. After a successful parse the
 * tree is printed to stdout via print_node().
 *
 * Returns the root node (release with node_del()), or NULL on failure.
 * When `nerrors` is non-NULL it receives the number of errors recorded
 * here: zero on success and non-zero whenever NULL is returned.
 */
static node_t *node_read (const char *fname, int *nerrors)
{
   int nerrs = 0;

   node_t *root = NULL;
   size_t input_len = 0;
   char *input = file_slurp (fname, &input_len);
   if (!input) {
      FPRINTF (stderr, "Failed to read file into memory, aborting file read\n");
      nerrs++;
      goto cleanup;
   }

   size_t index = 0;
   if (!(root = node_new (NULL, node_type_NODE, fname))) {
      FPRINTF (stderr, "Failed to create a root node for [%s]\n", fname);
      // Count this failure so the caller never sees NULL with zero errors.
      nerrs++;
      goto cleanup;
   }

   // SKIP means a top-level '!' tag was consumed; keep parsing from
   // where it left off.
   enum recurse_action_t action = recurse_action_SKIP;
   while ((action = node_read_recurse (fname, false, root, input, &index, &nerrs))
            == recurse_action_SKIP) {
      ;
   }
   if (action == recurse_action_ERROR) {
      FPRINTF (stderr, "Failure parsing file [%s]\n", fname);
      nerrs++;
      goto cleanup;
   }

   node_visit (root, 0, stdout, print_node);

cleanup:
   if (nerrs) {
      node_del (root);
      root = NULL;
   }
   if (nerrors) {
      *nerrors = nerrs;
   }
   free (input);

   return root;
}


/* ********************************************************
 * selector.c module
 * Selector functions.
 *
 * The goal is to locate, in the input HTML, specific data, and extract
 * that data in plain text form.
 *
 * While it would be nice to eventually support any expression supported
 * by `querySelectorAll()` due to existing webdev familiarity, that's a
 * large (and probably painful) undertaking.
 *
 * Selectors  should look like this:
 *    "div > id^='-test' ~ .myClass"
 *
 */

// Kinds of element in a parsed selector (see struct selector_t).
enum selector_type_t {
   selector_END = -2,      // Marks the end of the selector input.
   selector_ERROR = -1,    // Carries no payload.
   selector_OPERAND = 1,   // Payload is an owned string (u._operand).
   selector_OPERATOR,      // Payload is an operator character (u._operator).
   selector_STRING,        // Payload is an owned string (u._string).
};

/* Return the enumerator name of `type` as a string (e.g.
 * "selector_OPERAND").
 *
 * For a value that is not a known enumerator, a message is formatted
 * into a static buffer and returned; that buffer is overwritten by the
 * next such call.
 */
const char *selector_type_name (enum selector_type_t type)
{
#define SEL_CASE(x)  case x: return #x
   switch (type) {
      SEL_CASE (selector_END);
      SEL_CASE (selector_ERROR);
      SEL_CASE (selector_OPERAND);
      SEL_CASE (selector_OPERATOR);
      SEL_CASE (selector_STRING);
   }
#undef SEL_CASE

   static char unknown[55];
   snprintf (unknown, sizeof unknown, "Unknown selector type: %i", type);
   return unknown;
}


// One element of a parsed selector expression: a type tag plus the
// payload that goes with it. Created by selector_new(), released by
// selector_del().
struct selector_t {
   /* **********************************************
    * A selector_t element is one of _END, _ERROR,
    * _OPERAND, _OPERATOR or _STRING. The type decides
    * which member of the union below is in use; _END
    * and _ERROR carry no payload.
    */
   enum selector_type_t type;
   union {
      /* **********************************************
       * Attribute we want to match. There are some
       * that are reserved for internal use and cannot
       * be used by the HTML we are processing.
       *    1. \x02tagname
       *    2. \x02content
       * In general any string starting with \x02 is
       * reserved for internal use. HTML pages using
       * tags, classnames or attribute names that
       * start with \x02 will have incorrect search
       * results.
       */
      char *_operand;

      /* **********************************************
       * Operators:
       * ,     Logical OR
       * ~     Any sibling
       * +     Adjacent sibling
       * >     Direct child
       * $     Wildcard for end of string
       * ^     Wildcard for beginning of string
       * *     Wildcard for 'anywhere in string'
       * =     Match exact
       * \x03  Descendant
       */
      int _operator;

      /* **********************************************
       * Stores the string that an attribute value is
       * matched against.
       */
      char *_string;
   } u;
};

static void selector_dump (struct selector_t *s, char **dst)
{
   if (!s) {
      FPRINTF (stderr, "Got NULL selector_t object\n");
      return;
   }

   tprintf (dst, "[%s:", selector_type_name (s->type));
   switch (s->type) {
      case selector_END:
      case selector_ERROR:    tprintf (dst, "...");                 break;
      case selector_OPERAND:  tprintf (dst, "%s", s->u._operand);   break;
      case selector_STRING:   tprintf (dst, "%s", s->u._string);    break;
      case selector_OPERATOR: tprintf (dst, "%c", s->u._operator);  break;
   }
   tprintf (dst, "]\n");
}

/* Release a selector element, including the string payload owned by
 * _OPERAND and _STRING elements. NULL is accepted and ignored.
 */
static void selector_del (struct selector_t *sel)
{
   if (sel == NULL) {
      return;
   }

   switch (sel->type) {
      case selector_OPERAND:
         free (sel->u._operand);
         break;

      case selector_STRING:
         free (sel->u._string);
         break;

      default:
         break;
   }

   free (sel);
}

/* Create a selector element of the given `type`.
 *
 *    _OPERATOR            stores `operator`; `operand` is not used.
 *    _OPERAND / _STRING   store a private copy of `operand`.
 *    _END / _ERROR        store nothing.
 *
 * Returns the new element (release with selector_del()), or NULL when
 * an allocation fails or the string to copy is NULL.
 */
static struct selector_t *selector_new (enum selector_type_t type,
                                        int operator,
                                        const char *operand)
{
   struct selector_t *sel = calloc (1, sizeof *sel);
   if (sel == NULL) {
      FPRINTF (stderr, "OOM error allocating selector_t object [%s]\n", operand);
      return NULL;
   }
   sel->type = type;

   if (type == selector_OPERATOR) {
      sel->u._operator = operator;
      return sel;
   }

   if (type == selector_STRING || type == selector_OPERAND) {
      sel->u._string = sstrdup (operand);
      if (sel->u._string == NULL) {
         FPRINTF (stderr, "OOM error allocating operand field [%s]\n", operand);
         free (sel);
         return NULL;
      }
   }

   return sel;
}

/* Cut the next delimiter-terminated word out of `src`, in place.
 *
 * Starting at `src[*index]`, the word extends up to (not including) the
 * first character found in `delims`, or to the end of the string. That
 * terminating character is overwritten with NUL and `*index` is advanced
 * to its position. Returns a pointer to the start of the word inside
 * `src` (not a copy).
 */
static char *delimited_string (char *src, size_t *index, const char *delims)
{
   char *const word = src + *index;
   const size_t span = strcspn (word, delims);

   word[span] = 0;
   *index += span;
   return word;
}

// Characters treated as selector operators / punctuation.
#define SOPERATORS      ".#[,~>+$*^="
// Characters that end a word in a selector: quotes, whitespace and every
// operator character above.
#define SDELIMS         "'\" \t\n\r" SOPERATORS

/* Append the three elements of an exact attribute match to `dst`:
 * an OPERAND holding `attr`, an '=' OPERATOR, and a STRING holding the
 * next delimited word read from `s` at `*index`.
 *
 * Returns true on success and false as soon as one of the appends fails
 * (elements appended before the failure stay in `dst`).
 */
static enum selector_type_t _selector_read_attr (list_t *dst,
                                                 const char *attr,
                                                 char *s, size_t *index)
{
   struct selector_t *part = selector_new (selector_OPERAND, 0, attr);
   if (list_append (dst, part) == NULL) {
      goto failed;
   }

   part = selector_new (selector_OPERATOR, '=', NULL);
   if (list_append (dst, part) == NULL) {
      goto failed;
   }

   part = selector_new (selector_STRING, 0,
                        delimited_string (s, index, SDELIMS));
   if (list_append (dst, part) == NULL) {
      goto failed;
   }

   return true;

failed:
   FPRINTF (stderr, "Failed to read direct attr ref [%s]\n", attr);
   return false;
}

/* Read a class selector such as ".myClassName": skips the character at
 * `*index` and appends a "class" attribute match for the word after it.
 */
static enum selector_type_t selector_read_class (list_t *dst,
                                                 char *s, size_t *index)
{
   *index += 1;   // step over the leading '.'
   return _selector_read_attr (dst, "class", s, index);
}

/* Read an id selector such as "#myId": skips the character at `*index`
 * and appends an "id" attribute match for the word after it.
 */
static enum selector_type_t selector_read_id (list_t *dst,
                                              char *s, size_t *index)
{
   *index += 1;   // step over the leading '#'
   return _selector_read_attr (dst, "id", s, index);
}

/* Read the attribute name of a bracketed selector such as
 * "[someAttr=somevalue]": skips the character at `*index`, reads the
 * next delimited word as the attribute name and passes it on to
 * _selector_read_attr().
 */
static enum selector_type_t selector_read_attrname (list_t *dst,
                                                    char *s, size_t *index)
{
   *index += 1;   // step over the leading '['
   const char *attrname = delimited_string (s, index, SDELIMS);
   return _selector_read_attr (dst, attrname, s, index);
}

/* ********************************************************
 * Reads a single-character operator and appends it to `dst` as a
 * selector_OPERATOR entry.
 *
 *   e.g. "s1 > s2"
 *            ^   <---- only this character is consumed and stored
 *   e.g. "[someAttr ~ someValue]"
 *                   ^   <---- only this character is consumed and stored
 *
 * `*index` is advanced by one on success and left untouched on failure.
 *
 * Returns a positive value on success (selector_parse() keeps reading
 * while the result is > 0) and selector_ERROR on failure.
 */
static enum selector_type_t selector_read_op (list_t *dst,
                                              char *s, size_t *index)
{
   if (!(list_append(dst, selector_new (selector_OPERATOR, s[*index], NULL)))) {
      FPRINTF (stderr, "Failed to create operator %c\n", s[*index]);
      // Use the enum's error value rather than `false`, which is not a
      // member of selector_type_t.
      return selector_ERROR;
   }
   (*index)++;
   return true;
}

/* ********************************************************
 * Reads the first character of a match operator ('$', '*' or '^', as
 * dispatched by _selector_read_next()) and appends it to `dst` as a
 * selector_OPERATOR entry.
 *
 *   e.g. [someAttr^=someValue]
 *                 ^   <---- stored as the operator
 *
 * `*index` is advanced by one on success and left untouched on failure.
 *
 * NOTE(review): the original comment stated that both characters of
 * "^=" are consumed, but only one character is consumed here; the '='
 * is left for the next read. Confirm which behaviour is intended.
 *
 * Returns a positive value on success (selector_parse() keeps reading
 * while the result is > 0) and selector_ERROR on failure.
 */
static enum selector_type_t selector_read_match (list_t *dst,
                                                 char *s, size_t *index)
{
   if (!(list_append(dst, selector_new (selector_OPERATOR, s[*index], NULL)))) {
      FPRINTF (stderr, "Failed to create operator %c\n", s[*index]);
      // Use the enum's error value rather than `false`, which is not a
      // member of selector_type_t.
      return selector_ERROR;
   }
   (*index)++;
   return true;
}

/* ********************************************************
 * Reads a quoted string starting at `s[*index]` via parse_string() and
 * appends it to `dst` as a selector_STRING entry.
 *
 * Returns a positive value on success (selector_parse() keeps reading
 * while the result is > 0) and selector_ERROR on failure.
 *
 * NOTE(review): the ownership of the pointer returned by parse_string()
 * and its behaviour on malformed input are not visible here - confirm
 * that nothing leaks and that a NULL result is handled by
 * selector_new().
 */
static enum selector_type_t selector_read_string (list_t *dst,
                                                  char *s, size_t *index)
{
   char *string = parse_string (s, index);
   if (!(list_append (dst, selector_new (selector_STRING, 0, string)))) {
      FPRINTF (stderr, "Failed to read string into selector_t list\n");
      // Use the enum's error value rather than `false`, which is not a
      // member of selector_type_t.
      return selector_ERROR;
   }

   return true;
}

/* ********************************************************
 * Handles a bare name (anything not introduced by an operator or a
 * quote). The name is recorded as a match on the pseudo-attribute
 * "\x02tagname".
 */
static enum selector_type_t selector_read_tag (list_t *dst,
                                               char *s, size_t *index)
{
   static const char *tag_attr = "\x02tagname";
   return _selector_read_attr (dst, tag_attr, s, index);
}


/* **************************************************************
 * Reads the next element of the selector expression `sq`, starting at
 * `*index`, and appends what was read to `dst`.
 *
 * Leading whitespace is skipped first. When the end of the input is
 * reached a selector_END entry is appended and selector_END is
 * returned (selector_ERROR if that append fails). Otherwise the first
 * character decides which reader is invoked, and that reader's result
 * is returned unchanged.
 */
static enum selector_type_t _selector_read_next (list_t *dst,
                                                 char *sq, size_t *index)
{
   // Skip over any whitespace preceding the next element
   for (; is_space (sq[*index]); (*index)++) {
      ;
   }

   const char first = sq[*index];

   // Nothing left: terminate the list
   if (first == '\0') {
      if (list_append (dst, selector_new (selector_END, 0, NULL))) {
         return selector_END;
      }
      return selector_ERROR;
   }

   // Dispatch on the first character of the element
   switch (first) {
      case '.':
         return selector_read_class (dst, sq, index);

      case '#':
         return selector_read_id (dst, sq, index);

      case '[':
         return selector_read_attrname (dst, sq, index);

      case ',':   // Fallthrough
      case '~':   // Fallthrough
      case '>':   // Fallthrough
      case '=':   // Fallthrough
      case '+':
         return selector_read_op (dst, sq, index);

      case '$':   // Fallthrough
      case '*':   // Fallthrough
      case '^':
         return selector_read_match (dst, sq, index);

      case '"':   // Fallthrough
      case '\'':
         return selector_read_string (dst, sq, index);

      default:
         return selector_read_tag (dst, sq, index);
   }
}

/* **************************************************************
 * Parses the selector expression `sq` into a newly created list of
 * selector_t objects (the list is created with selector_del as its
 * element destructor).
 *
 * Parsing works on a private copy of `sq`, because the readers modify
 * the text in place; the copy is released before returning.
 *
 * Elements are read until _selector_read_next() returns a value that is
 * not greater than zero. Anything other than selector_END at that point
 * is treated as a failure, in which case the list is released with
 * list_free(). Returns NULL when the copy cannot be made.
 */
static list_t *selector_parse (const char *sq)
{
   char *copy = sstrdup (sq);
   if (!copy) {
      FPRINTF (stderr, "Failed to create a copy of the input\n");
      return NULL;
   }

   bool failed = true;
   size_t pos = 0;

   list_t *ret = list_new ((void (*) (void *))selector_del);
   if (!ret) {
      FPRINTF (stderr, "Failed to create list of selector operations for [%s]\n",
            copy);
      goto cleanup;
   }

   // Keep reading elements until the reader signals end or error
   enum selector_type_t type;
   do {
      type = _selector_read_next (ret, copy, &pos);
   } while (type > 0);

   if (type != selector_END) {
      FPRINTF (stderr, "Encountered errors while processing [%s]\n", copy);
      goto cleanup;
   }

   failed = false;

cleanup:
   free (copy);
   if (failed) {
      list_free (&ret);
   }
   return ret;
}


/* **************************************************************
 * The main program module that contains `int main()`.
 */
/* Placeholder for running a single query against a parsed document
 * tree. Searching is not implemented yet: both parameters are ignored
 * and 1 is always returned, which process_file() reports as one error.
 */
static int process_query (const char *query, const node_t *tree)
{
   (void)query;
   (void)tree;
   return 1;
}

/* Parses the file `fname` into a node tree and runs every query in
 * `queries` against it.
 *
 * Returns the error count produced by node_read(), plus one when no
 * tree could be built. Errors reported by individual queries are logged
 * to stderr but are not added to the returned count.
 */
static int process_file (const char *fname, list_t *queries)
{
   int nerrs = 0;
   node_t *tree = node_read (fname, &nerrs);

   if (tree) {
      for (size_t i=0; i<queries->nitems; i++) {
         // Parallelisation also easily possible right here. This is
         // a better spot than the loop in `main()`.
         const char *query = (const char *)queries->items[i];
         int rc = process_query (query, tree);
         if (!rc) {
            continue;
         }
         const char *s_err = rc > 1 ? "errors" : "error";
         FPRINTF (stderr, "%i %s found while processing [%s:%s]. Ignoring\n",
                  rc, s_err, fname, query);
      }
   } else {
      FPRINTF (stderr, "Failed to parse input from [%s]\n", fname);
      nerrs++;
   }

   node_del (tree);

   return nerrs;
}

/* Prints the usage/help text to stdout, one table entry per line.
 *
 * The text is kept in step with the option parsing in main(): queries
 * are given with -q, and -v prints the version without exiting.
 */
static void print_helpmsg (void)
{
   static const char *helpmsg[] = {
"NAME",
"  HtmlQuery: a program to perform queries and minimal transformations",
"     on HTML input.",
"",
"SYNOPSIS",
"  htmlq [[-q <query-string>] ...] [-f <filename>] ...",
"  htmlq [[-q <query-string>] ...]",
"  htmlq -h",
"",
"DESCRIPTION",
"     Zero or more query expressions must be specified with [-q query-string].",
"     When no query expressions are found, nothing is returned. Zero or more",
"     input files can be specified with [-f <filename>]. When no input files",
"     are specified, input is read from stdin.",
"",
"     HtmlQuery returns the number of errors encountered to the caller.",
"",
"OPTIONS",
"  -f <filename>     Path to the  input file. This option can be repeated as",
"                    many times as necessary to process multiple files at the",
"                    same time.",
"  -q <query-string> Specify the query-string to match. This option can be",
"                    repeated to specify multiple query-strings. A match is",
"                    made when *any* query-string matches.",
"  -h                Print this message and exit with a zero exit code.",
"  -v                Print the program version and continue processing.",
"",
"BUGS",
"  Very likely. Send bug reports to lee@rundata.co.za, with the title set",
"  to 'bug-report: htmlq' or similar.",
"",
   };
   for (size_t i=0; i<sizeof helpmsg/sizeof helpmsg[0]; i++) {
      printf ("%s\n", helpmsg[i]);
   }
}


/* Program entry point.
 *
 * Options are read from the start of the command line until the first
 * argument that does not begin with '-':
 *    -1             run the tokeniser self-test and exit with its result
 *    -2             run the selector self-test and exit with its result
 *    -f <filename>  add an input file
 *    -q <query>     add a query string
 *    -h             print the help text and exit successfully
 *    -v             print the program version and carry on
 *
 * Every file collected with -f is then processed against all queries.
 * The return value is the total number of errors reported by
 * process_file(), or EXIT_FAILURE when setup or option parsing fails.
 */
int main (int argc, char **argv)
{
   int ret = EXIT_FAILURE;

   list_t *files = list_new (free);
   list_t *queries = list_new (free);

   if (!files || !queries) {
      // %p requires a `void *` argument
      FPRINTF (stderr, "Failed to allocate arrays [files:queries] [%p:%p]\n",
               (void *)files, (void *)queries);
      goto cleanup;
   }

   int tokeniser_test (void);
   int selector_test (void);

   // Parse all the options
   int argv_index = 0;
   for (argv_index=1; argv_index<argc && argv[argv_index]; argv_index++) {
      if ((argv[argv_index][0]) != '-') {
         break;
      }
      switch (argv[argv_index][1]) {
         case '1':
            ret = tokeniser_test ();
            goto cleanup;

         case '2':
            ret = selector_test ();
            goto cleanup;

         case 'f':
            // argv[argc] is NULL, so a trailing -f has no value to store.
            // Report that explicitly instead of as an OOM error.
            if (!argv[argv_index + 1]) {
               FPRINTF (stderr, "Option [-f] requires a filename argument\n");
               goto cleanup;
            }
            if (!(list_append (files, sstrdup (argv[++argv_index])))) {
               FPRINTF (stderr, "OOM error storing option [-f %s]\n",
                        argv[argv_index]);
               goto cleanup;
            }
            break;

         case 'q':
            // Same guard as for -f: a trailing -q has no value
            if (!argv[argv_index + 1]) {
               FPRINTF (stderr, "Option [-q] requires a query-string argument\n");
               goto cleanup;
            }
            if (!(list_append (queries, sstrdup (argv[++argv_index])))) {
               FPRINTF (stderr, "OOM error storing query-string [-q %s]\n",
                        argv[argv_index]);
               goto cleanup;
            }
            break;

         case 'h':
            print_helpmsg ();
            ret = EXIT_SUCCESS;
            goto cleanup;

         case 'v':
            printf ("Starting htmlq %s\n", VERSION);
            break;

         default:
            FPRINTF (stderr, "Unrecognised option flag '%s'\n", argv[argv_index]);
            goto cleanup;
      }
   }

   // From here on the return value is the accumulated error count
   ret = 0;
   for (size_t i=0; i<files->nitems; i++) {
      // Can process this in parallel, but why bother?
      int nerrs = process_file ((const char *)files->items[i], queries);
      if (nerrs) {
         const char *s_err = "error";
         if (nerrs > 1) {
            s_err = "errors";
         }
         FPRINTF (stderr, "%i %s processing file [%s]. Ignoring\n",
                  nerrs, s_err, (const char *)files->items[i]);
         ret += nerrs;
      }
   }

cleanup:
   list_free (&files);
   list_free (&queries);

   return ret;
}

/* Self-test for the tokeniser (run with the -1 option).
 *
 * Tokenises a fixed HTML snippet, renders every token as
 * "T <type-name>[<text>]" on its own line followed by a token count,
 * and compares the rendering with the expected text.
 *
 * Returns EXIT_SUCCESS only when the tokeniser reached token_END AND
 * the rendering is identical to the expected text; EXIT_FAILURE
 * otherwise (including when memory for the input or the rendering
 * could not be allocated).
 */
int tokeniser_test (void)
{
   char *input = sstrdup (
      "<!DOCTYPE attr1 attr2=\"value2\">"
      "<html >"
      "  <body attr3='value3'>"
      "  <!-- this is a comment"
      "     -->"
      "     <tag1 attr4='value4'/>"
      "     <tag2 attr5='value5' />"
      "     <tag3 attr6=value6/>"
      "     <tag4 > some text goes here <tag5> more text </tag5> </tag4>"
      "     <tag6 attr7> </tag6>"
      "     <tag7 attr8=value8> </tag7>"
      "     <script type=module src=where.js>"
      "     let tmp = \" </script> \";"
      "     let tmp = '\"</script>\"';"
      "     </script>"
      "     <tag8 attr9=value-10 > </tag8>"
      "     <tag9/>"
      "     <p>"
      "  </body>"
      "  <!--another comment-->"
      "</html >"
      );
   if (!input) {
      FPRINTF (stderr, "Failed to create a copy of the test input\n");
      return EXIT_FAILURE;
   }

   static const char *expected =
      "T token_TAGOPEN[!DOCTYPE]\n"
      "T token_TEXT[attr1]\n"
      "T token_KP[attr2=\"value2\"]\n"
      "T token_GT[>]\n"
      "T token_TAGOPEN[html]\n"
      "T token_GT[>]\n"
      "T token_TAGOPEN[body]\n"
      "T token_KP[attr3='value3']\n"
      "T token_GT[>]\n"
      "T token_TAGOPEN[!--]\n"
      "T token_TEXT[this]\n"
      "T token_TEXT[is]\n"
      "T token_TEXT[a]\n"
      "T token_TEXT[comment]\n"
      "T token_TEXT[--]\n"
      "T token_GT[>]\n"
      "T token_TAGOPEN[tag1]\n"
      "T token_KP[attr4='value4']\n"
      "T token_SELFCLOSING[/>]\n"
      "T token_TAGOPEN[tag2]\n"
      "T token_KP[attr5='value5']\n"
      "T token_SELFCLOSING[/>]\n"
      "T token_TAGOPEN[tag3]\n"
      "T token_TEXT[attr6=value6]\n"
      "T token_SELFCLOSING[/>]\n"
      "T token_TAGOPEN[tag4]\n"
      "T token_GT[>]\n"
      "T token_TEXT[some]\n"
      "T token_TEXT[text]\n"
      "T token_TEXT[goes]\n"
      "T token_TEXT[here]\n"
      "T token_TAGOPEN[tag5]\n"
      "T token_GT[>]\n"
      "T token_TEXT[more]\n"
      "T token_TEXT[text]\n"
      "T token_TAGCLOSE[tag5]\n"
      "T token_TAGCLOSE[tag4]\n"
      "T token_TAGOPEN[tag6]\n"
      "T token_TEXT[attr7]\n"
      "T token_GT[>]\n"
      "T token_TAGCLOSE[tag6]\n"
      "T token_TAGOPEN[tag7]\n"
      "T token_TEXT[attr8=value8]\n"
      "T token_GT[>]\n"
      "T token_TAGCLOSE[tag7]\n"
      "T token_IGNOPEN[script]\n"
      "T token_TEXT[type=module]\n"
      "T token_TEXT[src=where.js]\n"
      "T token_GT[>]\n"
      "T token_TEXT[let]\n"
      "T token_TEXT[tmp]\n"
      "T token_TEXT[=]\n"
      "T token_TEXT[ </script> ]\n"
      "T token_TEXT[;]\n"
      "T token_TEXT[let]\n"
      "T token_TEXT[tmp]\n"
      "T token_TEXT[=]\n"
      "T token_TEXT[\"</script>\"]\n"
      "T token_TEXT[;]\n"
      "T token_IGNCLOSE[script]\n"
      "T token_TAGOPEN[tag8]\n"
      "T token_TEXT[attr9=value-10]\n"
      "T token_GT[>]\n"
      "T token_TAGCLOSE[tag8]\n"
      "T token_TAGOPEN[tag9]\n"
      "T token_SELFCLOSING[/>]\n"
      "T token_TAGOPEN[p]\n"
      "T token_GT[>]\n"
      "T token_TAGCLOSE[body]\n"
      "T token_TAGOPEN[!--another]\n"
      "T token_TEXT[comment--]\n"
      "T token_GT[>]\n"
      "T token_TAGCLOSE[html]\n"
      "Found 73 tokens\n";

   char *output = NULL;
   struct token_t *token = NULL;
   size_t index = 0;

   size_t ntokens = 0;

   // Cleared as soon as any line of the rendering cannot be appended
   bool rendered = true;

   while ((token = next_token (input, &index))
         && token->type != token_END) {
      ntokens++;
      if (!(tprintf (&output, "T %s[%s]\n",
                     token_type_name (token->type), token->text))) {
         rendered = false;
      }
      token_del (token);
   }

   if (!(tprintf (&output, "Found %zu tokens\n", ntokens))) {
      rendered = false;
   }

   // The loop must have stopped on the end token, not on a NULL token
   bool reached_end = token && token->type == token_END;

   token_del (token);
   free (input);

   // Never hand a NULL rendering to strcmp()
   bool matched = rendered && output && (strcmp (expected, output)) == 0;
   if (!matched) {
      FPRINTF (stderr, "Unexpected output. Expected:\n%s\nGot:\n%s\n",
            expected, output ? output : "(null)");
   }

   // A mismatch in the rendering fails the test, even when the end of the
   // input was reached cleanly.
   int ret = EXIT_FAILURE;
   if (reached_end && matched) {
      FPRINTF (stderr, "TEST: Success\n");
      ret = EXIT_SUCCESS;
   } else {
      FPRINTF (stderr, "TEST: Failure\n");
   }

   free (output);

   return ret;
}

/* Self-test for the selector parser (run with the -2 option).
 *
 * Parses each entry of `tests` with selector_parse(), dumps every
 * resulting selector_t into one accumulated string with
 * selector_dump(), and compares that string with `expected` using
 * strdiff().
 *
 * Returns EXIT_SUCCESS only when every test string parses and the
 * accumulated dump matches `expected`; EXIT_FAILURE otherwise.
 */
int selector_test (void)
{
   int ret = EXIT_FAILURE;
   static const char *tests[] = {
      "one",
      ".two",
      "#three",
      "four > five",
      "#six ~ .seven",
   };

   static const char *expected =
      "[selector_OPERAND:\x02tagname]\n"
      "[selector_OPERATOR:=]\n"
      "[selector_STRING:one]\n"
      "[selector_END:...]\n"
      "[selector_OPERAND:class]\n"
      "[selector_OPERATOR:=]\n"
      "[selector_STRING:two]\n"
      "[selector_END:...]\n"
      "[selector_OPERAND:id]\n"
      "[selector_OPERATOR:=]\n"
      "[selector_STRING:three]\n"
      "[selector_END:...]\n"
      "[selector_OPERAND:\x02tagname]\n"
      "[selector_OPERATOR:=]\n"
      "[selector_STRING:four]\n"
      "[selector_END:...]\n"
      "[selector_OPERAND:id]\n"
      "[selector_OPERATOR:=]\n"
      "[selector_STRING:six]\n"
      "[selector_END:...]\n";

   static const size_t ntests = sizeof tests / sizeof tests[0];

   char *output = NULL;

   list_t *selectors = NULL;
   for (size_t i=0; i<ntests; i++) {
      // Release the list from the previous iteration before reusing it
      list_free (&selectors);
      selectors = selector_parse (tests[i]);
      if (!selectors) {
         FPRINTF (stderr, "Failed to create list for selector objects\n");
         goto cleanup;
      }
      for (size_t j=0; j<selectors->nitems; j++) {
         selector_dump (selectors->items[j], &output);
      }
   }

   if (!output) {
      FPRINTF (stderr, "No output was produced by the selector dump\n");
      goto cleanup;
   }

   // Compare through local cursors so that the static `expected` pointer
   // is never modified through the addresses handed to strdiff().
   const char *want = expected;
   const char *got = output;

   // NOTE(review): a non-zero result from strdiff() is taken to mean that
   // the strings differ, as the original reporting code did - confirm
   // against strdiff().
   size_t processed = strdiff (&want, &got);
   if (processed) {
      FPRINTF (stderr, "processed before error: %zu characters\n", processed);
      FPRINTF (stderr, "Unexpected output. Expected:\n%s\nGot:\n%s\n",
               want, got);
      // A mismatch is a test failure; `ret` is still EXIT_FAILURE here
      goto cleanup;
   }

   ret = EXIT_SUCCESS;
cleanup:
   free (output);
   list_free (&selectors);
   return ret;
}