Skip to content

Instantly share code, notes, and snippets.

@dmsnell
Created December 13, 2023 12:24
Show Gist options
  • Save dmsnell/11db00126a1c039cc5cb4f34f0d83ef4 to your computer and use it in GitHub Desktop.
Save dmsnell/11db00126a1c039cc5cb4f34f0d83ef4 to your computer and use it in GitHub Desktop.
Early exploration of a C-based HTML Tag Processor
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "tag-processor.h"
struct tp_state tp_new(struct tp_slice html) {
struct tp_state s = {
.html = html.string,
.html_length = html.length
};
return s;
}
/**
 * Advances the processor to the next tag in the document.
 *
 * Each iteration locates a tag name, consumes the tag's attributes, and then
 * looks for the `>` that ends the tag. Whether the tag is reported to the
 * caller is decided by tp_matches().
 *
 * Every failure path marks the whole document as consumed, so calling this
 * function again after it returned false returns false immediately.
 *
 * @param s Processor state; updated in place.
 * @return true when the processor is positioned on a tag, false when no
 *         further complete tag could be found.
 */
bool tp_next_tag(struct tp_state *s) {
    TP_FLAG_UNSET(s, tp_at_match);

    while (s->bytes_already_parsed < s->html_length) {
        if (false == tp_parse_next_tag(s)) {
            s->bytes_already_parsed = s->html_length;
            return false;
        }

        // Consume attributes until the parser reports there are no more.
        while (tp_parse_next_attribute(s)) {
            continue;
        }

        offset_t tag_ends_at = tp_strpos(s, '>', s->bytes_already_parsed);
        if (!s->did_succeed) {
            // The tag is never closed. Finish the document here, exactly as
            // the other failure paths do, instead of leaving the cursor
            // parked in the middle of the incomplete tag.
            s->bytes_already_parsed = s->html_length;
            return false;
        }

        s->tag_ends_at = tag_ends_at;
        s->bytes_already_parsed = tag_ends_at;

        if (tp_matches(s)) {
            return true;
        }

        // Openers of elements whose content is not ordinary markup need that
        // content skipped before scanning for the next tag.
        // NOTE(review): nothing in this file assigns s->tag_name, so these
        // comparisons only ever see its zero-initialized value.
        if (!TP_FLAG(s, tp_is_closer)) {
            bool did_succeed = true;

            if (s->tag_name == tp_tag_script) {
                did_succeed = tp_skip_script_data(s);
            } else if (s->tag_name == tp_tag_textarea) {
                did_succeed = tp_skip_rcdata(s, tp_tag_textarea);
            } else if (s->tag_name == tp_tag_title) {
                did_succeed = tp_skip_rcdata(s, tp_tag_title);
            }

            if (!did_succeed) {
                s->bytes_already_parsed = s->html_length;
                return false;
            }
        }
    }

    return false;
}
/**
 * Looks up an attribute of the currently matched tag by name.
 *
 * Names are compared byte by byte, ignoring ASCII case. `s->did_succeed`
 * reports whether the attribute was found.
 *
 * @param s        Processor state; must be positioned on a matched tag.
 * @param tag_name Despite the parameter name, this is the *attribute* name to
 *                 look for, given as a slice (`string + start`, `length` bytes).
 * @return On success, a slice into the document (`string` is `s->html`)
 *         covering the attribute's value. Otherwise an empty slice with
 *         `s->did_succeed` set to false.
 */
struct tp_slice tp_get_attribute(struct tp_state *s, struct tp_slice tag_name) {
    s->did_succeed = false;

    // Test the flag; the previous code used TP_FLAG_SET here, which turned
    // the flag on instead of checking it and so never rejected the call.
    if (!TP_FLAG(s, tp_at_match) || 0 == tag_name.length) {
        return tp_empty_slice();
    }

    for (size_t i = 0; i < s->attribute_count; i++) {
        struct tp_attribute *a = &s->attributes[i];

        if (a->name.length != tag_name.length) {
            continue;
        }

        bool full_match = true;
        for (offset_t n = 0; n < tag_name.length; n++) {
            // <ctype.h> functions require values representable as
            // unsigned char; plain char may be negative for bytes >= 0x80.
            unsigned char document_byte = (unsigned char) s->html[a->name.start + n];
            // Honor the slice's own start offset when reading the wanted name.
            unsigned char wanted_byte = (unsigned char) tag_name.string[tag_name.start + n];

            if (toupper(document_byte) != toupper(wanted_byte)) {
                full_match = false;
                break;
            }
        }

        if (full_match) {
            struct tp_slice value = {
                .string = s->html,
                .start = a->value.start,
                .length = a->value.length
            };

            s->did_succeed = true;
            return value;
        }
    }

    return tp_empty_slice();
}
bool tp_parse_next_tag(struct tp_state *s) {
tp_after_tag(s);
offset_t at = s->bytes_already_parsed;
while (at < s->html_length) {
at = tp_strpos(s, '<', at);
if (!s->did_succeed) {
return false;
}
if (at < s->html_length && '/' == s->html[at + 1]) {
TP_FLAG_SET(s, tp_is_closer);
at++;
} else {
TP_FLAG_UNSET(s, tp_is_closer);
}
offset_t tag_name_prefix_length = tp_strspn_alpha(s, at + 1);
if (s->did_succeed && tag_name_prefix_length > 0) {
at++;
s->tag_name_length = tag_name_prefix_length + tp_strspn_no_tag_name_ender(s, at + tag_name_prefix_length);
s->tag_name_starts_at = at;
s->bytes_already_parsed = at + s->tag_name_length;
return true;
}
if (at + 1 >= s->html_length) {
return false;
}
if ('!' == s->html[at + 1]) {
if (
s->html_length > at + 3 &&
'-' == s->html[at + 2] &&
'-' == s->html[at + 3]
) {
offset_t closer_at = tp_strposs(s, "-->", 3, at + 4);
if (!s->did_succeed) {
return false;
}
at = closer_at + 3;
continue;
}
if (
s->html_length > at + 8 &&
'[' == s->html[at + 2] &&
'C' == s->html[at + 3] &&
'D' == s->html[at + 4] &&
'A' == s->html[at + 5] &&
'T' == s->html[at + 6] &&
'A' == s->html[at + 7] &&
'[' == s->html[at + 8]
) {
offset_t closer_at = tp_strposs(s, "]]>", 3, at + 9);
if (!s->did_succeed) {
return false;
}
at = closer_at + 3;
continue;
}
if (
s->html_length > at + 8 &&
'D' == (s->html[at + 2] & ~LOWERCASE_BIT) &&
'O' == (s->html[at + 3] & ~LOWERCASE_BIT) &&
'C' == (s->html[at + 4] & ~LOWERCASE_BIT) &&
'T' == (s->html[at + 5] & ~LOWERCASE_BIT) &&
'Y' == (s->html[at + 6] & ~LOWERCASE_BIT) &&
'P' == (s->html[at + 7] & ~LOWERCASE_BIT) &&
'E' == (s->html[at + 8] & ~LOWERCASE_BIT)
) {
offset_t closer_at = tp_strpos(s, '>', at + 9);
if (!s->did_succeed) {
return false;
}
at = closer_at + 1;
continue;
}
at = tp_strpos(s, '>', at + 1);
if (!s->did_succeed) {
return false;
}
continue;
}
if ('?' == s->html[at + 1]) {
offset_t closer_at = tp_strpos(s, '>', at + 2);
if (!s->did_succeed) {
return false;
}
at = closer_at + 1;
continue;
}
at++;
}
return false;
}
/**
 * Parses one attribute at the current position inside a tag.
 *
 * Leading whitespace and `/` bytes are skipped. The attribute is appended to
 * `s->attributes` and records three spans: the whole attribute, its name, and
 * its value. Attributes without `=` are flagged `tp_is_boolean` and get an
 * empty value.
 *
 * NOTE(review): the attribute slot is claimed before the value is parsed, so
 * a false return from the value-parsing paths leaves a partially filled entry
 * counted in `attribute_count`.
 *
 * @param s Processor state; updated in place.
 * @return true when an attribute was parsed and more input follows it, false
 *         when there is no further attribute, the attribute table is full, or
 *         the document ends before the attribute does.
 */
bool tp_parse_next_attribute(struct tp_state *s) {
    s->bytes_already_parsed += tp_strspn_ws_slash(s, s->bytes_already_parsed);
    if (s->bytes_already_parsed >= s->html_length) {
        return false;
    }

    // A leading `=` is taken as the first byte of the name rather than as a
    // separator between a name and a value.
    offset_t name_length = '=' == s->html[s->bytes_already_parsed]
        ? (1 + tp_strspn_no_attribute_name_ender(s, s->bytes_already_parsed + 1))
        : tp_strspn_no_attribute_name_ender(s, s->bytes_already_parsed);

    // Zero length means the cursor sits on a name-ending byte such as `>`.
    // The second test guarantees at least one byte follows the name.
    if (0 == name_length || ((s->bytes_already_parsed + name_length) >= s->html_length)) {
        return false;
    }

    offset_t attribute_start = s->bytes_already_parsed;
    s->bytes_already_parsed += name_length;

    tp_skip_ws(s);
    if (s->bytes_already_parsed >= s->html_length) {
        return false;
    }

    bool has_value = '=' == s->html[s->bytes_already_parsed];

    // The attribute table holds 256 entries (indices 0 through 255).
    if (s->attribute_count > 255) {
        return false;
    }

    struct tp_attribute *attr = &s->attributes[s->attribute_count++];
    attr->span.start = attribute_start;
    attr->name.start = attribute_start;
    attr->name.length = name_length;

    if (has_value) {
        TP_FLAG_UNSET(attr, tp_is_boolean);

        // Step over the `=` and any whitespace before the value.
        s->bytes_already_parsed++;
        tp_skip_ws(s);
        if (s->bytes_already_parsed >= s->html_length) {
            return false;
        }

        switch (s->html[s->bytes_already_parsed]) {
            case '"':
            case '\'': {
                char quote = s->html[s->bytes_already_parsed];
                attr->value.start = s->bytes_already_parsed + 1;

                offset_t quote_end = tp_strpos(s, quote, attr->value.start);
                if (!s->did_succeed) {
                    return false;
                }

                // The span runs from the first byte of the name through the
                // closing quote, so measure it from span.start. Measuring from
                // value.start (as before) made it cover only the value plus
                // one quote.
                attr->span.length = quote_end + 1 - attr->span.start;
                attr->value.length = quote_end - attr->value.start;
                s->bytes_already_parsed = quote_end + 1;
                break;
            }

            default:
                attr->value.start = s->bytes_already_parsed;
                attr->value.length = tp_strspn_unquoted_value(s, attr->value.start);
                if (!s->did_succeed) {
                    return false;
                }

                attr->span.length = (attr->value.start - attr->span.start) + attr->value.length;
                s->bytes_already_parsed = attr->span.start + attr->span.length;
                break;
        }
    } else {
        TP_FLAG_SET(attr, tp_is_boolean);
        attr->value.start = s->bytes_already_parsed;
        attr->value.length = 0;
        attr->span.length = name_length;
    }

    // An attribute that reaches the end of the document cannot be followed by
    // the `>` that would close its tag.
    if (attr->span.start + attr->span.length >= s->html_length) {
        return false;
    }

    return true;
}
bool tp_matches(struct tp_state *s) {
TP_FLAG_SET(s, tp_at_match);
return true;
}
/**
 * Placeholder for skipping the contents of a SCRIPT element.
 *
 * Not implemented: the state is neither read nor modified and failure is
 * reported unconditionally.
 *
 * @param s Processor state (unused).
 * @return Always false.
 */
bool tp_skip_script_data(struct tp_state *s) {
    (void) s;

    return false;
}
/**
 * Placeholder for skipping the contents of an element such as TEXTAREA or
 * TITLE (the two tags the caller in this file passes).
 *
 * Not implemented: neither argument is used and failure is reported
 * unconditionally.
 *
 * @param s   Processor state (unused).
 * @param tag Which element's content should be skipped (unused).
 * @return Always false.
 */
bool tp_skip_rcdata(struct tp_state *s, enum tp_tag_name tag) {
    (void) s;
    (void) tag;

    return false;
}
/**
 * Moves the parse cursor past any run of whitespace at the current position.
 *
 * Whitespace here means space, tab, form feed, carriage return and newline.
 * The cursor stops at the first other byte or at the end of the document.
 *
 * @param s Processor state; `bytes_already_parsed` is advanced in place.
 */
void tp_skip_ws(struct tp_state *s) {
    for (; s->bytes_already_parsed < s->html_length; s->bytes_already_parsed++) {
        switch (s->html[s->bytes_already_parsed]) {
            case ' ':
            case '\t':
            case '\f':
            case '\r':
            case '\n':
                // Whitespace: let the loop step over it.
                break;

            default:
                return;
        }
    }
}
/**
 * Resets the per-tag state before the next tag is parsed.
 *
 * Zeroes the whole attribute table, sets the attribute count back to zero and
 * clears the `tp_at_match` flag.
 *
 * @param s Processor state; updated in place.
 */
void tp_after_tag(struct tp_state *s) {
    // The table is an array member, so sizeof covers all of its entries.
    memset(s->attributes, 0, sizeof s->attributes);
    s->attribute_count = 0;

    TP_FLAG_UNSET(s, tp_at_match);
}
//////////////////////
// UTILITY
//////////////////////
struct tp_slice tp_empty_slice() {
struct tp_slice new_slice = {
.string = NULL,
.start = 0,
.length = 0
};
return new_slice;
}
struct tp_slice tp_make_slice(char *s) {
struct tp_slice new_slice = {
.string = s,
.start = 0,
.length = strlen(s)
};
return new_slice;
}
/**
 * Finds the first occurrence of a byte at or after a given offset.
 *
 * Because 0 is also a valid offset, callers must consult `s->did_succeed`
 * rather than the return value to tell a hit from a miss.
 *
 * @param s           Processor state; `did_succeed` reports the outcome.
 * @param c           Byte to look for.
 * @param starting_at Offset at which the search begins.
 * @return Offset of the byte when found, otherwise 0.
 */
offset_t tp_strpos(struct tp_state *s, char c, offset_t starting_at) {
    offset_t cursor = starting_at;

    while (cursor < s->html_length && s->html[cursor] != c) {
        cursor++;
    }

    // Stopping before the end means the loop stopped on a matching byte.
    s->did_succeed = cursor < s->html_length;

    return s->did_succeed ? cursor : 0;
}
/**
 * Finds the first occurrence of a byte sequence at or after a given offset.
 *
 * Earlier behavior, now corrected: the search began `needle_length` bytes
 * past `starting_at` (missing an occurrence located right there), a partial
 * match was still reported as success, and the reported offset was one byte
 * before the occurrence.
 *
 * Because 0 is also a valid offset, callers must consult `s->did_succeed`
 * rather than the return value to tell a hit from a miss.
 *
 * @param s             Processor state; `did_succeed` reports the outcome.
 * @param needle        Bytes to look for; need not be NUL-terminated.
 * @param needle_length Number of bytes in `needle`; zero never matches.
 * @param starting_at   Offset at which the search begins.
 * @return Offset of the first byte of the occurrence when found, otherwise 0.
 *         Add `needle_length` to move just past it.
 */
offset_t tp_strposs(struct tp_state *s, char *needle, offset_t needle_length, offset_t starting_at) {
    s->did_succeed = false;

    // Reject searches that cannot match. Phrasing the length check as a
    // subtraction keeps it safe from unsigned wrap-around.
    if (
        0 == needle_length ||
        starting_at > s->html_length ||
        needle_length > s->html_length - starting_at
    ) {
        return 0;
    }

    // Last offset at which the whole needle still fits inside the document.
    offset_t last_candidate = s->html_length - needle_length;

    for (offset_t at = starting_at; at <= last_candidate; at++) {
        if (0 == memcmp(s->html + at, needle, needle_length)) {
            s->did_succeed = true;
            return at;
        }
    }

    return 0;
}
/**
 * Measures the run of ASCII letters beginning at a given offset.
 *
 * Always sets `s->did_succeed` to true.
 *
 * @param s           Processor state.
 * @param starting_at Offset at which the run begins.
 * @return Number of consecutive bytes in A-Z or a-z, stopping at the first
 *         other byte or at the end of the document.
 */
offset_t tp_strspn_alpha(struct tp_state *s, offset_t starting_at) {
    offset_t cursor = starting_at;

    s->did_succeed = true;

    while (cursor < s->html_length) {
        char c = s->html[cursor];

        // Below 'A', between 'Z' and 'a', or above 'z': not a letter.
        if (c < 0x41 || (c > 0x5a && c < 0x61) || c > 0x7a) {
            return cursor - starting_at;
        }

        cursor++;
    }

    return s->html_length - starting_at;
}
/**
 * Measures an unquoted attribute value beginning at a given offset.
 *
 * The value ends at the first `>` or whitespace byte (space, tab, form feed,
 * carriage return, newline), or at the end of the document. Always sets
 * `s->did_succeed` to true.
 *
 * @param s           Processor state.
 * @param starting_at Offset of the first byte of the value.
 * @return Length of the value in bytes.
 */
offset_t tp_strspn_unquoted_value(struct tp_state *s, offset_t starting_at) {
    offset_t cursor = starting_at;

    s->did_succeed = true;

    while (cursor < s->html_length) {
        switch (s->html[cursor]) {
            case '>':
            case ' ':
            case '\t':
            case '\f':
            case '\r':
            case '\n':
                return cursor - starting_at;

            default:
                cursor++;
                break;
        }
    }

    return s->html_length - starting_at;
}
/**
 * Measures the run of non-whitespace bytes beginning at a given offset.
 *
 * Whitespace here means space, tab, form feed, carriage return and newline.
 * Always sets `s->did_succeed` to true.
 *
 * NOTE(review): when the run ends at a whitespace byte, a trace line is
 * written to stdout. None of the sibling span helpers print anything, so this
 * looks like leftover debugging output; it is kept to preserve behavior.
 *
 * @param s           Processor state.
 * @param starting_at Offset at which the run begins.
 * @return Number of bytes before the first whitespace byte or the end of the
 *         document.
 */
offset_t tp_strspn_no_ws(struct tp_state *s, offset_t starting_at) {
    s->did_succeed = true;

    for (offset_t at = starting_at; at < s->html_length; at++) {
        char c = s->html[at];

        if (c == ' ' || c == '\t' || c == '\f' || c == '\r' || c == '\n') {
            // offset_t is unsigned; print it with an unsigned conversion and
            // matching argument types rather than `%d`.
            printf("No whitespace: at = %u, %u chars\n", (unsigned) starting_at, (unsigned) (at - starting_at));
            return at - starting_at;
        }
    }

    return s->html_length - starting_at;
}
/**
 * Measures the run of whitespace and `/` bytes beginning at a given offset.
 *
 * Whitespace here means space, tab, form feed, carriage return and newline.
 * Always sets `s->did_succeed` to true.
 *
 * @param s           Processor state.
 * @param starting_at Offset at which the run begins.
 * @return Number of consecutive whitespace or `/` bytes.
 */
offset_t tp_strspn_ws_slash(struct tp_state *s, offset_t starting_at) {
    offset_t cursor = starting_at;

    s->did_succeed = true;

    while (cursor < s->html_length) {
        switch (s->html[cursor]) {
            case ' ':
            case '\t':
            case '\f':
            case '\r':
            case '\n':
            case '/':
                cursor++;
                continue;

            default:
                return cursor - starting_at;
        }
    }

    return s->html_length - starting_at;
}
/**
 * Measures the run of bytes that may continue a tag name.
 *
 * The run ends at the first `/`, `>` or whitespace byte (space, tab, form
 * feed, carriage return, newline), or at the end of the document. Always sets
 * `s->did_succeed` to true.
 *
 * @param s           Processor state.
 * @param starting_at Offset at which the run begins.
 * @return Length of the run in bytes.
 */
offset_t tp_strspn_no_tag_name_ender(struct tp_state *s, offset_t starting_at) {
    offset_t run_length = 0;

    s->did_succeed = true;

    while (starting_at + run_length < s->html_length) {
        const char c = s->html[starting_at + run_length];
        const bool is_whitespace = ' ' == c || '\t' == c || '\f' == c || '\r' == c || '\n' == c;

        if ('/' == c || '>' == c || is_whitespace) {
            return run_length;
        }

        run_length++;
    }

    return s->html_length - starting_at;
}
/**
 * Measures the run of bytes that may continue an attribute name.
 *
 * The run ends at the first `=`, `/`, `>` or whitespace byte (space, tab,
 * form feed, carriage return, newline), or at the end of the document. Always
 * sets `s->did_succeed` to true.
 *
 * @param s           Processor state.
 * @param starting_at Offset at which the run begins.
 * @return Length of the run in bytes.
 */
offset_t tp_strspn_no_attribute_name_ender(struct tp_state *s, offset_t starting_at) {
    s->did_succeed = true;

    for (offset_t cursor = starting_at; cursor < s->html_length; cursor++) {
        bool ends_name;

        switch (s->html[cursor]) {
            case '=':
            case '/':
            case '>':
            case ' ':
            case '\t':
            case '\f':
            case '\r':
            case '\n':
                ends_name = true;
                break;

            default:
                ends_name = false;
                break;
        }

        if (ends_name) {
            return cursor - starting_at;
        }
    }

    return s->html_length - starting_at;
}
//////////////////////
// DEBUG
//////////////////////
/**
 * Dumps the document and the most recently parsed tag to stdout.
 *
 * Prints the whole document, then the tag name with its attribute count, then
 * one line per attribute. Boolean attributes are shown with the value `true`.
 *
 * @param s Processor state to describe; not modified.
 */
void tp_dbg(struct tp_state *s) {
    fwrite(s->html, sizeof(char), s->html_length, stdout);
    printf("\n");

    // A parsed tag name always follows a `<`, so it never starts at offset 0;
    // zero therefore means no tag has been parsed yet.
    if (0 == s->tag_name_starts_at) {
        return;
    }

    printf(" <");
    tp_print_substr(s, s->tag_name_starts_at, s->tag_name_length);
    // attribute_count is a size_t, which requires the `z` length modifier.
    printf(" (%zu)>\n", s->attribute_count);

    for (size_t i = 0; i < s->attribute_count; i++) {
        struct tp_attribute *attr = &s->attributes[i];

        printf(" - ");
        tp_print_substr(s, attr->name.start, attr->name.length);
        printf(" : ");

        if (TP_FLAG(attr, tp_is_boolean)) {
            printf("true");
        } else {
            tp_print_substr(s, attr->value.start, attr->value.length);
        }

        printf("\n");
    }
}
/**
 * Writes part of the document to stdout.
 *
 * The requested range is clipped to the end of the document. A range that
 * starts at or beyond the end prints nothing.
 *
 * @param s      Processor state holding the document.
 * @param at     Offset of the first byte to print.
 * @param length Maximum number of bytes to print.
 */
void tp_print_substr(struct tp_state *s, offset_t at, offset_t length) {
    // Without this guard `html_length - at` would wrap around for an
    // out-of-range start and fwrite would read far past the buffer.
    if (at >= s->html_length) {
        return;
    }

    // Compare against the bytes remaining instead of computing `at + length`,
    // which could itself wrap around.
    offset_t available = s->html_length - at;
    offset_t count = length < available ? length : available;

    fwrite(s->html + at, sizeof(char), count, stdout);
}
int main(void) {
// printf("0 1 2 3 4 5\n");
// printf(" 1234 6789 1234 6789 1234 6789 1234 6789 1234 6789 1234 6789\n");
// char *html = "<div class=\"wp-group-block\" id='top'><img src=atat.png lazy srcset=\"2x,atat-512.png\"></div>";
// struct tp_state *s = tp_new(tp_make_slice(html));
struct tp_slice html = tp_empty_slice();
FILE *html_spec = fopen("/Users/dmsnell/Downloads/single-page.html", "r");
fseek(html_spec, 0, SEEK_END);
html.length = ftell(html_spec);
rewind(html_spec);
html.string = malloc(html.length);
fread(html.string, html.length, sizeof(char), html_spec);
fclose(html_spec);
struct tp_state s = tp_new(html);
// tp_dbg(s);
long tag_count = 0;
while (tp_next_tag(&s)) {
tag_count++;
}
printf("Found %ld tags\n", tag_count);
// tp_next_tag(s);
// tp_dbg(s);
// printf("TP at %d: '", s->tag_name_starts_at);
// tp_print_substr(s, s->tag_name_starts_at, s->tag_name_length);
// printf("'\n");
// tp_next_tag(s);
// tp_dbg(s);
// printf("TP at %d: '", s->tag_name_starts_at);
// tp_print_substr(s, s->tag_name_starts_at, s->tag_name_length);
// printf("'\n");
// struct tp_slice src = tp_get_attribute(s, tp_make_slice("SRC"));
// if (s->did_succeed) {
// printf(" 'src' = \"");
// fwrite(&src.string[src.start], sizeof(char), src.length, stdout);
// printf("\"\n");
// } else {
// printf(" 'src' not found\n");
// }
// struct tp_slice srcer = tp_get_attribute(s, tp_make_slice("srcer"));
// if (s->did_succeed) {
// printf(" 'srcer' = \"");
// fwrite(&srcer.string[src.start], sizeof(char), srcer.length, stdout);
// printf("\"\n");
// } else {
// printf(" 'srcer' not found\n");
// }
return 0;
}
#ifndef TAG_PROCESSOR_H
#define TAG_PROCESSOR_H

// The declarations below use bool, size_t and the fixed-width integer types,
// so pull in their headers here instead of relying on the includer.
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/** Byte offset, or byte count, within the HTML document. */
typedef uint32_t offset_t;

/** Bit flags stored in `struct tp_state.flags`. */
enum tp_flags {
    tp_at_match = 1,      // the processor is positioned on a reported tag
    tp_is_closer = 1 << 1 // the current tag began with `</`
};

/** Bit flags stored in `struct tp_attribute.flags`. */
enum tp_attribute_flags {
    tp_is_boolean = 1 // the attribute was written without `=value`
};

/** Tag names the processor distinguishes. */
enum tp_tag_name {
    tp_tag_custom = 0,
    tp_tag_a,
    tp_tag_blockquote,
    tp_tag_div,
    tp_tag_img,
    tp_tag_p,
    tp_tag_q,
    tp_tag_script,
    tp_tag_textarea,
    tp_tag_title
};

// Flag helpers. `s` is a pointer to any struct with a `flags` member and `f`
// is one or more flag bits. Both arguments are parenthesized so that
// expressions such as `&attr` or `flag_a | flag_b` expand correctly.
#define TP_FLAG(s, f) (0 != ((s)->flags & (f)))
#define TP_FLAG_SET(s, f) ((s)->flags |= (f))
#define TP_FLAG_UNSET(s, f) ((s)->flags &= ~(f))

/** Bit that separates ASCII lowercase letters from their uppercase forms. */
#define LOWERCASE_BIT (0x20)

/** A run of `length` bytes beginning at `string + start`. */
struct tp_slice {
    char *string;
    offset_t start;
    offset_t length;
};

/** A run of `length` bytes beginning at offset `start` of the document. */
struct tp_span {
    offset_t start;
    offset_t length;
};

/** Location of one parsed attribute within the document. */
struct tp_attribute {
    struct tp_span span;  // the whole attribute
    struct tp_span name;  // the attribute name
    struct tp_span value; // the attribute value, without surrounding quotes
    uint8_t flags;        // enum tp_attribute_flags bits
};

/** Complete state of a tag processor. */
struct tp_state {
    char *html;                    // document bytes
    offset_t html_length;          // number of document bytes
    char *output_buffer;           // not referenced by tag-processor.c
    offset_t bytes_already_parsed; // parse cursor
    offset_t bytes_already_copied; // not referenced by tag-processor.c
    uint8_t flags;                 // enum tp_flags bits
    offset_t tag_name_starts_at;   // offset of the current tag's name
    offset_t tag_name_length;      // length of the current tag's name
    offset_t tag_ends_at;          // offset of the `>` closing the current tag
    enum tp_tag_name tag_name;     // read by tp_next_tag(); never assigned in tag-processor.c
    char *custom_tag_name;         // not referenced by tag-processor.c
    size_t attribute_count;        // number of used entries in `attributes`
    // Capacity must stay in step with the literal 255/256 limits used in
    // tag-processor.c.
    struct tp_attribute attributes[256];
    bool did_succeed;              // outcome of the most recent helper call
};

struct tp_state tp_new(struct tp_slice html);
bool tp_next_tag(struct tp_state *s);
struct tp_slice tp_get_attribute(struct tp_state *s, struct tp_slice tag_name);
bool tp_parse_next_tag(struct tp_state *s);
bool tp_parse_next_attribute(struct tp_state *s);
bool tp_matches(struct tp_state *s);
bool tp_skip_script_data(struct tp_state *s);
bool tp_skip_rcdata(struct tp_state *s, enum tp_tag_name tag);
void tp_skip_ws(struct tp_state *s);
void tp_after_tag(struct tp_state *s);
void tp_dbg(struct tp_state *s);
void tp_print_substr(struct tp_state *s, offset_t at, offset_t length);
struct tp_slice tp_empty_slice(void);
struct tp_slice tp_make_slice(char *s);
offset_t tp_strpos(struct tp_state *s, char c, offset_t offset);
offset_t tp_strposs(struct tp_state *s, char *c, offset_t length, offset_t offset);
offset_t tp_strspn_alpha(struct tp_state *s, offset_t starting_at);
offset_t tp_strspn_unquoted_value(struct tp_state *s, offset_t starting_at);
offset_t tp_strspn_no_ws(struct tp_state *s, offset_t starting_at);
offset_t tp_strspn_ws_slash(struct tp_state *s, offset_t starting_at);
offset_t tp_strspn_no_tag_name_ender(struct tp_state *s, offset_t starting_at);
offset_t tp_strspn_no_attribute_name_ender(struct tp_state *s, offset_t starting_at);

#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment