Skip to content

Instantly share code, notes, and snippets.

@adamziel
Created December 13, 2023 11:52
Show Gist options
  • Save adamziel/422cb327ad72989477b33dfde75a39d1 to your computer and use it in GitHub Desktop.
Save adamziel/422cb327ad72989477b33dfde75a39d1 to your computer and use it in GitHub Desktop.
WP HTML Tag Processor, but written in C
/*
* This is a fun exploration I've done of porting the Tag Processor API over to C as a PHP extension.
* For now this is just a part of the parse_next_attribute() function.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>
/*
 * Capacity limits. Neither macro is referenced by the code visible in this
 * file. NOTE(review): presumably upper bounds for bookmarks and seek
 * operations - confirm before relying on them.
 */
#define MAX_BOOKMARKS 10
#define MAX_SEEK_OPS 1000
/*
 * One attribute recorded by parse_next_attribute().
 * Every offset below is a byte index into HTML_Tag_Processor_State.html.
 */
typedef struct
{
// Heap-allocated copy of the attribute name, spelled as it appears in the markup.
char *name;
// Offset of the first value byte (just after the opening quote for quoted
// values). For a valueless attribute this is the parser position after the name.
int value_starts_at;
// Number of value bytes, quotes excluded; 0 when the attribute has no value.
int value_length;
// Offset of the first byte of the attribute name.
int start;
// Offset just past the attribute (past the closing quote for quoted values).
int end;
// True when the attribute was written without an "=value" part.
bool is_true;
} WP_HTML_Attribute_Token;
/*
 * A byte range inside the html buffer. parse_next_attribute() fills it with an
 * attribute's start offset and the offset just past that attribute.
 */
typedef struct
{
// Offset of the first byte of the range.
int start;
// Offset just past the last byte of the range.
int end;
} WP_HTML_Span;
// Operation flags for class-name updates. These are ordinary, writable globals
// and nothing in the visible code reads them.
// NOTE(review): presumably the values meant for WP_Classname_Update.op - confirm.
bool ADD_CLASS = true;
bool REMOVE_CLASS = false;
/*
 * A pending change to one class name. Not used by the code visible in this file.
 */
typedef struct
{
// The class name the update applies to.
char *class_name;
// The operation to perform.
// NOTE(review): presumably ADD_CLASS or REMOVE_CLASS, although those are bool - confirm.
int op;
} WP_Classname_Update;
// Define the structure to hold tag information
/*
 * Parser state for one HTML document. Only html, bytes_already_parsed,
 * is_closing_tag, attributes and duplicate_attributes are touched by the code
 * visible in this file; the remaining members are not referenced here.
 * NOTE(review): the unreferenced members presumably mirror the fields of the
 * PHP tag processor this file is a port of - confirm.
 */
typedef struct
{
char *last_query;
char *sought_tag_name;
char *sought_class_name;
int sought_match_offset;
bool stop_on_tag_closers;
// Byte offset into html of the next byte the parser will look at.
int bytes_already_parsed;
int token_starts_at;
int token_length;
int tag_name_starts_at;
int tag_name_length;
// When true, parse_next_attribute() consumes attributes without recording them.
bool is_closing_tag;
// @TODO: Switch to linkedlists here
// Attributes recorded for the current tag; NULL until the first one is stored.
// No element count is kept alongside this pointer.
WP_HTML_Attribute_Token *attributes;
// Spans of attributes that repeat an already recorded name; NULL until needed.
// No element count is kept alongside this pointer.
WP_HTML_Span *duplicate_attributes;
// Not referenced by the visible code.
WP_HTML_Span *bookmarks;
// Heap copy of the document, made by HTML_Tag_Processor_init().
char *html;
} HTML_Tag_Processor_State;
// Function prototypes
// Stores a private copy of html in self and resets the parse position to 0.
void HTML_Tag_Processor_init(HTML_Tag_Processor_State *self, const char *html);
// Not implemented yet: always returns false.
bool HTML_Tag_Processor_next_tag(HTML_Tag_Processor_State *self, const char *query);
// Advances self->bytes_already_parsed past spaces, tabs, form feeds, CR and LF.
void skip_whitespace(HTML_Tag_Processor_State *self);
// Returns a newly allocated, NUL-terminated copy of the first length bytes of source.
char *substr(const char *source, int length);
// Returns a newly allocated lowercase copy of str.
char *strtolower(const char *str);
// Reports whether self->attributes already holds an attribute with this name.
bool attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self);
// Reports whether self->duplicate_attributes already holds an entry for this name.
bool duplicate_attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self);
/*
 * Prepares a processor for parsing `html`.
 *
 * Every member is reset first: pointers become NULL, offsets become 0 and
 * flags become false. parse_next_attribute() reads `attributes`,
 * `duplicate_attributes` and `is_closing_tag`, so leaving them uninitialized
 * (as happens when the state comes straight from malloc) is undefined behavior.
 *
 * @param self  State to initialize. Anything it pointed to before is NOT
 *              freed, so call this only on fresh or already released state.
 *              A NULL pointer is ignored.
 * @param html  Document to parse; it is copied, so the caller keeps ownership
 *              of its own buffer. NULL is treated as an empty document.
 *
 * On return self->html is a heap copy owned by the processor, or NULL if the
 * copy could not be allocated; callers should check it before parsing.
 */
void HTML_Tag_Processor_init(HTML_Tag_Processor_State *self, const char *html)
{
    if (self == NULL)
    {
        return;
    }

    // Assigning a zero-initialized value gives real null pointers for every
    // pointer member, which memset() does not strictly guarantee.
    *self = (HTML_Tag_Processor_State){0};

    self->html = strdup(html != NULL ? html : "");
    self->bytes_already_parsed = 0;
}
/*
 * Placeholder for the tag scanner.
 *
 * The real implementation will walk self->html looking for the next tag that
 * matches `query`. Until it exists, the function reports that no tag was found.
 *
 * @return Always false.
 */
bool HTML_Tag_Processor_next_tag(HTML_Tag_Processor_State *self, const char *query)
{
    // Neither argument is consulted until the scanner is written.
    (void)self;
    (void)query;

    const bool found_tag = false;
    return found_tag;
}
// You should also define and implement other methods as needed
bool parse_next_attribute(HTML_Tag_Processor_State *self)
{
// Skip whitespace and slashes.
self->bytes_already_parsed += strspn(self->html, " \t\f\r\n/");
if (self->bytes_already_parsed >= strlen(self->html))
{
return false;
}
/*
* Treat the equal sign as a part of the attribute
* name if it is the first encountered byte.
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
*/
int name_length;
if (self->html[self->bytes_already_parsed] == '=')
{
name_length = 1 + strcspn(self->html + self->bytes_already_parsed + 1, "=/> \t\f\r\n");
}
else
{
name_length = strcspn(self->html + self->bytes_already_parsed, "=/> \t\f\r\n");
}
// No attribute, just tag closer.
if (0 == name_length || self->bytes_already_parsed + name_length >= strlen(self->html))
{
printf("bytes_already_parsed %d.\n", self->bytes_already_parsed);
printf("No attribute, just tag closer %s.\n", substr(self->html, 10));
return false;
}
int attribute_start = self->bytes_already_parsed;
char *attribute_name = substr(self->html + attribute_start, name_length);
self->bytes_already_parsed += name_length;
if (self->bytes_already_parsed >= strlen(self->html))
{
return false;
}
skip_whitespace(self);
if (self->bytes_already_parsed >= strlen(self->html))
{
return false;
}
bool has_value = '=' == self->html[self->bytes_already_parsed];
char quote;
int value_start;
int value_length;
int attribute_end;
if (has_value)
{
++self->bytes_already_parsed;
skip_whitespace(self);
if (self->bytes_already_parsed >= strlen(self->html))
{
return false;
}
if (self->html[self->bytes_already_parsed] == '"' || self->html[self->bytes_already_parsed] == '\'')
{
// quote = self->html[self->bytes_already_parsed];
value_start = self->bytes_already_parsed + 1;
value_length = strcspn(self->html + value_start, "\"");
attribute_end = value_start + value_length + 1;
self->bytes_already_parsed = attribute_end;
}
else
{
value_start = self->bytes_already_parsed;
value_length = strcspn(self->html + value_start, "> \t\f\r\n");
attribute_end = value_start + value_length;
self->bytes_already_parsed = attribute_end;
}
}
else
{
value_start = self->bytes_already_parsed;
value_length = 0;
attribute_end = attribute_start + name_length;
}
if (attribute_end >= strlen(self->html))
{
return false;
}
if (self->is_closing_tag)
{
return true;
}
/*
* > There must never be two or more attributes on
* > the same start tag whose names are an ASCII
* > case-insensitive match for each other.
* - HTML 5 spec
*
* @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
*/
char *comparable_name = strtolower(attribute_name);
// If an attribute is listed many times, only use the first declaration and ignore the rest.
if (!attribute_exists(comparable_name, self))
{
WP_HTML_Attribute_Token *token;
malloc(sizeof(WP_HTML_Attribute_Token));
token->name = strdup(attribute_name);
token->start = attribute_start;
token->end = attribute_end;
token->value_starts_at = value_start;
token->value_length = value_length;
token->is_true = !has_value;
if (NULL == self->attributes)
{
self->attributes = token;
}
else
{
// WP_HTML_Attribute_Token *new_attributes = malloc(sizeof(self->attributes) + sizeof(token));
// memcpy(new_attributes, self->attributes, sizeof(self->attributes));
// new_attributes[sizeof(self->attributes)] = token;
// self->attributes = new_attributes;
}
return true;
}
/*
* Track the duplicate attributes so if we remove it, all disappear together.
*
* While `self->duplicated_attributes` could always be stored as an `array()`,
* which would simplify the logic here, storing a `null` and only allocating
* an array when encountering duplicates avoids needless allocations in the
* normative case of parsing tags with no duplicate attributes.
*/
WP_HTML_Span *duplicate_span;
malloc(sizeof(WP_HTML_Span));
duplicate_span->start = attribute_start;
duplicate_span->end = attribute_end;
if (NULL == self->duplicate_attributes)
{
self->duplicate_attributes = duplicate_span;
}
else if (!duplicate_attribute_exists(comparable_name, self))
{
// WP_HTML_Span *new_duplicate_attributes = malloc(sizeof(self->duplicate_attributes) + sizeof(duplicate_span));
// memcpy(new_duplicate_attributes, self->duplicate_attributes, sizeof(self->duplicate_attributes));
// new_duplicate_attributes[sizeof(self->duplicate_attributes)] = duplicate_span;
// self->duplicate_attributes = new_duplicate_attributes;
}
else
{
}
return true;
}
bool attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self)
{
if (self->attributes == NULL)
{
return false;
}
for (int i = 0; i < sizeof(self->attributes); i++)
{
if (strcmp(self->attributes[i].name, comparable_name) == 0)
{
return true;
}
}
return false;
}
bool duplicate_attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self)
{
if (self->duplicate_attributes == NULL)
{
return false;
}
char *duplicate_name;
for (int i = 0; i < sizeof(self->duplicate_attributes); i++)
{
duplicate_name = substr(self->html + self->duplicate_attributes[i].start, self->duplicate_attributes[i].end - self->duplicate_attributes[i].start);
if (strcmp(duplicate_name, comparable_name) == 0)
{
return true;
}
}
return false;
}
/*
 * Copies the first `length` bytes of `source` into a new NUL-terminated string.
 *
 * The copy stops early at the end of `source`, so asking for more bytes than
 * the string holds is safe and simply returns the whole string.
 *
 * @param source  NUL-terminated string to copy from.
 * @param length  Maximum number of bytes to copy; a negative value is treated as 0.
 * @return A heap-allocated string the caller must free(), or NULL when
 *         `source` is NULL or the allocation fails.
 */
char *substr(const char *source, int length)
{
    if (source == NULL)
    {
        return NULL;
    }

    const size_t wanted = length > 0 ? (size_t)length : 0;

    // Never read past the terminating NUL of the source string.
    size_t available = 0;
    while (available < wanted && source[available] != '\0')
    {
        available++;
    }

    char *result = malloc(available + 1);
    if (result == NULL)
    {
        return NULL;
    }
    memcpy(result, source, available);
    result[available] = '\0';
    return result;
}
/*
 * Returns a lowercase copy of `str`.
 *
 * Case conversion uses tolower(), so it follows the current C locale; in the
 * default "C" locale only the ASCII letters A-Z are changed.
 *
 * @param str  NUL-terminated string to convert.
 * @return A heap-allocated string the caller must free(), or NULL when `str`
 *         is NULL or the allocation fails.
 */
char *strtolower(const char *str)
{
    if (str == NULL)
    {
        return NULL;
    }

    const size_t length = strlen(str);
    char *result = malloc(length + 1);
    if (result == NULL)
    {
        return NULL;
    }

    for (size_t i = 0; i < length; i++)
    {
        // tolower() is only defined for unsigned char values and EOF, so a
        // plain (possibly negative) char must be converted first.
        result[i] = (char)tolower((unsigned char)str[i]);
    }
    result[length] = '\0';
    return result;
}
/*
 * Moves the parse position forward over any run of spaces, tabs, form feeds,
 * carriage returns and line feeds found at the current position.
 */
void skip_whitespace(HTML_Tag_Processor_State *self)
{
    const char *unparsed = self->html + self->bytes_already_parsed;
    size_t blank_run = strspn(unparsed, " \t\f\r\n");

    self->bytes_already_parsed += blank_run;
}
int main()
{
HTML_Tag_Processor_State *processor = malloc(sizeof(HTML_Tag_Processor_State));
HTML_Tag_Processor_init(processor, "class=\"foo bar car\" id=\"baz\"");
bool parsed = parse_next_attribute(processor);
printf("parse_next_attribute found an attribute: %s \n", (parsed ? "YES" : "NO"));
printf("html: %s \n", processor->html);
if (processor->attributes != NULL && sizeof(processor->attributes) > 0)
{
printf("Attribute name: %s %d %d value=%s\n",
processor->attributes[0].name,
processor->attributes[0].value_starts_at,
processor->attributes[0].value_length,
substr(processor->html + processor->attributes[0].value_starts_at, processor->attributes[0].value_length));
}
// Clean up
// Free other dynamically allocated memory as needed
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment