Created
December 13, 2023 11:52
-
-
Save adamziel/422cb327ad72989477b33dfde75a39d1 to your computer and use it in GitHub Desktop.
WP HTML Tag Processor, but written in C
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * This is a fun exploration of porting the Tag Processor API over to C as a PHP extension.
 * For now it only contains a part of the parse_next_attribute() function.
 */
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Numeric limits for bookmarks and seek operations. Nothing else in this
// file reads them yet.
#define MAX_BOOKMARKS 10
#define MAX_SEEK_OPS 1000
// Describes one attribute found by parse_next_attribute().
// All offsets are byte offsets into the processor's `html` buffer.
typedef struct
{
    char *name;          // Heap-allocated, NUL-terminated attribute name as written in the document.
    int value_starts_at; // Offset of the first byte of the value (after the opening quote, if any).
    int value_length;    // Length of the value in bytes; 0 when the attribute has no value.
    int start;           // Offset of the first byte of the attribute name.
    int end;             // Offset one past the attribute (past the closing quote for quoted values).
    bool is_true;        // True when the attribute was written without a value.
} WP_HTML_Attribute_Token;
// A range of bytes inside the processor's `html` buffer.
typedef struct
{
    int start; // Offset of the first byte of the range.
    int end;   // Offset one past the last byte of the range.
} WP_HTML_Span;
// Operation markers for class name updates. They are constants, so they are
// declared const to keep them from being reassigned by accident.
const bool ADD_CLASS = true;
const bool REMOVE_CLASS = false;
// A pending change to the class list of a tag.
typedef struct
{
    char *class_name; // Class name the update applies to.
    int op;           // Operation to perform; presumably ADD_CLASS or REMOVE_CLASS - TODO confirm once used.
} WP_Classname_Update;
// Define the structure to hold tag information | |
typedef struct | |
{ | |
char *last_query; | |
char *sought_tag_name; | |
char *sought_class_name; | |
int sought_match_offset; | |
bool stop_on_tag_closers; | |
int bytes_already_parsed; | |
int token_starts_at; | |
int token_length; | |
int tag_name_starts_at; | |
int tag_name_length; | |
bool is_closing_tag; | |
// @TODO: Switch to linkedlists here | |
WP_HTML_Attribute_Token *attributes; | |
WP_HTML_Span *duplicate_attributes; | |
WP_HTML_Span *bookmarks; | |
char *html; | |
} HTML_Tag_Processor_State; | |
// Function prototypes
void HTML_Tag_Processor_init(HTML_Tag_Processor_State *self, const char *html);
bool HTML_Tag_Processor_next_tag(HTML_Tag_Processor_State *self, const char *query);
bool parse_next_attribute(HTML_Tag_Processor_State *self);
void skip_whitespace(HTML_Tag_Processor_State *self);
char *substr(const char *source, int length);
char *strtolower(const char *str);
bool attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self);
bool duplicate_attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self);
/**
 * Resets a processor state and gives it a private copy of the document.
 *
 * Every member is cleared first, so the pointer members start out as NULL,
 * the flags as false and the parse position at 0. The state is treated as
 * uninitialised on entry: nothing it previously pointed to is freed.
 *
 * @param self State to initialise. A NULL pointer is ignored.
 * @param html NUL-terminated document to copy. When it is NULL, or when the
 *             copy cannot be allocated, `self->html` is left as NULL.
 */
void HTML_Tag_Processor_init(HTML_Tag_Processor_State *self, const char *html)
{
    if (NULL == self)
    {
        return;
    }

    // A compound literal yields real null pointers and false/zero for every
    // member; the parser relies on NULL == self->attributes and friends.
    *self = (HTML_Tag_Processor_State){0};

    if (NULL != html)
    {
        self->html = strdup(html); // Use strdup to duplicate the string
    }
}
/**
 * Placeholder for advancing to the next tag matching a query.
 *
 * The tag search is not implemented yet, so neither argument is inspected.
 *
 * @param self  Processor state (currently unused).
 * @param query Query describing the sought tag (currently unused).
 * @return Always false.
 */
bool HTML_Tag_Processor_next_tag(HTML_Tag_Processor_State *self, const char *query)
{
    // Mark the parameters as intentionally unused until the search exists.
    (void)self;
    (void)query;

    return false;
}
// You should also define and implement other methods as needed | |
/**
 * Parses one attribute starting at `self->bytes_already_parsed`.
 *
 * The cursor is advanced past everything that was consumed. On a start tag
 * the first attribute found is recorded in `self->attributes`; a repeated
 * attribute name is recorded as a span in `self->duplicate_attributes`.
 * Neither array can grow yet (the state keeps no element counts), so only
 * the first entry of each is ever stored.
 *
 * @param self Processor state holding the document and the cursor.
 * @return true when an attribute was parsed; false when there is no
 *         complete attribute before the end of the document, when the state
 *         is unusable (NULL state or document, negative cursor, document too
 *         long for `int` offsets), or when memory could not be allocated.
 */
bool parse_next_attribute(HTML_Tag_Processor_State *self)
{
    if (NULL == self || NULL == self->html || self->bytes_already_parsed < 0)
    {
        return false;
    }

    const char *html = self->html;
    const size_t html_length = strlen(html);

    // All offsets are stored as int, so longer documents cannot be described.
    if (html_length >= (size_t)INT_MAX)
    {
        return false;
    }

    size_t at = (size_t)self->bytes_already_parsed;
    if (at >= html_length)
    {
        return false;
    }

    // Skip whitespace and slashes, starting at the cursor rather than at the
    // beginning of the document.
    at += strspn(html + at, " \t\f\r\n/");
    self->bytes_already_parsed = (int)at;
    if (at >= html_length)
    {
        return false;
    }

    /*
     * Treat the equal sign as a part of the attribute
     * name if it is the first encountered byte.
     *
     * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
     */
    size_t name_length;
    if ('=' == html[at])
    {
        name_length = 1 + strcspn(html + at + 1, "=/> \t\f\r\n");
    }
    else
    {
        name_length = strcspn(html + at, "=/> \t\f\r\n");
    }

    // No attribute, just tag closer (or a name that runs to the end of input).
    if (0 == name_length || at + name_length >= html_length)
    {
        return false;
    }

    const size_t attribute_start = at;
    at += name_length;

    // Skip whitespace between the name and a possible equal sign.
    at += strspn(html + at, " \t\f\r\n");
    self->bytes_already_parsed = (int)at;
    if (at >= html_length)
    {
        return false;
    }

    const bool has_value = ('=' == html[at]);
    size_t value_start = at;
    size_t value_length = 0;
    size_t attribute_end = attribute_start + name_length;

    if (has_value)
    {
        // Step over the equal sign and any whitespace that follows it.
        ++at;
        at += strspn(html + at, " \t\f\r\n");
        self->bytes_already_parsed = (int)at;
        if (at >= html_length)
        {
            return false;
        }

        const char first = html[at];
        if ('"' == first || '\'' == first)
        {
            // The value ends at the same kind of quote that opened it.
            const char closing_quote[2] = {first, '\0'};
            value_start = at + 1;
            value_length = strcspn(html + value_start, closing_quote);
            attribute_end = value_start + value_length + 1;
        }
        else
        {
            value_start = at;
            value_length = strcspn(html + value_start, "> \t\f\r\n");
            attribute_end = value_start + value_length;
        }

        // An unterminated quoted value would put the end one byte past the
        // terminator; keep the cursor inside the buffer.
        at = attribute_end < html_length ? attribute_end : html_length;
        self->bytes_already_parsed = (int)at;
    }

    if (attribute_end >= html_length)
    {
        return false;
    }

    // Attributes on closing tags are consumed but not recorded.
    if (self->is_closing_tag)
    {
        return true;
    }

    /*
     * > There must never be two or more attributes on
     * > the same start tag whose names are an ASCII
     * > case-insensitive match for each other.
     *     - HTML 5 spec
     *
     * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
     */
    char *attribute_name = substr(html + attribute_start, (int)name_length);
    if (NULL == attribute_name)
    {
        return false;
    }
    char *comparable_name = strtolower(attribute_name);
    if (NULL == comparable_name)
    {
        free(attribute_name);
        return false;
    }

    bool recorded = true;

    // If an attribute is listed many times, only use the first declaration and ignore the rest.
    if (!attribute_exists(comparable_name, self))
    {
        if (NULL == self->attributes)
        {
            WP_HTML_Attribute_Token *token = malloc(sizeof *token);
            if (NULL == token)
            {
                recorded = false;
            }
            else
            {
                // The token takes ownership of the name buffer.
                token->name = attribute_name;
                attribute_name = NULL;
                token->start = (int)attribute_start;
                token->end = (int)attribute_end;
                token->value_starts_at = (int)value_start;
                token->value_length = (int)value_length;
                token->is_true = !has_value;
                self->attributes = token;
            }
        }
        // else: @TODO append to self->attributes. The state keeps no element
        // count, so the array cannot be grown yet and the attribute is only
        // consumed, not stored.
    }
    else if (NULL == self->duplicate_attributes)
    {
        /*
         * Track the duplicate attribute so that if the attribute is removed,
         * all of its occurrences can disappear together. The span is only
         * allocated when a duplicate is actually met, which avoids needless
         * allocations for tags without duplicate attributes.
         */
        WP_HTML_Span *duplicate_span = malloc(sizeof *duplicate_span);
        if (NULL == duplicate_span)
        {
            recorded = false;
        }
        else
        {
            duplicate_span->start = (int)attribute_start;
            duplicate_span->end = (int)attribute_end;
            self->duplicate_attributes = duplicate_span;
        }
    }
    // else: @TODO append to self->duplicate_attributes; as above, the array
    // cannot be grown without an element count.

    free(comparable_name);
    free(attribute_name);
    return recorded;
}
/**
 * Tells whether an attribute with the given name has already been recorded.
 *
 * Names are compared ASCII case-insensitively, because recorded tokens keep
 * the name as written in the document while callers pass a lower-cased name.
 *
 * Only the first element of `self->attributes` is examined: the state stores
 * no element count and the parser in this file records a single token. The
 * previous loop bound, sizeof(self->attributes), is the size of a pointer,
 * not a number of elements, and read past the end of the allocation.
 *
 * @param comparable_name NUL-terminated name to look for.
 * @param self            Processor state whose attributes are searched.
 * @return true when the recorded attribute has that name, false otherwise
 *         (including when any argument or the attribute list is NULL).
 */
bool attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self)
{
    if (NULL == self || NULL == comparable_name || NULL == self->attributes)
    {
        return false;
    }

    const char *recorded_name = self->attributes[0].name;
    if (NULL == recorded_name)
    {
        return false;
    }

    for (size_t i = 0;; i++)
    {
        // <ctype.h> functions require values representable as unsigned char.
        const unsigned char recorded = (unsigned char)recorded_name[i];
        const unsigned char sought = (unsigned char)comparable_name[i];

        if (tolower(recorded) != tolower(sought))
        {
            return false;
        }
        if ('\0' == recorded)
        {
            // Both strings ended at the same position without a mismatch.
            return true;
        }
    }
}
/**
 * Tells whether a duplicate attribute with the given name has been recorded.
 *
 * A recorded span covers the whole attribute (name and value), so only the
 * leading name part of the span is compared, ASCII case-insensitively, and
 * the name must end exactly there. No copy of the span is allocated.
 *
 * Only the first element of `self->duplicate_attributes` is examined: the
 * state stores no element count and the parser in this file records a single
 * span. The previous loop bound, sizeof(self->duplicate_attributes), is the
 * size of a pointer, not a number of elements.
 *
 * @param comparable_name NUL-terminated name to look for.
 * @param self            Processor state whose duplicate spans are searched.
 * @return true when the recorded duplicate span starts with that attribute
 *         name, false otherwise (including on NULL arguments or an invalid span).
 */
bool duplicate_attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self)
{
    if (NULL == self || NULL == comparable_name || NULL == self->html || NULL == self->duplicate_attributes)
    {
        return false;
    }

    const WP_HTML_Span *span = &self->duplicate_attributes[0];
    if (span->start < 0 || span->end < span->start)
    {
        return false;
    }

    const size_t span_length = (size_t)(span->end - span->start);
    const size_t name_length = strlen(comparable_name);
    if (0 == name_length || name_length > span_length)
    {
        return false;
    }

    const char *candidate = self->html + span->start;
    for (size_t i = 0; i < name_length; i++)
    {
        if (tolower((unsigned char)candidate[i]) != tolower((unsigned char)comparable_name[i]))
        {
            return false;
        }
    }

    // The name must stop here: either the span ends, or a byte that cannot be
    // part of an attribute name follows (otherwise "id" would match "idx").
    if (name_length == span_length)
    {
        return true;
    }
    return NULL != strchr("=/> \t\f\r\n", candidate[name_length]);
}
/**
 * Returns a newly allocated copy of the first `length` bytes of `source`.
 *
 * Copying stops early at the end of `source`, so the function never reads
 * past the terminator. A NULL source or a non-positive length gives an empty
 * string.
 *
 * @param source NUL-terminated string to copy from (may be NULL).
 * @param length Maximum number of bytes to copy.
 * @return A NUL-terminated string the caller must free(), or NULL when the
 *         allocation fails.
 */
char *substr(const char *source, int length)
{
    const size_t wanted = (NULL != source && length > 0) ? (size_t)length : 0;

    // Count how many of the wanted bytes really exist in the source.
    size_t available = 0;
    while (available < wanted && '\0' != source[available])
    {
        available++;
    }

    char *result = malloc(available + 1);
    if (NULL == result)
    {
        return NULL;
    }

    if (available > 0)
    {
        memcpy(result, source, available);
    }
    result[available] = '\0';
    return result;
}
/**
 * Returns a newly allocated lower-cased copy of a string.
 *
 * Each byte is converted with tolower(), so the result depends on the
 * current C locale.
 *
 * @param str NUL-terminated string to convert (may be NULL).
 * @return A NUL-terminated string the caller must free(), or NULL when `str`
 *         is NULL or the allocation fails.
 */
char *strtolower(const char *str)
{
    if (NULL == str)
    {
        return NULL;
    }

    const size_t length = strlen(str);
    char *result = malloc(length + 1);
    if (NULL == result)
    {
        return NULL;
    }

    for (size_t i = 0; i < length; i++)
    {
        // tolower() is only defined for EOF and unsigned char values; a plain
        // char may be negative for bytes above 0x7F.
        result[i] = (char)tolower((unsigned char)str[i]);
    }
    result[length] = '\0';
    return result;
}
/**
 * Advances the parse position past any run of whitespace bytes.
 *
 * Whitespace here means space, tab, form feed, carriage return and newline.
 *
 * @param self Processor state; `bytes_already_parsed` must be an offset
 *             inside `html`. A NULL state or document is ignored.
 */
void skip_whitespace(HTML_Tag_Processor_State *self)
{
    if (NULL == self || NULL == self->html)
    {
        return;
    }

    const char *cursor = self->html + self->bytes_already_parsed;
    self->bytes_already_parsed += (int)strspn(cursor, " \t\f\r\n");
}
int main() | |
{ | |
HTML_Tag_Processor_State *processor = malloc(sizeof(HTML_Tag_Processor_State)); | |
HTML_Tag_Processor_init(processor, "class=\"foo bar car\" id=\"baz\""); | |
bool parsed = parse_next_attribute(processor); | |
printf("parse_next_attribute found an attribute: %s \n", (parsed ? "YES" : "NO")); | |
printf("html: %s \n", processor->html); | |
if (processor->attributes != NULL && sizeof(processor->attributes) > 0) | |
{ | |
printf("Attribute name: %s %d %d value=%s\n", | |
processor->attributes[0].name, | |
processor->attributes[0].value_starts_at, | |
processor->attributes[0].value_length, | |
substr(processor->html + processor->attributes[0].value_starts_at, processor->attributes[0].value_length)); | |
} | |
// Clean up | |
// Free other dynamically allocated memory as needed | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment