adamziel/main.c

## main.c
/*
 * This is a fun exploration I've done of porting the Tag Processor API over to C as a PHP extension.
 * For now this is just a part of the parse_next_attribute() function.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>

// Bookmark limit. Not referenced by any code visible in this file.
// NOTE(review): presumably caps the entries kept in HTML_Tag_Processor_State.bookmarks -- confirm.
#define MAX_BOOKMARKS 10
// Seek-operation limit. Not referenced by any code visible in this file.
// NOTE(review): presumably bounds how many seek operations a processor may perform -- confirm.
#define MAX_SEEK_OPS 1000

/*
 * One attribute located by parse_next_attribute().
 *
 * Every position is a byte offset into the processor's `html` buffer.
 */
typedef struct {
    char *name;          /* heap copy of the attribute name as written in the markup */
    int value_starts_at; /* offset of the first byte of the value */
    int value_length;    /* byte length of the value; 0 when the attribute has none */
    int start;           /* offset of the first byte of the attribute name */
    int end;             /* offset just past the last byte of the attribute */
    bool is_true;        /* true when the attribute was written without a value */
} WP_HTML_Attribute_Token;

/*
 * A byte range inside the HTML buffer.
 *
 * `end - start` is the number of bytes covered, so `end` is exclusive.
 */
typedef struct {
    int start; /* offset of the first byte in the range */
    int end;   /* offset just past the last byte in the range */
} WP_HTML_Span;

/*
 * Flags naming the two kinds of class-name change.
 * NOTE(review): presumably the values meant for WP_Classname_Update.op,
 * although `op` is declared as int -- confirm.
 */
bool ADD_CLASS = true;
bool REMOVE_CLASS = false;

/* A pending change to one class name. Not used by the code visible in this file. */
typedef struct {
    char *class_name; /* the class name the change applies to */
    int op;           /* which change to perform */
} WP_Classname_Update;

/*
 * Parser state for one HTML document.
 *
 * Offsets are byte positions within `html`. Fields marked "unused here" are
 * neither read nor written by the code visible in this file.
 */
typedef struct {
    /* Query-related state; unused here. NOTE(review): confirm intended semantics. */
    char *last_query;
    char *sought_tag_name;
    char *sought_class_name;
    int sought_match_offset;
    bool stop_on_tag_closers;

    /* Offset of the next byte the parser will examine. */
    int bytes_already_parsed;

    /* Token and tag-name positions; unused here. */
    int token_starts_at;
    int token_length;
    int tag_name_starts_at;
    int tag_name_length;

    /* When set, parse_next_attribute() returns without recording the attribute. */
    bool is_closing_tag;

    // @TODO: Switch to linkedlists here
    WP_HTML_Attribute_Token *attributes; /* attribute tokens stored by parse_next_attribute() */
    WP_HTML_Span *duplicate_attributes;  /* spans of attributes whose name was already seen */
    WP_HTML_Span *bookmarks;             /* unused here */
    char *html;                          /* heap copy of the input markup */
} HTML_Tag_Processor_State;

/* Forward declarations for the functions defined further down in this file. */

/* Processor setup and tag scanning. */
void HTML_Tag_Processor_init(HTML_Tag_Processor_State *self, const char *html);
bool HTML_Tag_Processor_next_tag(HTML_Tag_Processor_State *self, const char *query);

/* Moves self->bytes_already_parsed past a run of " \t\f\r\n" bytes. */
void skip_whitespace(HTML_Tag_Processor_State *self);

/* Lookups over the attributes and duplicate spans stored on the processor. */
bool attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self);
bool duplicate_attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self);

/* String helpers; each returns a newly malloc()ed, NUL-terminated string. */
char *substr(const char *source, int length);
char *strtolower(const char *str);

/*
 * Prepares `self` for parsing `html`.
 *
 * Every field is reset first (pointers to NULL, offsets to 0, flags to false)
 * so that later NULL checks on `attributes` and `duplicate_attributes` are
 * meaningful even when `self` came from malloc(). The markup is then copied
 * into a heap buffer owned by `self`; release it with free(self->html).
 *
 * @param self  State to initialize; ignored when NULL.
 * @param html  NUL-terminated markup; NULL is treated as an empty document.
 *
 * If the copy cannot be allocated, self->html is left NULL.
 */
void HTML_Tag_Processor_init(HTML_Tag_Processor_State *self, const char *html)
{
    if (NULL == self)
    {
        return;
    }

    /* A zero initializer yields proper null pointers for every pointer field. */
    *self = (HTML_Tag_Processor_State){0};

    const char *source = (NULL != html) ? html : "";
    const size_t size = strlen(source) + 1;

    /* malloc + memcpy instead of strdup(), which is not part of C11. */
    char *copy = malloc(size);
    if (NULL != copy)
    {
        memcpy(copy, source, size);
    }

    self->html = copy;
    self->bytes_already_parsed = 0;
}

/*
 * Placeholder for the tag scanner.
 *
 * Tag matching over self->html is not implemented yet, so neither argument
 * is consulted and the function always reports that no tag was found.
 *
 * @return Always false.
 */
bool HTML_Tag_Processor_next_tag(HTML_Tag_Processor_State *self, const char *query)
{
    (void)self;
    (void)query;

    return false;
}

// You should also define and implement other methods as needed

bool parse_next_attribute(HTML_Tag_Processor_State *self)
{
    // Skip whitespace and slashes.
    self->bytes_already_parsed += strspn(self->html, " \t\f\r\n/");
    if (self->bytes_already_parsed >= strlen(self->html))
    {
        return false;
    }
    /*
     * Treat the equal sign as a part of the attribute
     * name if it is the first encountered byte.
     *
     * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
     */
    int name_length;
    if (self->html[self->bytes_already_parsed] == '=')
    {
        name_length = 1 + strcspn(self->html + self->bytes_already_parsed + 1, "=/> \t\f\r\n");
    }
    else
    {
        name_length = strcspn(self->html + self->bytes_already_parsed, "=/> \t\f\r\n");
    }

    // No attribute, just tag closer.
    if (0 == name_length || self->bytes_already_parsed + name_length >= strlen(self->html))
    {
        printf("bytes_already_parsed %d.\n", self->bytes_already_parsed);
        printf("No attribute, just tag closer %s.\n", substr(self->html, 10));
        return false;
    }

    int attribute_start = self->bytes_already_parsed;
    char *attribute_name = substr(self->html + attribute_start, name_length);
    self->bytes_already_parsed += name_length;
    if (self->bytes_already_parsed >= strlen(self->html))
    {
        return false;
    }

    skip_whitespace(self);
    if (self->bytes_already_parsed >= strlen(self->html))
    {
        return false;
    }

    bool has_value = '=' == self->html[self->bytes_already_parsed];
    char quote;
    int value_start;
    int value_length;
    int attribute_end;

    if (has_value)
    {
        ++self->bytes_already_parsed;
        skip_whitespace(self);
        if (self->bytes_already_parsed >= strlen(self->html))
        {
            return false;
        }

        if (self->html[self->bytes_already_parsed] == '"' || self->html[self->bytes_already_parsed] == '\'')
        {
            // quote = self->html[self->bytes_already_parsed];
            value_start = self->bytes_already_parsed + 1;
            value_length = strcspn(self->html + value_start, "\"");
            attribute_end = value_start + value_length + 1;
            self->bytes_already_parsed = attribute_end;
        }
        else
        {
            value_start = self->bytes_already_parsed;
            value_length = strcspn(self->html + value_start, "> \t\f\r\n");
            attribute_end = value_start + value_length;
            self->bytes_already_parsed = attribute_end;
        }
    }
    else
    {
        value_start = self->bytes_already_parsed;
        value_length = 0;
        attribute_end = attribute_start + name_length;
    }

    if (attribute_end >= strlen(self->html))
    {
        return false;
    }

    if (self->is_closing_tag)
    {
        return true;
    }

    /*
     * > There must never be two or more attributes on
     * > the same start tag whose names are an ASCII
     * > case-insensitive match for each other.
     *     - HTML 5 spec
     *
     * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
     */
    char *comparable_name = strtolower(attribute_name);

    // If an attribute is listed many times, only use the first declaration and ignore the rest.
    if (!attribute_exists(comparable_name, self))
    {
        WP_HTML_Attribute_Token *token;
        malloc(sizeof(WP_HTML_Attribute_Token));
        token->name = strdup(attribute_name);
        token->start = attribute_start;
        token->end = attribute_end;
        token->value_starts_at = value_start;
        token->value_length = value_length;
        token->is_true = !has_value;

        if (NULL == self->attributes)
        {
            self->attributes = token;
        }
        else
        {
            // WP_HTML_Attribute_Token *new_attributes = malloc(sizeof(self->attributes) + sizeof(token));
            // memcpy(new_attributes, self->attributes, sizeof(self->attributes));
            // new_attributes[sizeof(self->attributes)] = token;
            // self->attributes = new_attributes;
        }

        return true;
    }

    /*
     * Track the duplicate attributes so if we remove it, all disappear together.
     *
     * While `self->duplicated_attributes` could always be stored as an `array()`,
     * which would simplify the logic here, storing a `null` and only allocating
     * an array when encountering duplicates avoids needless allocations in the
     * normative case of parsing tags with no duplicate attributes.
     */
    WP_HTML_Span *duplicate_span;
    malloc(sizeof(WP_HTML_Span));
    duplicate_span->start = attribute_start;
    duplicate_span->end = attribute_end;
    if (NULL == self->duplicate_attributes)
    {
        self->duplicate_attributes = duplicate_span;
    }
    else if (!duplicate_attribute_exists(comparable_name, self))
    {
        // WP_HTML_Span *new_duplicate_attributes = malloc(sizeof(self->duplicate_attributes) + sizeof(duplicate_span));
        // memcpy(new_duplicate_attributes, self->duplicate_attributes, sizeof(self->duplicate_attributes));
        // new_duplicate_attributes[sizeof(self->duplicate_attributes)] = duplicate_span;
        // self->duplicate_attributes = new_duplicate_attributes;
    }
    else
    {
    }

    return true;
}

bool attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self)
{
    if (self->attributes == NULL)
    {
        return false;
    }

    for (int i = 0; i < sizeof(self->attributes); i++)
    {
        if (strcmp(self->attributes[i].name, comparable_name) == 0)
        {
            return true;
        }
    }

    return false;
}

bool duplicate_attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self)
{
    if (self->duplicate_attributes == NULL)
    {
        return false;
    }

    char *duplicate_name;
    for (int i = 0; i < sizeof(self->duplicate_attributes); i++)
    {
        duplicate_name = substr(self->html + self->duplicate_attributes[i].start, self->duplicate_attributes[i].end - self->duplicate_attributes[i].start);
        if (strcmp(duplicate_name, comparable_name) == 0)
        {
            return true;
        }
    }

    return false;
}

/*
 * Returns a newly allocated copy of the first `length` bytes of `source`.
 *
 * Copying stops early at the source's NUL terminator, so asking for more
 * bytes than the string holds never reads past its end. A NULL source or a
 * non-positive length yields an empty string.
 *
 * @param source  NUL-terminated string to copy from (may be NULL).
 * @param length  Maximum number of bytes to copy.
 * @return Heap string the caller must free(), or NULL if allocation fails.
 */
char *substr(const char *source, int length)
{
    const size_t wanted = (NULL != source && length > 0) ? (size_t)length : 0;

    char *result = malloc(wanted + 1);
    if (NULL == result)
    {
        return NULL;
    }

    size_t copied = 0;
    while (copied < wanted && '\0' != source[copied])
    {
        result[copied] = source[copied];
        copied++;
    }

    result[copied] = '\0';
    return result;
}

/*
 * Returns a newly allocated lowercase copy of `str`.
 *
 * Each byte is converted to unsigned char before tolower() is called:
 * passing a negative char value to the <ctype.h> functions is undefined
 * behavior, and bytes >= 0x80 are negative where char is signed.
 *
 * @param str  NUL-terminated string to convert (may be NULL).
 * @return Heap string the caller must free(), or NULL when `str` is NULL or
 *         allocation fails.
 */
char *strtolower(const char *str)
{
    if (NULL == str)
    {
        return NULL;
    }

    const size_t length = strlen(str);
    char *result = malloc(length + 1);
    if (NULL == result)
    {
        return NULL;
    }

    for (size_t i = 0; i < length; i++)
    {
        result[i] = (char)tolower((unsigned char)str[i]);
    }

    result[length] = '\0';
    return result;
}

/*
 * Advances self->bytes_already_parsed past any run of space, tab, form feed,
 * carriage return and line feed bytes found at the current position.
 */
void skip_whitespace(HTML_Tag_Processor_State *self)
{
    const char *cursor = self->html + self->bytes_already_parsed;
    const size_t run = strspn(cursor, " \t\f\r\n");

    self->bytes_already_parsed += run;
}

int main()
{
    HTML_Tag_Processor_State *processor = malloc(sizeof(HTML_Tag_Processor_State));
    HTML_Tag_Processor_init(processor, "class=\"foo bar car\" id=\"baz\"");
    bool parsed = parse_next_attribute(processor);

    printf("parse_next_attribute found an attribute: %s \n", (parsed ? "YES" : "NO"));
        printf("html: %s \n", processor->html);
    if (processor->attributes != NULL && sizeof(processor->attributes) > 0)
    {
        printf("Attribute name: %s %d %d value=%s\n",
               processor->attributes[0].name,
               processor->attributes[0].value_starts_at,
               processor->attributes[0].value_length,
               substr(processor->html + processor->attributes[0].value_starts_at, processor->attributes[0].value_length));
    }

    // Clean up
    // Free other dynamically allocated memory as needed

    return 0;
}
	/*
	* This is a fun exploration I've done of porting the Tag Processor API over to C as a PHP extension.
	* For now this is just a part of the parse_next_attribute() function.
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <stdbool.h>
	#include <ctype.h>

	// Bookmark limit. Not referenced by any code visible in this file.
	// NOTE(review): presumably caps the entries kept in HTML_Tag_Processor_State.bookmarks -- confirm.
	#define MAX_BOOKMARKS 10
	// Seek-operation limit. Not referenced by any code visible in this file.
	// NOTE(review): presumably bounds how many seek operations a processor may perform -- confirm.
	#define MAX_SEEK_OPS 1000

	/*
	 * One attribute located by parse_next_attribute().
	 *
	 * Every position is a byte offset into the processor's `html` buffer.
	 */
	typedef struct {
	    char *name;          /* heap copy of the attribute name as written in the markup */
	    int value_starts_at; /* offset of the first byte of the value */
	    int value_length;    /* byte length of the value; 0 when the attribute has none */
	    int start;           /* offset of the first byte of the attribute name */
	    int end;             /* offset just past the last byte of the attribute */
	    bool is_true;        /* true when the attribute was written without a value */
	} WP_HTML_Attribute_Token;

	/*
	 * A byte range inside the HTML buffer.
	 *
	 * `end - start` is the number of bytes covered, so `end` is exclusive.
	 */
	typedef struct {
	    int start; /* offset of the first byte in the range */
	    int end;   /* offset just past the last byte in the range */
	} WP_HTML_Span;

	/*
	 * Flags naming the two kinds of class-name change.
	 * NOTE(review): presumably the values meant for WP_Classname_Update.op,
	 * although `op` is declared as int -- confirm.
	 */
	bool ADD_CLASS = true;
	bool REMOVE_CLASS = false;

	/* A pending change to one class name. Not used by the code visible in this file. */
	typedef struct {
	    char *class_name; /* the class name the change applies to */
	    int op;           /* which change to perform */
	} WP_Classname_Update;

	/*
	 * Parser state for one HTML document.
	 *
	 * Offsets are byte positions within `html`. Fields marked "unused here" are
	 * neither read nor written by the code visible in this file.
	 */
	typedef struct {
	    /* Query-related state; unused here. NOTE(review): confirm intended semantics. */
	    char *last_query;
	    char *sought_tag_name;
	    char *sought_class_name;
	    int sought_match_offset;
	    bool stop_on_tag_closers;

	    /* Offset of the next byte the parser will examine. */
	    int bytes_already_parsed;

	    /* Token and tag-name positions; unused here. */
	    int token_starts_at;
	    int token_length;
	    int tag_name_starts_at;
	    int tag_name_length;

	    /* When set, parse_next_attribute() returns without recording the attribute. */
	    bool is_closing_tag;

	    // @TODO: Switch to linkedlists here
	    WP_HTML_Attribute_Token *attributes; /* attribute tokens stored by parse_next_attribute() */
	    WP_HTML_Span *duplicate_attributes;  /* spans of attributes whose name was already seen */
	    WP_HTML_Span *bookmarks;             /* unused here */
	    char *html;                          /* heap copy of the input markup */
	} HTML_Tag_Processor_State;

	// Function prototypes
	// The pointer declarators are required: the definitions below dereference
	// `self` with `->` and index `source` / `str` as arrays.

	/* Processor setup and tag scanning. */
	void HTML_Tag_Processor_init(HTML_Tag_Processor_State *self, const char *html);
	bool HTML_Tag_Processor_next_tag(HTML_Tag_Processor_State *self, const char *query);

	/* Moves self->bytes_already_parsed past a run of " \t\f\r\n" bytes. */
	void skip_whitespace(HTML_Tag_Processor_State *self);

	/* String helpers; each returns a newly malloc()ed, NUL-terminated string. */
	char *substr(const char *source, int length);
	char *strtolower(const char *str);

	/* Lookups over the attributes and duplicate spans stored on the processor. */
	bool attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self);
	bool duplicate_attribute_exists(char *comparable_name, HTML_Tag_Processor_State *self);

	/*
	 * Prepares `self` for parsing `html`.
	 *
	 * Every field is reset first (pointers to NULL, offsets to 0, flags to false)
	 * so that later NULL checks on `attributes` and `duplicate_attributes` are
	 * meaningful even when `self` came from malloc(). The markup is then copied
	 * into a heap buffer owned by `self`; release it with free(self->html).
	 *
	 * @param self  State to initialize; ignored when NULL.
	 * @param html  NUL-terminated markup; NULL is treated as an empty document.
	 *
	 * If the copy cannot be allocated, self->html is left NULL.
	 */
	void HTML_Tag_Processor_init(HTML_Tag_Processor_State *self, const char *html)
	{
	    if (NULL == self)
	    {
	        return;
	    }

	    /* A zero initializer yields proper null pointers for every pointer field. */
	    *self = (HTML_Tag_Processor_State){0};

	    const char *source = (NULL != html) ? html : "";
	    const size_t size = strlen(source) + 1;

	    /* malloc + memcpy instead of strdup(), which is not part of C11. */
	    char *copy = malloc(size);
	    if (NULL != copy)
	    {
	        memcpy(copy, source, size);
	    }

	    self->html = copy;
	    self->bytes_already_parsed = 0;
	}

	/*
	 * Placeholder for the tag scanner.
	 *
	 * Tag matching over self->html is not implemented yet, so neither argument
	 * is consulted and the function always reports that no tag was found.
	 * Both parameters are pointers: the state is shared with the caller and
	 * the query is a NUL-terminated string.
	 *
	 * @return Always false.
	 */
	bool HTML_Tag_Processor_next_tag(HTML_Tag_Processor_State *self, const char *query)
	{
	    (void)self;
	    (void)query;

	    return false;
	}

	// You should also define and implement other methods as needed

	bool parse_next_attribute(HTML_Tag_Processor_State *self)
	{
	// Skip whitespace and slashes.
	self->bytes_already_parsed += strspn(self->html, " \t\f\r\n/");
	if (self->bytes_already_parsed >= strlen(self->html))
	{
	return false;
	}
	/*
	* Treat the equal sign as a part of the attribute
	* name if it is the first encountered byte.
	*
	* @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
	*/
	int name_length;
	if (self->html[self->bytes_already_parsed] == '=')
	{
	name_length = 1 + strcspn(self->html + self->bytes_already_parsed + 1, "=/> \t\f\r\n");
	}
	else
	{
	name_length = strcspn(self->html + self->bytes_already_parsed, "=/> \t\f\r\n");
	}

	// No attribute, just tag closer.
	if (0 == name_length \|\| self->bytes_already_parsed + name_length >= strlen(self->html))
	{
	printf("bytes_already_parsed %d.\n", self->bytes_already_parsed);
	printf("No attribute, just tag closer %s.\n", substr(self->html, 10));
	return false;
	}

	int attribute_start = self->bytes_already_parsed;
	char *attribute_name = substr(self->html + attribute_start, name_length);
	self->bytes_already_parsed += name_length;
	if (self->bytes_already_parsed >= strlen(self->html))
	{
	return false;
	}

	skip_whitespace(self);
	if (self->bytes_already_parsed >= strlen(self->html))
	{
	return false;
	}

	bool has_value = '=' == self->html[self->bytes_already_parsed];
	char quote;
	int value_start;
	int value_length;
	int attribute_end;

	if (has_value)
	{
	++self->bytes_already_parsed;
	skip_whitespace(self);
	if (self->bytes_already_parsed >= strlen(self->html))
	{
	return false;
	}

	if (self->html[self->bytes_already_parsed] == '"' \|\| self->html[self->bytes_already_parsed] == '\'')
	{
	// quote = self->html[self->bytes_already_parsed];
	value_start = self->bytes_already_parsed + 1;
	value_length = strcspn(self->html + value_start, "\"");
	attribute_end = value_start + value_length + 1;
	self->bytes_already_parsed = attribute_end;
	}
	else
	{
	value_start = self->bytes_already_parsed;
	value_length = strcspn(self->html + value_start, "> \t\f\r\n");
	attribute_end = value_start + value_length;
	self->bytes_already_parsed = attribute_end;
	}
	}
	else
	{
	value_start = self->bytes_already_parsed;
	value_length = 0;
	attribute_end = attribute_start + name_length;
	}

	if (attribute_end >= strlen(self->html))
	{
	return false;
	}

	if (self->is_closing_tag)
	{
	return true;
	}

	/*
	* > There must never be two or more attributes on
	* > the same start tag whose names are an ASCII
	* > case-insensitive match for each other.
	* - HTML 5 spec
	*
	* @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
	*/
	char *comparable_name = strtolower(attribute_name);

	// If an attribute is listed many times, only use the first declaration and ignore the rest.
	if (!attribute_exists(comparable_name, self))
	{
	WP_HTML_Attribute_Token *token;
	malloc(sizeof(WP_HTML_Attribute_Token));
	token->name = strdup(attribute_name);
	token->start = attribute_start;
	token->end = attribute_end;
	token->value_starts_at = value_start;
	token->value_length = value_length;
	token->is_true = !has_value;

	if (NULL == self->attributes)
	{
	self->attributes = token;
	}
	else
	{
	// WP_HTML_Attribute_Token *new_attributes = malloc(sizeof(self->attributes) + sizeof(token));
	// memcpy(new_attributes, self->attributes, sizeof(self->attributes));
	// new_attributes[sizeof(self->attributes)] = token;
	// self->attributes = new_attributes;
	}

	return true;
	}

	/*
	* Track the duplicate attributes so if we remove it, all disappear together.
	*
	* While `self->duplicated_attributes` could always be stored as an `array()`,
	* which would simplify the logic here, storing a `null` and only allocating
	* an array when encountering duplicates avoids needless allocations in the
	* normative case of parsing tags with no duplicate attributes.
	*/
	WP_HTML_Span *duplicate_span;
	malloc(sizeof(WP_HTML_Span));
	duplicate_span->start = attribute_start;
	duplicate_span->end = attribute_end;
	if (NULL == self->duplicate_attributes)
	{
	self->duplicate_attributes = duplicate_span;
	}
	else if (!duplicate_attribute_exists(comparable_name, self))
	{
	// WP_HTML_Span *new_duplicate_attributes = malloc(sizeof(self->duplicate_attributes) + sizeof(duplicate_span));
	// memcpy(new_duplicate_attributes, self->duplicate_attributes, sizeof(self->duplicate_attributes));
	// new_duplicate_attributes[sizeof(self->duplicate_attributes)] = duplicate_span;
	// self->duplicate_attributes = new_duplicate_attributes;
	}
	else
	{
	}

	return true;
	}

	bool attribute_exists(char comparable_name, HTML_Tag_Processor_State self)
	{
	if (self->attributes == NULL)
	{
	return false;
	}

	for (int i = 0; i < sizeof(self->attributes); i++)
	{
	if (strcmp(self->attributes[i].name, comparable_name) == 0)
	{
	return true;
	}
	}

	return false;
	}

	bool duplicate_attribute_exists(char comparable_name, HTML_Tag_Processor_State self)
	{
	if (self->duplicate_attributes == NULL)
	{
	return false;
	}

	char *duplicate_name;
	for (int i = 0; i < sizeof(self->duplicate_attributes); i++)
	{
	duplicate_name = substr(self->html + self->duplicate_attributes[i].start, self->duplicate_attributes[i].end - self->duplicate_attributes[i].start);
	if (strcmp(duplicate_name, comparable_name) == 0)
	{
	return true;
	}
	}

	return false;
	}

	/*
	 * Returns a newly allocated copy of the first `length` bytes of `source`.
	 *
	 * Copying stops early at the source's NUL terminator, so asking for more
	 * bytes than the string holds never reads past its end. A NULL source or a
	 * non-positive length yields an empty string.
	 *
	 * @param source  NUL-terminated string to copy from (may be NULL).
	 * @param length  Maximum number of bytes to copy.
	 * @return Heap string the caller must free(), or NULL if allocation fails.
	 */
	char *substr(const char *source, int length)
	{
	    const size_t wanted = (NULL != source && length > 0) ? (size_t)length : 0;

	    char *result = malloc(wanted + 1);
	    if (NULL == result)
	    {
	        return NULL;
	    }

	    size_t copied = 0;
	    while (copied < wanted && '\0' != source[copied])
	    {
	        result[copied] = source[copied];
	        copied++;
	    }

	    result[copied] = '\0';
	    return result;
	}

	/*
	 * Returns a newly allocated lowercase copy of `str`.
	 *
	 * Each byte is converted to unsigned char before tolower() is called:
	 * passing a negative char value to the <ctype.h> functions is undefined
	 * behavior, and bytes >= 0x80 are negative where char is signed.
	 *
	 * @param str  NUL-terminated string to convert (may be NULL).
	 * @return Heap string the caller must free(), or NULL when `str` is NULL or
	 *         allocation fails.
	 */
	char *strtolower(const char *str)
	{
	    if (NULL == str)
	    {
	        return NULL;
	    }

	    const size_t length = strlen(str);
	    char *result = malloc(length + 1);
	    if (NULL == result)
	    {
	        return NULL;
	    }

	    for (size_t i = 0; i < length; i++)
	    {
	        result[i] = (char)tolower((unsigned char)str[i]);
	    }

	    result[length] = '\0';
	    return result;
	}

	/*
	 * Advances self->bytes_already_parsed past any run of space, tab, form feed,
	 * carriage return and line feed bytes found at the current position.
	 */
	void skip_whitespace(HTML_Tag_Processor_State *self)
	{
	    const char *cursor = self->html + self->bytes_already_parsed;
	    const size_t run = strspn(cursor, " \t\f\r\n");

	    self->bytes_already_parsed += run;
	}

	int main()
	{
	HTML_Tag_Processor_State *processor = malloc(sizeof(HTML_Tag_Processor_State));
	HTML_Tag_Processor_init(processor, "class=\"foo bar car\" id=\"baz\"");
	bool parsed = parse_next_attribute(processor);

	printf("parse_next_attribute found an attribute: %s \n", (parsed ? "YES" : "NO"));
	printf("html: %s \n", processor->html);
	if (processor->attributes != NULL && sizeof(processor->attributes) > 0)
	{
	printf("Attribute name: %s %d %d value=%s\n",
	processor->attributes[0].name,
	processor->attributes[0].value_starts_at,
	processor->attributes[0].value_length,
	substr(processor->html + processor->attributes[0].value_starts_at, processor->attributes[0].value_length));
	}

	// Clean up
	// Free other dynamically allocated memory as needed

	return 0;
	}