Skip to content

Instantly share code, notes, and snippets.

@CurtisFenner
Created April 27, 2014 01:46
Show Gist options
  • Save CurtisFenner/11335743 to your computer and use it in GitHub Desktop.
Save CurtisFenner/11335743 to your computer and use it in GitHub Desktop.
HTML Parser Beginning
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
/* Forward declarations: the structs below refer to one another through pointers. */
typedef struct docstring docstring;
typedef struct wdocument wdocument;
typedef struct wattribute wattribute;
/* One attribute of a tag, kept as a node of a singly linked list. */
struct wattribute {
    docstring *name;   /* the attribute's name */
    docstring *value;  /* the attribute's value; html_document points this at
                          `name` when an attribute has no explicit value */
    wattribute *next;  /* following attribute on the same tag, or NULL */
};
/*
 * A piece of text that is either a slice of a document's text or a
 * standalone character buffer. showDocString treats slen > 0 as "read from
 * str" and anything else as "read the slice [start, start + length) of
 * document->text".
 */
struct docstring {
    wdocument *document;  /* document whose text is sliced when str is not used */
    long start;           /* offset of the slice's first character */
    long length;          /* number of characters in the slice */
    long slen;            /* when positive: character count of str, which then
                             takes the place of the document slice */
    char *str;            /* the text itself, for strings that cannot be
                             sourced from the document */
};
typedef struct wtag wtag;

/* One node of the parsed tree, linked to its siblings, children and parent. */
struct wtag {
    wtag *previousSibling;
    wtag *nextSibling;
    wtag *firstChild;
    wtag *parent;
    wattribute *firstattribute;  /* head of this tag's attribute list */
    docstring *data;             /* text content, for nodes that carry text */
    docstring *tag;              /* the complete tag, including < and > */
    docstring *tagname;          /* only the name word; meant to be lowercase.
                                    NOTE(review): nothing visible in this file
                                    enforces the lowercase guarantee - confirm */
};
/* A character buffer together with its length; the result type of fileIntoString. */
struct lengthed {
    long size;    /* number of characters stored in place */
    char *place;  /* the buffer holding those characters */
};
/* A loaded document: its raw text plus the root of the tree built from it. */
struct wdocument {
    wtag *root;   /* top node of the tree (html_document uses a "#document" node) */
    char *text;   /* the document's characters */
    long length;  /* number of characters in text */
};
struct lengthed fileIntoString(FILE *file) {
fseek(file, 0, SEEK_END);
long size;
size = ftell(file);
rewind(file);
char *str = malloc((size+1) * (sizeof(char)));
fread(str, sizeof(char), size, file);
str[size] = '\0';
fclose(file);
struct lengthed returnable = {};
returnable.size = size;
returnable.place = str;
return returnable;
}
/*
 * Writes the characters of `j` to standard output.
 *
 * When j.slen is positive the first j.slen characters of j.str are written;
 * otherwise the slice [j.start, j.start + j.length) of j.document's text is
 * written. Nothing is written when the selected source is missing (NULL) or
 * the slice is empty or has a negative offset.
 */
void showDocString(docstring j) {
    if (j.slen > 0) {
        if (j.str != NULL) {
            fwrite(j.str, sizeof(char), (size_t)j.slen, stdout);
        }
        return;
    }

    if (j.document == NULL || j.document->text == NULL) {
        return;
    }
    /* Offsets stay `long` throughout so large slices are not narrowed to int. */
    if (j.start < 0 || j.length <= 0) {
        return;
    }
    fwrite(j.document->text + j.start, sizeof(char), (size_t)j.length, stdout);
}
void showTag(wtag t) {
printf("\n");
showDocString(*t.tag);
printf("\n");
if (t.previousSibling != NULL) {
printf("\tPrev:\t");
docstring* g = (*(t.previousSibling)).tag;
docstring u = *g;
showDocString(u);
printf("\n");
} else {
printf("\tPrev:\tNULL\n");
}
if (t.nextSibling != NULL) {
printf("\tNext:\t");
docstring *g = (*(t.nextSibling)).tag;
showDocString(*g);
printf("\n");
} else {
printf("\tNext:\tNULL\n");
}
if (t.parent != NULL) {
printf("\tParent:\t");
docstring *g = (*(t.parent)).tag;
showDocString(*g);
printf("\n");
} else {
printf("\tParent:\tNULL\n");
}
}
/*
 * Reports whether a tag name is an HTML void ("self-closing") element.
 *
 * s: a docstring for the tag NAME, not the whole tag. Only the leading word
 *    is examined - reading stops at whitespace, '/', '>' or NUL - so a
 *    docstring whose length also covers attributes ("img src=x") or a
 *    trailing slash ("br/") is still classified by its name. The comparison
 *    ignores ASCII letter case. Like showDocString, a positive s.slen selects
 *    s.str as the text; otherwise the slice of s.document is used.
 *
 * Returns true for area, base, br, col, embed, hr, img, input, link, meta,
 * param, source, track and wbr; false for everything else, including empty
 * names, closing-tag names (leading '/') and docstrings with no text.
 */
int isSelfClosing(docstring s) {
    static const char *const voidNames[] = {
        "area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "param", "source", "track", "wbr"
    };

    const char *text = NULL;
    long avail = 0;
    if (s.slen > 0) {
        text = s.str;
        avail = s.slen;
    } else if (s.document != NULL && s.document->text != NULL
               && s.start >= 0 && s.start <= s.document->length) {
        /* Never look past the end of the document, whatever length claims. */
        long room = s.document->length - s.start;
        text = s.document->text + s.start;
        avail = s.length < room ? s.length : room;
    }
    if (text == NULL) {
        return false;
    }

    /* Copy the name word, lowercased. The longest void element name has six
       letters, so a word that does not fit here cannot be one of them. */
    char name[8];
    size_t used = 0;
    for (long i = 0; i < avail; i++) {
        char c = text[i];
        if (c == ' ' || c == '\t' || c == '\n' || c == '\r'
            || c == '/' || c == '>' || c == '\0') {
            break;
        }
        if (used + 1 >= sizeof name) {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            c = (char)(c - 'A' + 'a');
        }
        name[used++] = c;
    }
    name[used] = '\0';

    for (size_t k = 0; k < sizeof voidNames / sizeof voidNames[0]; k++) {
        if (strcmp(name, voidNames[k]) == 0) {
            return true;
        }
    }
    return false;
}
/* Allocates `size` zero-filled bytes; reports the failure and exits when the
   allocation cannot be satisfied, so callers never see NULL. */
static void *allocZeroed(size_t size) {
    void *block = calloc(1, size);
    if (block == NULL) {
        fprintf(stderr, "html_document: out of memory\n");
        exit(EXIT_FAILURE);
    }
    return block;
}

/* Creates a docstring viewing `doc`'s text from offset `start`; its length is
   0 until the caller sets it. */
static docstring *newDocSlice(wdocument *doc, long start) {
    docstring *slice = allocZeroed(sizeof *slice);
    slice->document = doc;
    slice->start = start;
    return slice;
}

/* Creates a docstring whose text is the NUL-terminated string `text` itself
   (not copied) rather than a slice of the document. */
static docstring *newLiteralString(wdocument *doc, char *text) {
    docstring *literal = allocZeroed(sizeof *literal);
    literal->document = doc;
    literal->str = text;
    literal->slen = (long)strlen(text);
    return literal;
}

/* Gives a valueless attribute its own name as value, releasing the unused
   value docstring. Afterwards `value` and `name` are the same object. */
static void useNameAsValue(wattribute *a) {
    if (a->value != a->name) {
        free(a->value);
        a->value = a->name;
    }
}

/* Frees an element node that was never linked into the tree, together with
   its tag/tagname docstrings and its attributes. Not for text nodes, whose
   tag docstring is shared. */
static void discardTag(wtag *node) {
    wattribute *a = node->firstattribute;
    while (a != NULL) {
        wattribute *following = a->next;
        if (a->value != a->name) {
            free(a->value);
        }
        free(a->name);
        free(a);
        a = following;
    }
    free(node->tag);
    free(node->tagname);
    free(node);
}

/*
 * Reads `infile` and builds a tree of tags and text nodes from its contents.
 *
 * infile: stream to read; it is consumed and closed by fileIntoString.
 *         NULL is treated as an empty document.
 *
 * Returns a wdocument whose `text` is the file contents (tabs, newlines and
 * carriage returns replaced by spaces), whose `length` is the character
 * count, and whose `root` is a "#document" node owning the parsed tree.
 * Every node, docstring and attribute is heap-allocated and stays valid
 * after the call; nothing in this file releases them. Docstrings refer back
 * to an internal heap copy of the document, not to the returned value.
 *
 * Side effects: prints each completed tag via showTag, then the per-state
 * character counters and the document length. Exits the process if memory
 * runs out.
 *
 * The parser is a character-driven state machine. `mode` is the state used
 * for the current character and `nmode` the state for the next one; a block
 * that assigns `mode` hands the SAME character on to a later block, so the
 * order of the blocks inside the loop matters.
 *
 * NOTE(review): attribute values are not parsed yet. Once '=' is seen the
 * machine enters WAIT_FOR_ATTRIBUTE_QUOTING, which has no transitions, so
 * the rest of the input is swallowed by that tag. Closing tags pop one level
 * without checking that their name matches the open element.
 */
wdocument html_document(FILE* infile) {
    /* Parser states; the values also index the `within` debug counters. */
    enum {
        WAIT_FOR_OPEN_TAG = 0,          /* waiting for '<' */
        WAIT_FOR_NAME_END = 1,          /* in a tag name, until space or '>' */
        WAIT_FOR_ATTRIBUTE_END = 2,     /* in an attribute name, until '=', space or '>' */
        WAIT_FOR_TAG_END = 3,           /* reserved: '>' or a letter */
        WAIT_FOR_SCRIPT_CLOSE = 7,      /* reserved: waiting for </script> */
        WAIT_FOR_ATTRIBUTE = 8,         /* between attributes */
        WAIT_FOR_ATTRIBUTE_VALUE = 9,   /* after an attribute name: '=', next name or '>' */
        WAIT_FOR_ATTRIBUTE_QUOTING = 10 /* after '=' (not implemented) */
    };
    /* What to do with a tag once its '>' has been reached. */
    enum {
        CLOSE_PARENT = -1, /* closing tag: pop back to the parent's level */
        OPEN_NEW = -2,     /* opening tag: later nodes become its children */
        DO_NOTHING = -3,   /* reserved */
        BE_SIBLING = -4    /* void tag or <!...>: stays a leaf, e.g. <br><br> */
    };
    enum { MODE_SLOTS = 20 };

    struct lengthed from = {0, NULL};
    if (infile != NULL) {
        from = fileIntoString(infile);
    }
    if (from.place == NULL || from.size < 0) {
        from.size = 0;
    }
    char *file = from.place;
    for (long i = 0; i < from.size; i++) {
        char c = file[i];
        if (c == '\t' || c == '\n' || c == '\r') {
            file[i] = ' ';
        }
    }

    /* The document and root live on the heap: every docstring keeps a pointer
       to the document and the result keeps a pointer to the root, so neither
       may be a local of this function. */
    wdocument *doc = allocZeroed(sizeof *doc);
    doc->text = file;
    doc->length = from.size;

    wtag *root = allocZeroed(sizeof *root);
    root->tag = newLiteralString(doc, "#document");
    root->tagname = root->tag;
    doc->root = root;

    /* Name shared by every text node. */
    docstring *textName = newLiteralString(doc, "#text");

    int within[MODE_SLOTS] = {0}; /* Debugging only: characters seen per state */

    wtag *previous = NULL;    /* last node linked under `parent`, if any */
    wtag *parent = root;      /* element currently receiving children */
    wtag *now = NULL;         /* node under construction (tag or text) */
    wattribute *attr = NULL;  /* attribute under construction */
    wattribute *pattr = NULL; /* previous attribute of the current tag */
    int nmode = WAIT_FOR_OPEN_TAG;

    for (long i = 0; i < doc->length; i++) {
        char c = file[i];
        int mode = nmode;
        within[mode]++;

        if (mode == WAIT_FOR_OPEN_TAG) {
            if (c == '<') {
                if (now != NULL) {
                    /* A text node was being collected; link it in first. */
                    if (previous == NULL) {
                        parent->firstChild = now;
                    } else {
                        previous->nextSibling = now;
                    }
                    previous = now;
                }
                now = allocZeroed(sizeof *now);
                now->tag = newDocSlice(doc, i);
                now->tagname = newDocSlice(doc, i + 1);
                now->tagname->length = -1; /* name length not known yet */
                /* Attribute chains must not carry over from an earlier tag. */
                attr = NULL;
                pattr = NULL;
                nmode = WAIT_FOR_NAME_END;
            } else if (now != NULL) {
                /* Still inside the same text node: extend it to cover c. */
                now->data->length = i + 1 - now->data->start;
            } else {
                now = allocZeroed(sizeof *now);
                now->parent = parent;
                now->previousSibling = previous;
                now->tag = textName;
                now->tagname = textName;
                now->data = newDocSlice(doc, i);
                now->data->length = 1;
                pattr = NULL;
                attr = NULL;
            }
        }

        if (mode == WAIT_FOR_ATTRIBUTE_END) {
            if (c == ' ') {
                /* The name is over; next comes '=', another name, or '>'. */
                attr->name->length = i - attr->name->start;
                nmode = WAIT_FOR_ATTRIBUTE_VALUE;
            }
            if (c == '=') {
                attr->name->length = i - attr->name->start;
                mode = WAIT_FOR_ATTRIBUTE_VALUE; /* let that state see the '=' */
            }
            if (c == '>') {
                /* Valueless attribute directly followed by the tag's end. */
                attr->name->length = i - attr->name->start;
                useNameAsValue(attr);
                mode = WAIT_FOR_NAME_END; /* finish up the tag */
            }
        }

        if (mode == WAIT_FOR_ATTRIBUTE_QUOTING) {
            /* Not implemented: see the NOTE in the function comment. */
        }

        if (mode == WAIT_FOR_ATTRIBUTE_VALUE) {
            if (c == '=') {
                /* We have a specified value. */
                nmode = WAIT_FOR_ATTRIBUTE_QUOTING;
            }
            if (c >= 'a' && c <= 'z') {
                /* No '=' was seen, so this letter starts a new attribute and
                   the finished one takes its name as its value. */
                useNameAsValue(attr);
                pattr = attr;
                mode = WAIT_FOR_ATTRIBUTE; /* process the letter there */
            }
            if (c == '>') {
                useNameAsValue(attr);
                mode = WAIT_FOR_NAME_END; /* finish up the tag */
            }
        }

        if (mode == WAIT_FOR_ATTRIBUTE) {
            if (c >= 'a' && c <= 'z') {
                attr = allocZeroed(sizeof *attr);
                if (pattr != NULL) {
                    pattr->next = attr;
                }
                attr->name = newDocSlice(doc, i);
                attr->value = newDocSlice(doc, 0);
                if (now->firstattribute == NULL) {
                    now->firstattribute = attr;
                }
                nmode = WAIT_FOR_ATTRIBUTE_END;
            }
            if (c == '>') {
                mode = WAIT_FOR_NAME_END; /* finish up the tag */
            }
        }

        if (mode == WAIT_FOR_NAME_END) {
            if (c == ' ') {
                /* The name word ends here; attributes may follow. */
                if (now->tagname->length < 0) {
                    now->tagname->length = i - now->tagname->start;
                }
                nmode = WAIT_FOR_ATTRIBUTE;
            }
            if (c == '>') {
                /* The tag is over. */
                now->tag->length = i - now->tag->start + 1; /* includes < and > */
                if (now->tagname->length < 0) {
                    now->tagname->length = i - now->tagname->start;
                }
                char firstLetter = doc->text[now->tag->start + 1];
                int action = OPEN_NEW;
                if (firstLetter == '!') {
                    action = BE_SIBLING;
                }
                if (firstLetter == '/') {
                    action = CLOSE_PARENT;
                }
                if (isSelfClosing(*now->tagname)) {
                    action = BE_SIBLING;
                }
                if (action == BE_SIBLING || action == OPEN_NEW) {
                    if (previous != NULL) {
                        previous->nextSibling = now;
                        now->parent = previous->parent;
                        now->previousSibling = previous;
                        previous = now;
                    } else {
                        previous = now;
                        parent->firstChild = now;
                        now->parent = parent;
                    }
                }
                if (action == OPEN_NEW) {
                    previous = NULL;
                    parent = now;
                }
                if (action == CLOSE_PARENT) {
                    discardTag(now);
                    now = NULL;
                    /* A stray closing tag at the top level has nothing to
                       close; never climb above the root. */
                    if (parent->parent != NULL) {
                        previous = parent;
                        parent = parent->parent;
                    }
                }
                if (now != NULL) {
                    showTag(*now);
                }
                now = NULL;
                nmode = WAIT_FOR_OPEN_TAG;
            }
        }
    }

    if (now != NULL) {
        /* Dangling node: trailing text, or a tag that never reached '>'. */
        if (now->parent == NULL) {
            now->parent = parent;
        }
        if (previous != NULL) {
            previous->nextSibling = now;
            now->previousSibling = previous;
        } else {
            parent->firstChild = now;
        }
    }

    printf("\n\nWithin:\n");
    for (int i = 0; i < MODE_SLOTS; i++) {
        printf(" %i ", within[i]);
    }
    printf("\nDocument Length: %ld", doc->length);

    /* The caller gets a copy; the heap instance stays alive for the
       docstrings that point back at it. */
    return *doc;
}
/*
 * Demo entry point: prints one line in blue on the Windows console, then
 * parses "simple.html" from the current directory with html_document.
 *
 * Returns 0 on success, 1 when "simple.html" cannot be opened.
 */
int main(void) {
    HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
    CONSOLE_SCREEN_BUFFER_INFO consoleInfo;

    /* Save the current attributes. The query fails when standard output is
       not a console (e.g. redirected to a file); in that case leave the
       colors alone rather than "restoring" an uninitialized value. */
    bool haveAttributes = GetConsoleScreenBufferInfo(hConsole, &consoleInfo) != 0;
    WORD saved_attributes = haveAttributes ? consoleInfo.wAttributes : 0;

    if (haveAttributes) {
        SetConsoleTextAttribute(hConsole, FOREGROUND_BLUE);
    }
    printf("This is some nice COLORFUL text, isn't it?\n");
    if (haveAttributes) {
        /* Restore original attributes */
        SetConsoleTextAttribute(hConsole, saved_attributes);
    }

    FILE *input = fopen("simple.html", "rb");
    if (input == NULL) {
        perror("simple.html");
        return 1;
    }
    wdocument doc = html_document(input); /* html_document closes the stream */
    (void)doc; /* only the debug output printed during parsing is used */

    printf("\n\n\n\n\n");
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment