Created
April 27, 2014 01:46
-
-
Save CurtisFenner/11335743 to your computer and use it in GitHub Desktop.
HTML Parser Beginning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
// Short names for the parser's node types. They are declared up front because
// the structs below refer to each other through pointers.
typedef struct docstring docstring;
typedef struct wdocument wdocument;
typedef struct wattribute wattribute;
// One attribute of a tag, kept as a node of a singly linked list.
struct wattribute {
    struct docstring *name;   // Attribute name.
    struct docstring *value;  // Attribute value; the parser points this at
                              // `name` when the attribute has no value.
    struct wattribute *next;  // Next attribute of the same tag, or NULL.
};
// A string that is either a slice of a document's text or a separate buffer.
struct docstring {
    struct wdocument *document;  // Document whose text `start`/`length` index.
    long start;                  // Offset of the first character in the text.
    long length;                 // Number of characters in the slice.
    long slen;                   // When positive, the string is the first
                                 // `slen` characters of `str` and the slice
                                 // fields above are not consulted.
    char *str;                   // The string value when it cannot be sourced
                                 // from the document.
};
typedef struct wtag wtag;

// One node of the parsed tree: an element, or a piece of text.
struct wtag {
    struct wtag *previousSibling;
    struct wtag *nextSibling;
    struct wtag *firstChild;
    struct wtag *parent;
    struct wattribute *firstattribute;  // Head of the attribute list, or NULL.
    struct docstring *data;     // Text content, for such tags.
    struct docstring *tag;      // The whole tag, including < and >.
    struct docstring *tagname;  // Just the word. NOTE(review): meant to be
                                // lowercase, but nothing visible in this file
                                // lowercases it - confirm before relying on it.
};
// A heap buffer together with the number of bytes it holds.
struct lengthed {
    long size;    // Number of bytes stored at `place`.
    char *place;  // Start of the buffer.
};
// A parsed document: the source text plus the root of the node tree.
struct wdocument {
    struct wtag *root;  // Root node of the tree.
    char *text;         // The document's characters; docstrings index into it.
    long length;        // Number of characters in `text`.
};
struct lengthed fileIntoString(FILE *file) { | |
fseek(file, 0, SEEK_END); | |
long size; | |
size = ftell(file); | |
rewind(file); | |
char *str = malloc((size+1) * (sizeof(char))); | |
fread(str, sizeof(char), size, file); | |
str[size] = '\0'; | |
fclose(file); | |
struct lengthed returnable = {}; | |
returnable.size = size; | |
returnable.place = str; | |
return returnable; | |
} | |
// Writes the characters of `j` to stdout, with no trailing newline.
//
// A positive `slen` selects the first `slen` characters of `j.str`; otherwise
// the slice [start, start + length) of the owning document's text is written.
// Nothing is written when the selected source is missing, and a slice that
// would run past the end of the document is cut off at the document length.
void showDocString(docstring j) {
    if (j.slen > 0) {
        if (j.str != NULL) {
            fwrite(j.str, sizeof(char), (size_t)j.slen, stdout);
        }
        return;
    }

    if (j.document == NULL || j.document->text == NULL) {
        return;
    }
    if (j.start < 0 || j.length <= 0 || j.start >= j.document->length) {
        return;
    }

    long available = j.document->length - j.start;
    long count = j.length < available ? j.length : available;
    fwrite(j.document->text + j.start, sizeof(char), (size_t)count, stdout);
}
void showTag(wtag t) { | |
printf("\n"); | |
showDocString(*t.tag); | |
printf("\n"); | |
if (t.previousSibling != NULL) { | |
printf("\tPrev:\t"); | |
docstring* g = (*(t.previousSibling)).tag; | |
docstring u = *g; | |
showDocString(u); | |
printf("\n"); | |
} else { | |
printf("\tPrev:\tNULL\n"); | |
} | |
if (t.nextSibling != NULL) { | |
printf("\tNext:\t"); | |
docstring *g = (*(t.nextSibling)).tag; | |
showDocString(*g); | |
printf("\n"); | |
} else { | |
printf("\tNext:\tNULL\n"); | |
} | |
if (t.parent != NULL) { | |
printf("\tParent:\t"); | |
docstring *g = (*(t.parent)).tag; | |
showDocString(*g); | |
printf("\n"); | |
} else { | |
printf("\tParent:\tNULL\n"); | |
} | |
} | |
// Reports whether a tag NAME (not the whole tag) is an HTML void element,
// i.e. one that never has content or a closing tag.
//
// Only the leading name token of `s` is compared: scanning stops at the first
// space, '/', '>' or NUL, so a docstring whose length runs past the name
// (trailing attributes, or the slash of "<br/>") is still recognised.
// Matching is exact and case-sensitive against lowercase names.
// Returns true (1) for a void element and false (0) otherwise.
int isSelfClosing(docstring s) {
    static const char *const voidNames[] = {
        "area", "base", "basefont", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "param", "source", "track", "wbr"
    };

    // Same convention as showDocString: a positive slen means the characters
    // live in `str`, otherwise they are a slice of the document text.
    const char *name = NULL;
    long limit = 0;
    if (s.slen > 0) {
        name = s.str;
        limit = s.slen;
    } else if (s.document != NULL && s.document->text != NULL
               && s.start >= 0 && s.start < s.document->length) {
        name = s.document->text + s.start;
        limit = s.length;
    }
    if (name == NULL) {
        return false;
    }

    size_t nameLength = 0;
    while ((long)nameLength < limit) {
        char c = name[nameLength];
        if (c == '\0' || c == ' ' || c == '/' || c == '>') {
            break;
        }
        nameLength++;
    }

    for (size_t k = 0; k < sizeof voidNames / sizeof voidNames[0]; k++) {
        if (strlen(voidNames[k]) == nameLength
            && memcmp(name, voidNames[k], nameLength) == 0) {
            return true;
        }
    }
    return false;
}
wdocument html_document(FILE* infile) { | |
char *file; | |
struct lengthed from = fileIntoString(infile); | |
file = from.place; | |
for (int i = 0; i < from.size; i++) { | |
char c = file[i]; | |
if (c == '\t' || c == '\n' || c == '\r') { | |
file[i] = ' '; | |
} | |
} | |
wdocument document = {}; | |
document.text = file; | |
document.length = from.size; | |
wtag root; | |
document.root = &root; | |
root.parent = NULL; | |
root.nextSibling = NULL; | |
root.firstChild = NULL; | |
root.previousSibling = NULL; | |
docstring *rootname = (malloc(sizeof(docstring))); | |
(*rootname).slen = 9; | |
(*rootname).str = &("#document"); | |
(*rootname).document = &document; | |
root.tagname = rootname; | |
root.tag = rootname; | |
const int WAIT_FOR_OPEN_TAG = 0; //Waits for a tag to start (waiting for <) | |
const int WAIT_FOR_NAME_END = 1; //Waits for a tag's name to end either at a space or > | |
const int WAIT_FOR_ATTRIBUTE = 8; | |
const int WAIT_FOR_ATTRIBUTE_END = 2; //either =, space, or > to mark end of attribute name. | |
const int WAIT_FOR_TAG_END = 3; //Waits for either > or a letter. | |
const int WAIT_FOR_SCRIPT_CLOSE = 7; //Waits for the string </script> to declare a script tag closed. | |
const int WAIT_FOR_ATTRIBUTE_VALUE = 9; | |
const int WAIT_FOR_ATTRIBUTE_QUOTING = 10; | |
const int CLOSE_PARENT = -1; //Close the parent tag | |
const int OPEN_NEW = -2; //Close the previous tag | |
const int DO_NOTHING = -3; //Do nothing (e.g., </p> if i was being | |
//really compliant and liked to do this for some reason) | |
const int BE_SIBLING = -4; // E.g., <br><br><br>.. | |
docstring TEXTNODE; | |
TEXTNODE.document = &document; | |
TEXTNODE.slen = 5; | |
TEXTNODE.str = &("#text"); | |
int within[20]; | |
for (int i = 0; i < 20; i++) { | |
within[i] = 0; //Debugging only | |
} | |
wtag* previous = NULL; //The previous sibling. | |
wtag* parent = &root; | |
wtag* now = NULL; | |
wattribute* attr = NULL; | |
wattribute* pattr = NULL; | |
//The previous attribute on the current node. | |
//To be reset upon creating a new `now` | |
int nmode = 0; | |
for (int i = 0; i < document.length; i++) { | |
char c = file[i]; | |
int mode = nmode; | |
within[mode]++; | |
if (mode == WAIT_FOR_OPEN_TAG) { | |
//Waits for a tag to start (waiting for <) | |
if (c == '<') { | |
if (now != NULL) { | |
//A text node that we have to append in. | |
if (previous == NULL) { | |
(*parent).firstChild = now; | |
previous = now; | |
} else { | |
(*previous).nextSibling = now; | |
previous = now; | |
} | |
} | |
now = malloc(sizeof(wtag)); | |
(*now).parent = NULL; | |
(*now).previousSibling = NULL; | |
(*now).nextSibling = NULL; | |
(*now).tag = malloc(sizeof(docstring)); | |
(*(*now).tag).slen = 0; | |
(*(*now).tag).document = &document; | |
(*(*now).tag).start = i; | |
(*now).tagname = malloc(sizeof(docstring)); | |
(*(*now).tagname).slen = 0; | |
(*(*now).tagname).document = &document; | |
(*(*now).tagname).start = i + 1; | |
nmode = WAIT_FOR_NAME_END; | |
} else { | |
if (now != NULL) { | |
//We don't really have anything to do, except perhaps update the length. | |
(* now -> data).length = i + 1 - (*(*now).data).start; | |
} else { | |
now = malloc(sizeof(wtag)); | |
now -> parent = parent; | |
now -> previousSibling = previous; | |
now -> nextSibling = NULL; | |
now -> tag = &TEXTNODE; | |
now -> tagname = &TEXTNODE; | |
now -> data = malloc(sizeof(docstring)); | |
(* now -> data).start = i; | |
pattr = NULL; | |
attr = NULL; | |
} | |
} | |
} | |
if (mode == WAIT_FOR_ATTRIBUTE_END) { | |
if (c == ' ') { | |
//The attribute's name is over. | |
//We now have to wait for: | |
//a) = sign (write attribute value) | |
//b) letter (start new tag, set up this one to copy value from name) | |
//c) > (finish tag) | |
attr -> name -> length = i - attr -> name -> start; | |
nmode = WAIT_FOR_ATTRIBUTE_VALUE; | |
} | |
if (c == '=') { | |
//Wait for attribute value. | |
attr -> name -> length = i - attr -> name -> start; | |
mode = WAIT_FOR_ATTRIBUTE_VALUE; | |
} | |
} | |
if (mode == WAIT_FOR_ATTRIBUTE_QUOTING) { | |
} | |
if (mode == WAIT_FOR_ATTRIBUTE_VALUE) { | |
if (c == '=') { | |
//We have a specified value | |
nmode = WAIT_FOR_ATTRIBUTE_QUOTING; | |
} | |
if (c >= 'a' && c <= 'z') { | |
//We have not hit an =, so this is a new attribute. | |
//Specify value to be the same as the name | |
attr -> value = attr -> name; | |
mode = WAIT_FOR_ATTRIBUTE; //Process accordingly | |
pattr = attr; | |
} | |
} | |
if (mode == WAIT_FOR_ATTRIBUTE) { | |
if (c >= 'a' && c <= 'z') { | |
attr = malloc(sizeof(wattribute)); | |
if (pattr != NULL) { | |
patter -> next = attr; | |
} | |
attr -> name = malloc(sizeof(docstring)); | |
attr -> name -> document = &document; | |
attr -> value = malloc(sizeof(docstring)); | |
attr -> value -> document = &document; | |
attr -> next = NULL; | |
if (now -> firstattribute == NULL) { | |
now -> firstattribute = attr; | |
} | |
nmode = WAIT_FOR_ATTRIBUTE_END; | |
} | |
if (c == '>') { | |
mode = WAIT_FOR_NAME_END; //Finish up the tag. | |
} | |
} | |
if (mode == WAIT_FOR_NAME_END) { | |
//Waits for a tag's name to end either at a space or > | |
if (c == ' ') { | |
//Attributes TODO: | |
nmode = WAIT_FOR_ATTRIBUTE; | |
} | |
if (c == '>') { | |
//The tag is over. | |
(*(*now).tag).length = i - (*(*now).tag).start + 1; //Includes < and > | |
(*(*now).tagname).length = i - (*(*now).tagname).start; //Includes < and > | |
char firstLetter = document.text[ (*(*now).tag).start + 1 ]; | |
int action = OPEN_NEW; | |
if (firstLetter == '!') { | |
action = BE_SIBLING; | |
} | |
if (firstLetter == '/') { | |
action = CLOSE_PARENT; | |
} | |
if (isSelfClosing(*(*now).tagname)) { | |
action = BE_SIBLING; | |
} | |
if (action == BE_SIBLING || action == OPEN_NEW) { | |
if (previous != NULL) { | |
(*previous).nextSibling = now; | |
(*now).parent = (*previous).parent; | |
(*now).previousSibling = previous; | |
previous = now; | |
} else { | |
previous = now; | |
(*parent).firstChild = now; | |
(*now).parent = parent; | |
} | |
} | |
if (action == OPEN_NEW) { | |
previous = NULL; | |
parent = now; | |
} | |
if (action == CLOSE_PARENT) { | |
free(now); | |
now = NULL; | |
previous = parent; | |
parent = (*parent).parent; | |
} | |
if (now != NULL) { | |
showTag(*now); | |
} | |
now = NULL; | |
nmode = WAIT_FOR_OPEN_TAG; | |
} | |
} | |
} | |
if (now != NULL) { | |
//Dangling text node | |
previous -> nextSibling = now; | |
} | |
printf("\n\nWithin:\n"); | |
for (int i = 0; i < 20; i++) { | |
printf(" %i ",within[i]); | |
} | |
printf("\nDocument Length: %i",document.length); | |
return document; | |
} | |
int main() { | |
HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); | |
CONSOLE_SCREEN_BUFFER_INFO consoleInfo; | |
WORD saved_attributes; | |
/* Save current attributes */ | |
GetConsoleScreenBufferInfo(hConsole, &consoleInfo); | |
saved_attributes = consoleInfo.wAttributes; | |
SetConsoleTextAttribute(hConsole, FOREGROUND_BLUE); | |
printf("This is some nice COLORFUL text, isn't it?\n"); | |
/* Restore original attributes */ | |
SetConsoleTextAttribute(hConsole, saved_attributes); | |
wdocument doc = html_document(fopen("simple.html","rb")); | |
printf("\n\n\n\n\n"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment