Created
February 10, 2022 22:03
-
-
Save sneppy/3f92693eb109052b97fb8c1bcb30ce05 to your computer and use it in GitHub Desktop.
A simple non-compliant XML parser implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <set>
#include <string>
#include <utility>
#include <vector>
/*
 * Passes `x` through std::forward using the type of the parenthesized
 * expression `(x)`. For a named variable `decltype((x))` is always an lvalue
 * reference, so the argument is always handed on as an lvalue and is never
 * moved from, even when FORWARD is applied to the same variable repeatedly.
 */
#define FORWARD(x) std::forward<decltype((x))>((x))

/*
 * Character classification helpers backing the XML_TOKEN_* macros below.
 * They take `int` so that a `char` argument is promoted exactly as it would be
 * in a direct comparison against a character literal, and they evaluate their
 * argument exactly once (the macros may therefore be given expressions with
 * side effects, e.g. `*p++`).
 */
static constexpr bool xmlToken_isWhitespace(int c)
{
	return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

static constexpr bool xmlToken_isUpperAlpha(int c)
{
	return c >= 'A' && c <= 'Z';
}

static constexpr bool xmlToken_isLowerAlpha(int c)
{
	return c >= 'a' && c <= 'z';
}

static constexpr bool xmlToken_isAlpha(int c)
{
	return xmlToken_isUpperAlpha(c) || xmlToken_isLowerAlpha(c);
}

static constexpr bool xmlToken_isDigit(int c)
{
	return c >= '0' && c <= '9';
}

static constexpr bool xmlToken_isWord(int c)
{
	return xmlToken_isAlpha(c) || xmlToken_isDigit(c) || c == '_';
}

/* True for any character allowed inside a name (after the first one). */
static constexpr bool xmlToken_isName(int c)
{
	return xmlToken_isWord(c) || c == '.' || c == '-' || c == ':';
}

/* True for any character allowed as the first character of a name. */
static constexpr bool xmlToken_isNameBegin(int c)
{
	return xmlToken_isAlpha(c) || c == '_' || c == ':';
}

#define XML_TOKEN_WHITESPACE(x) (xmlToken_isWhitespace((x)))
#define XML_TOKEN_UPPER_ALPHA(x) (xmlToken_isUpperAlpha((x)))
#define XML_TOKEN_LOWER_ALPHA(x) (xmlToken_isLowerAlpha((x)))
#define XML_TOKEN_ALPHA(x) (xmlToken_isAlpha((x)))
#define XML_TOKEN_DIGIT(x) (xmlToken_isDigit((x)))
#define XML_TOKEN_WORD(x) (xmlToken_isWord((x)))
#define XML_TOKEN_NAME(x) (xmlToken_isName((x)))
#define XML_TOKEN_NAME_BEGIN(x) (xmlToken_isNameBegin((x)))
/*
 * A single lexical token, described by its type and by the half-open range
 * [beginOffset, endOffset) of characters it covers. The token does not hold
 * the text itself: offsets are resolved against a markup pointer supplied by
 * the caller of begin()/end().
 */
struct XMLToken
{
	enum Type
	{
		Type_None,
		Type_EOF,
		Type_Whitespace,
		Type_Name,
		Type_LT,
		Type_GT,
		Type_EQ,
		Type_Colon,
		Type_Slash,
		Type_Backslash,
		Type_Plus,
		Type_Minus,
		Type_Quote,
		Type_DQuote
	};

	/* The type of the token. */
	Type type = Type_None;

	/* The offset of the beginning of the token in number of characters. */
	size_t beginOffset = 0;

	/* The offset of the end of the token, in number of characters. */
	size_t endOffset = 0;

	/* Returns a pointer to the first character of the token inside `markup`. */
	char const* begin(char const* const markup) const
	{
		return markup + beginOffset;
	}

	/* Returns a pointer one past the last character of the token inside `markup`. */
	char const* end(char const* const markup) const
	{
		return markup + endOffset;
	}
};
class XMLTokenizer | |
{ | |
public: | |
inline XMLTokenizer(char const* xml) | |
: markup{xml} | |
, tok{} | |
, offset{0} | |
{ | |
// Point to first token | |
next(); | |
} | |
inline XMLToken const& operator*() const | |
{ | |
return tok; | |
} | |
inline XMLToken const* operator->() const | |
{ | |
return &(**this); | |
} | |
inline char const* begin() const | |
{ | |
return tok.begin(markup); | |
} | |
inline char const* end() const | |
{ | |
return tok.end(markup); | |
} | |
inline XMLTokenizer& operator++() | |
{ | |
next(); | |
return *this; | |
} | |
inline XMLTokenizer operator++(int) | |
{ | |
XMLTokenizer other{*this}; | |
++(*this); | |
return other; | |
} | |
inline bool read(XMLToken::Type type, bool const skipWhitespace = false) | |
{ | |
while (skipWhitespace && tok.type == XMLToken::Type_Whitespace) | |
next(); | |
if (tok.type == type) | |
{ | |
next(); | |
return true; | |
} | |
return false; | |
} | |
protected: | |
/* The XML markup string. */ | |
char const* markup; | |
/* The current token. */ | |
XMLToken tok; | |
/* Current offset in the markup. */ | |
size_t offset; | |
void next() | |
{ | |
// Reset token type | |
tok.type = XMLToken::Type_None; | |
while (tok.type == XMLToken::Type_None) | |
{ | |
tok.beginOffset = offset; | |
tok.endOffset = offset + 1; | |
switch (markup[offset]) | |
{ | |
case '\0': | |
tok.type = XMLToken::Type_EOF; | |
break; | |
case '<': | |
tok.type = XMLToken::Type_LT; | |
break; | |
case '>': | |
tok.type = XMLToken::Type_GT; | |
break; | |
case '=': | |
tok.type = XMLToken::Type_EQ; | |
break; | |
case '/': | |
tok.type = XMLToken::Type_Slash; | |
break; | |
case '\\': | |
tok.type = XMLToken::Type_Backslash; | |
break; | |
case '+': | |
tok.type = XMLToken::Type_Plus; | |
break; | |
case '-': | |
tok.type = XMLToken::Type_Minus; | |
break; | |
case '\'': | |
tok.type = XMLToken::Type_Quote; | |
break; | |
case '"': | |
tok.type = XMLToken::Type_DQuote; | |
break; | |
default: | |
{ | |
if (XML_TOKEN_WHITESPACE(markup[offset])) | |
{ | |
tok.type = XMLToken::Type_Whitespace; | |
for (; XML_TOKEN_WHITESPACE(markup[tok.endOffset]); tok.endOffset++); | |
break; | |
} | |
if (XML_TOKEN_NAME_BEGIN(markup[offset])) | |
{ | |
tok.type = XMLToken::Type_Name; | |
for (; XML_TOKEN_NAME(markup[tok.endOffset]); tok.endOffset++); | |
break; | |
} | |
} | |
break; | |
} | |
// Adjust offset | |
offset = tok.endOffset; | |
} | |
} | |
}; | |
template<typename VisitorT> | |
static bool xmlParse_ElementAttr(XMLTokenizer& tokenizer, [[maybe_unused]]VisitorT&& visitor) | |
{ | |
std::string attrName, attrVal; | |
if (!tokenizer.read(XMLToken::Type_Whitespace)) | |
return false; | |
if (tokenizer->type != XMLToken::Type_Name) | |
return false; | |
attrName = {tokenizer.begin(), tokenizer.end()}; | |
tokenizer++; | |
if (tokenizer.read(XMLToken::Type_EQ)) | |
{ | |
if (tokenizer->type != XMLToken::Type_Quote && tokenizer->type != XMLToken::Type_DQuote) | |
return false; | |
auto quoteType = tokenizer->type; | |
tokenizer++; | |
char const* beginVal = tokenizer.begin(); | |
bool escape = false; | |
while ((tokenizer->type != quoteType || escape) && tokenizer->type != XMLToken::Type_EOF) | |
{ | |
escape = tokenizer->type == XMLToken::Type_Backslash; | |
tokenizer++; | |
} | |
char const* endVal = tokenizer.begin(); | |
if (!tokenizer.read(quoteType)) | |
return false; | |
attrVal = {beginVal, endVal}; | |
} | |
visitor.elementAttr(std::move(attrName), std::move(attrVal)); | |
return true; | |
} | |
template<typename VisitorT> | |
static bool xmlParse_ElementAttrs(XMLTokenizer& tokenizer, [[maybe_unused]]VisitorT&& visitor) | |
{ | |
for (XMLTokenizer checkpoint{tokenizer}; xmlParse_ElementAttr(checkpoint, FORWARD(visitor));) | |
{ | |
tokenizer = checkpoint; | |
} | |
return true; | |
} | |
template<typename VisitorT> | |
static bool xmlParse_ElementContent(XMLTokenizer& tokenizer, VisitorT&& visitor) | |
{ | |
while (tokenizer->type != XMLToken::Type_LT) | |
tokenizer++; | |
// TODO: Save text content | |
for (XMLTokenizer checkpoint{tokenizer}; xmlParse_Element(checkpoint, FORWARD(visitor));) | |
{ | |
tokenizer = checkpoint; | |
} | |
return true; | |
} | |
template<typename VisitorT> | |
static bool xmlParse_Element(XMLTokenizer& tokenizer, VisitorT&& visitor) | |
{ | |
if (!tokenizer.read(XMLToken::Type_LT, true)) | |
return false; | |
if (tokenizer->type != XMLToken::Type_Name) | |
return false; | |
std::string tag = {tokenizer.begin(), tokenizer.end()}; | |
visitor.elementBegin(tag); | |
tokenizer++; | |
if (!xmlParse_ElementAttrs(tokenizer, FORWARD(visitor))) | |
return false; | |
if (tokenizer.read(XMLToken::Type_Slash, true) && tokenizer.read(XMLToken::Type_GT)) | |
; | |
else | |
{ | |
if (!tokenizer.read(XMLToken::Type_GT, true)) | |
return false; | |
if (!xmlParse_ElementContent(tokenizer, FORWARD(visitor))) | |
return false; | |
if (!(tokenizer.read(XMLToken::Type_LT, true) && tokenizer.read(XMLToken::Type_Slash))) | |
return false; | |
if (tokenizer->type != XMLToken::Type_Name || !std::equal(tag.begin(), tag.end(), tokenizer.begin())) | |
return false; | |
tokenizer++; | |
if (!tokenizer.read(XMLToken::Type_GT, true)) | |
return false; | |
} | |
visitor.elementEnd(tag); | |
return true; | |
} | |
template<typename VisitorT> | |
static void xmlParse_Document(XMLTokenizer& tokenizer, VisitorT&& visitor) | |
{ | |
xmlParse_Element(tokenizer, FORWARD(visitor)); | |
} | |
template<typename VisitorT> | |
static void xmlParse(char const* markup, VisitorT&& visitor) | |
{ | |
XMLTokenizer tokenizer{markup}; | |
xmlParse_Document(tokenizer, FORWARD(visitor)); | |
} | |
/*
 * Sample document fed to the parser by main(). It is a single line of text:
 * the fragments below are joined by string literal concatenation, and raw
 * literals are used so that the attribute quotes need no escaping.
 */
static char const sample[] =
	R"(<data>)"
	R"( <country name="Liechtenstein">)"
	R"( <rank>1</rank>)"
	R"( <year>2008</year>)"
	R"( <gdppc>141100</gdppc>)"
	R"( <neighbor name="Austria" direction="E"/>)"
	R"( <neighbor name="Switzerland" direction="W"/>)"
	R"( </country>)"
	R"( <country name="Singapore">)"
	R"( <rank>4</rank>)"
	R"( <year>2011</year>)"
	R"( <gdppc>59900</gdppc>)"
	R"( <neighbor name="Malaysia" direction="N"/>)"
	R"( </country>)"
	R"( <country name="Panama">)"
	R"( <rank>68</rank>)"
	R"( <year>2011</year>)"
	R"( <gdppc>13600</gdppc>)"
	R"( <neighbor name="Costa Rica" direction="W"/>)"
	R"( <neighbor name="Colombia" direction="E"/>)"
	R"( </country>)"
	R"(</data>)";
int main() | |
{ | |
struct MaterialXMLParser | |
{ | |
void elementBegin(std::string const& tag) | |
{ | |
printf("[%s", tag.c_str()); | |
} | |
void elementAttr(std::string&& name, std::string&& val) | |
{ | |
printf(" %s=%s", name.c_str(), val.c_str()); | |
} | |
void elementEnd(std::string const& tag) | |
{ | |
printf("]"); | |
} | |
}; | |
xmlParse(sample, MaterialXMLParser{}); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment