Skip to content

Instantly share code, notes, and snippets.

@sneppy
Created February 10, 2022 22:03
Show Gist options
  • Save sneppy/3f92693eb109052b97fb8c1bcb30ce05 to your computer and use it in GitHub Desktop.
Save sneppy/3f92693eb109052b97fb8c1bcb30ce05 to your computer and use it in GitHub Desktop.
A simple non-compliant XML parser implementation
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <set>
#include <string>
#include <utility>
#include <vector>
/* Passes `x` through std::forward. Because of the inner parentheses, `decltype((x))` is an
   lvalue reference whenever `x` names a variable, so a named argument is handed on as an lvalue. */
#define FORWARD(x) std::forward<decltype((x))>((x))

/* Character classification helpers used by the tokenizer. They only recognise ASCII characters
   and may evaluate their argument more than once, so pass expressions without side effects. */

/* True for space, tab, line feed and carriage return. */
#define XML_TOKEN_WHITESPACE(x) (' ' == (x) || '\t' == (x) || '\n' == (x) || '\r' == (x))
/* True for 'A'..'Z'. */
#define XML_TOKEN_UPPER_ALPHA(x) ('A' <= (x) && 'Z' >= (x))
/* True for 'a'..'z'. */
#define XML_TOKEN_LOWER_ALPHA(x) ('a' <= (x) && 'z' >= (x))
/* True for any ASCII letter. */
#define XML_TOKEN_ALPHA(x) (XML_TOKEN_UPPER_ALPHA(x) || XML_TOKEN_LOWER_ALPHA(x))
/* True for '0'..'9'. */
#define XML_TOKEN_DIGIT(x) ('0' <= (x) && '9' >= (x))
/* True for letters, digits and the underscore. */
#define XML_TOKEN_WORD(x) (XML_TOKEN_ALPHA(x) || XML_TOKEN_DIGIT(x) || '_' == (x))
/* True for characters that may appear inside a name. */
#define XML_TOKEN_NAME(x) (XML_TOKEN_WORD(x) || '.' == (x) || '-' == (x) || ':' == (x))
/* True for characters that may start a name. */
#define XML_TOKEN_NAME_BEGIN(x) (XML_TOKEN_ALPHA(x) || '_' == (x) || ':' == (x))
/* A single lexical token, described as a [beginOffset, endOffset) range of characters inside a
   markup string. The token does not store the markup itself; pass it to begin()/end(). */
struct XMLToken
{
    /* The kinds of token. */
    enum Type
    {
        Type_None,
        Type_EOF,
        Type_Whitespace,
        Type_Name,
        Type_LT,
        Type_GT,
        Type_EQ,
        Type_Colon,
        Type_Slash,
        Type_Backslash,
        Type_Plus,
        Type_Minus,
        Type_Quote,
        Type_DQuote
    };

    /* The type of the token. */
    Type type = Type_None;

    /* The offset of the beginning of the token in number of characters. */
    size_t beginOffset = 0;

    /* The offset of the end of the token, in number of characters. */
    size_t endOffset = 0;

    /* Returns a pointer to the first character of the token inside `markup`. */
    inline char const* begin(char const* const markup) const
    {
        return markup + beginOffset;
    }

    /* Returns a pointer to the end of the token inside `markup`. */
    inline char const* end(char const* const markup) const
    {
        return markup + endOffset;
    }
};
/* Splits a NUL-terminated markup string into XMLToken values, one token at a time.
 *
 * The tokenizer only stores the pointer it is given, so the markup must outlive it. Copying a
 * tokenizer copies its position; the parsing functions rely on that to create checkpoints.
 * Characters that match no token type are skipped and never reported. */
class XMLTokenizer
{
public:
    /* Starts tokenizing `xml` and positions the tokenizer on the first token.
       A null pointer is treated as an empty document. */
    inline XMLTokenizer(char const* xml)
        : markup{xml != nullptr ? xml : ""}
        , tok{}
        , offset{0}
    {
        // Point to first token
        next();
    }

    /* Returns the current token. */
    inline XMLToken const& operator*() const
    {
        return tok;
    }

    /* Gives member access to the current token. */
    inline XMLToken const* operator->() const
    {
        return &tok;
    }

    /* Returns a pointer to the first character of the current token. */
    inline char const* begin() const
    {
        return tok.begin(markup);
    }

    /* Returns a pointer to the end of the current token. */
    inline char const* end() const
    {
        return tok.end(markup);
    }

    /* Advances to the next token and returns this tokenizer. */
    inline XMLTokenizer& operator++()
    {
        next();
        return *this;
    }

    /* Advances to the next token and returns a copy of the tokenizer as it was before. */
    inline XMLTokenizer operator++(int)
    {
        XMLTokenizer previous{*this};
        next();
        return previous;
    }

    /* Consumes the current token if it has the given type.
     *
     * When `skipWhitespace` is true, whitespace tokens in front of the current token are
     * consumed first; they stay consumed even when the call returns false.
     * Returns true if a token of type `type` was consumed. */
    inline bool read(XMLToken::Type type, bool const skipWhitespace = false)
    {
        if (skipWhitespace)
        {
            while (tok.type == XMLToken::Type_Whitespace)
                next();
        }

        if (tok.type != type)
            return false;

        next();
        return true;
    }

protected:
    /* The XML markup string. */
    char const* markup;

    /* The current token. */
    XMLToken tok;

    /* Current offset in the markup. */
    size_t offset;

    /* Scans the next token starting at `offset` and stores it in `tok`.
       Once the end of the markup is reached every further call yields Type_EOF again. */
    void next()
    {
        // Reset token type
        tok.type = XMLToken::Type_None;

        // A character that matches nothing leaves the type at Type_None, so the loop skips it
        while (tok.type == XMLToken::Type_None)
        {
            char const current = markup[offset];
            tok.beginOffset = offset;
            tok.endOffset = offset + 1;

            switch (current)
            {
            case '\0':
                tok.type = XMLToken::Type_EOF;
                // Zero-length token: never step over the terminator, otherwise the next
                // call would read past the end of the markup
                tok.endOffset = offset;
                break;

            case '<':
                tok.type = XMLToken::Type_LT;
                break;

            case '>':
                tok.type = XMLToken::Type_GT;
                break;

            case '=':
                tok.type = XMLToken::Type_EQ;
                break;

            case '/':
                tok.type = XMLToken::Type_Slash;
                break;

            case '\\':
                tok.type = XMLToken::Type_Backslash;
                break;

            case '+':
                tok.type = XMLToken::Type_Plus;
                break;

            case '-':
                tok.type = XMLToken::Type_Minus;
                break;

            case '\'':
                tok.type = XMLToken::Type_Quote;
                break;

            case '"':
                tok.type = XMLToken::Type_DQuote;
                break;

            default:
                if (XML_TOKEN_WHITESPACE(current))
                {
                    // A run of whitespace is a single token
                    tok.type = XMLToken::Type_Whitespace;
                    while (XML_TOKEN_WHITESPACE(markup[tok.endOffset]))
                        tok.endOffset++;
                }
                else if (XML_TOKEN_NAME_BEGIN(current))
                {
                    // A name runs for as long as name characters follow
                    tok.type = XMLToken::Type_Name;
                    while (XML_TOKEN_NAME(markup[tok.endOffset]))
                        tok.endOffset++;
                }
                break;
            }

            // Adjust offset
            offset = tok.endOffset;
        }
    }
};
/* Parses a single attribute: mandatory leading whitespace, a name and an optional quoted value
 * introduced by `=` (single or double quotes).
 *
 * On success `visitor.elementAttr(name, value)` is called, with an empty value when there is no
 * `=`, and true is returned. On failure false is returned; `tokenizer` may already have been
 * advanced, so callers that need to backtrack should pass a copy. */
template<typename VisitorT>
static bool xmlParse_ElementAttr(XMLTokenizer& tokenizer, [[maybe_unused]]VisitorT&& visitor)
{
    if (!tokenizer.read(XMLToken::Type_Whitespace))
        return false;

    if (tokenizer->type != XMLToken::Type_Name)
        return false;

    std::string attrName(tokenizer.begin(), tokenizer.end());
    std::string attrVal;
    ++tokenizer;

    if (tokenizer.read(XMLToken::Type_EQ))
    {
        XMLToken::Type const quoteType = tokenizer->type;
        if (quoteType != XMLToken::Type_Quote && quoteType != XMLToken::Type_DQuote)
            return false;

        // The value starts right behind the opening quote. Use the end of the quote token
        // instead of the beginning of the next token: the tokenizer skips characters that
        // have no token type (digits for instance), so the next token may begin after the
        // value has already started
        char const* const beginVal = tokenizer.end();
        ++tokenizer;

        bool escape = false;
        while ((tokenizer->type != quoteType || escape) && tokenizer->type != XMLToken::Type_EOF)
        {
            // A backslash escapes the following token, unless it is escaped itself
            escape = !escape && tokenizer->type == XMLToken::Type_Backslash;
            ++tokenizer;
        }

        // The value ends where the closing quote begins
        char const* const endVal = tokenizer.begin();
        if (!tokenizer.read(quoteType))
            return false;

        attrVal.assign(beginVal, endVal);
    }

    visitor.elementAttr(std::move(attrName), std::move(attrVal));
    return true;
}
/* Parses as many attributes as possible. Each attempt runs on a copy of the tokenizer, and
   `tokenizer` is only moved forward after an attribute parsed successfully. Always returns true. */
template<typename VisitorT>
static bool xmlParse_ElementAttrs(XMLTokenizer& tokenizer, [[maybe_unused]]VisitorT&& visitor)
{
    XMLTokenizer attempt{tokenizer};

    while (xmlParse_ElementAttr(attempt, FORWARD(visitor)))
    {
        // Commit the attribute that was just parsed
        tokenizer = attempt;
    }

    return true;
}
// Declared here because the element and content parsers call each other
template<typename VisitorT>
static bool xmlParse_Element(XMLTokenizer& tokenizer, VisitorT&& visitor);

/* Parses the content of an element: any number of child elements, with text in front of,
 * between and behind them. Text is currently skipped, not reported to the visitor.
 *
 * Each child is parsed on a copy of the tokenizer and `tokenizer` only moves past children that
 * parsed successfully. On return it sits on the first `<` that does not start a child element
 * (normally the closing tag) or at the end of the markup. Always returns true. */
template<typename VisitorT>
static bool xmlParse_ElementContent(XMLTokenizer& tokenizer, VisitorT&& visitor)
{
    for (;;)
    {
        // Skip text up to the next tag. Stop at the end of the markup as well, so that
        // unterminated markup cannot make the tokenizer run past the end of the string
        while (tokenizer->type != XMLToken::Type_LT && tokenizer->type != XMLToken::Type_EOF)
            ++tokenizer;
        // TODO: Save text content

        XMLTokenizer checkpoint{tokenizer};
        if (!xmlParse_Element(checkpoint, FORWARD(visitor)))
            break;

        tokenizer = checkpoint;
    }

    return true;
}
/* Parses one element, either self-closing (`<tag attrs/>`) or with content
 * (`<tag attrs>...</tag>`). Leading whitespace is skipped.
 *
 * `visitor.elementBegin(tag)` is called as soon as the tag name has been read, the attributes
 * and children are reported while they are parsed, and `visitor.elementEnd(tag)` is called once
 * the element is complete. Returns false as soon as the markup does not match; `tokenizer` may
 * already have been advanced and the visitor may already have been notified by then. */
template<typename VisitorT>
static bool xmlParse_Element(XMLTokenizer& tokenizer, VisitorT&& visitor)
{
    if (!tokenizer.read(XMLToken::Type_LT, true))
        return false;

    if (tokenizer->type != XMLToken::Type_Name)
        return false;

    std::string tag(tokenizer.begin(), tokenizer.end());
    visitor.elementBegin(tag);
    ++tokenizer;

    if (!xmlParse_ElementAttrs(tokenizer, FORWARD(visitor)))
        return false;

    if (tokenizer.read(XMLToken::Type_Slash, true))
    {
        // Self-closing element: the slash has to be followed by '>' right away
        if (!tokenizer.read(XMLToken::Type_GT))
            return false;
    }
    else
    {
        if (!tokenizer.read(XMLToken::Type_GT, true))
            return false;

        if (!xmlParse_ElementContent(tokenizer, FORWARD(visitor)))
            return false;

        if (!(tokenizer.read(XMLToken::Type_LT, true) && tokenizer.read(XMLToken::Type_Slash)))
            return false;

        // The closing name has to match the opening name exactly. Comparing both ranges
        // rejects names that merely start with the tag and never reads beyond the token
        if (tokenizer->type != XMLToken::Type_Name
            || !std::equal(tag.begin(), tag.end(), tokenizer.begin(), tokenizer.end()))
            return false;

        ++tokenizer;

        if (!tokenizer.read(XMLToken::Type_GT, true))
            return false;
    }

    visitor.elementEnd(tag);
    return true;
}
/* Parses a document, which here means a single element at the current position.
   The result of the element parser is discarded. */
template<typename VisitorT>
static void xmlParse_Document(XMLTokenizer& tokenizer, VisitorT&& visitor)
{
    static_cast<void>(xmlParse_Element(tokenizer, FORWARD(visitor)));
}
template<typename VisitorT>
static void xmlParse(char const* markup, VisitorT&& visitor)
{
XMLTokenizer tokenizer{markup};
xmlParse_Document(tokenizer, FORWARD(visitor));
}
/* Sample XML document that main() feeds to xmlParse(). It contains nested elements, elements
   with text content and self-closing elements with attributes. */
static char const sample[] =
"<data>"
" <country name=\"Liechtenstein\">"
" <rank>1</rank>"
" <year>2008</year>"
" <gdppc>141100</gdppc>"
" <neighbor name=\"Austria\" direction=\"E\"/>"
" <neighbor name=\"Switzerland\" direction=\"W\"/>"
" </country>"
" <country name=\"Singapore\">"
" <rank>4</rank>"
" <year>2011</year>"
" <gdppc>59900</gdppc>"
" <neighbor name=\"Malaysia\" direction=\"N\"/>"
" </country>"
" <country name=\"Panama\">"
" <rank>68</rank>"
" <year>2011</year>"
" <gdppc>13600</gdppc>"
" <neighbor name=\"Costa Rica\" direction=\"W\"/>"
" <neighbor name=\"Colombia\" direction=\"E\"/>"
" </country>"
"</data>";
/* Parses the sample document and prints its structure to stdout: `[tag` when an element
   starts, ` name=value` for each attribute and `]` when the element ends. */
int main()
{
    // Visitor that echoes every parse event
    struct EchoVisitor
    {
        void elementBegin(std::string const& tag)
        {
            printf("[%s", tag.c_str());
        }

        void elementAttr(std::string&& name, std::string&& val)
        {
            printf(" %s=%s", name.c_str(), val.c_str());
        }

        void elementEnd(std::string const& /* tag */)
        {
            printf("]");
        }
    };

    xmlParse(sample, EchoVisitor{});

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment