// GitHub gist ktf/367350 by @ktf, created April 15, 2010 16:52.
// Simple SAX-like parser for XML.
/* A simple SAX-like parser.
And yes, I know the S in SAX stands for Simple.
Copyright 2010 Giulio Eulisse. All rights reserved.
Licensed under GPLv3 license.
TODO: incomplete support for entities.
TODO: no support for DTD nor <?xml> preamble.
*/
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
bool
fgettoken(std::istream &in, char **buffer, size_t *maxSize, const char *separators,
          int *firstChar);

/** A simple SAX parser which is able to parse the configuration.
    State machine for the parser can be drawn by copying and pasting the
    following to graphviz:
    digraph {
    IN_DOCUMENT->IN_BEGIN_TAG [label="nextChar == '<'"];
    IN_DOCUMENT->IN_DATA [label="nextChar != '<'"];
    IN_BEGIN_TAG->IN_BEGIN_ELEMENT [label="nextChar starts a name"];
    IN_BEGIN_TAG->IN_END_ELEMENT [label= "nextChar == '/'"];
    IN_BEGIN_ELEMENT->IN_END_ELEMENT [label="nextChar == '/'"];
    IN_BEGIN_ELEMENT->IN_ELEMENT_WHITESPACE [label="nextChar == ' '"];
    IN_BEGIN_ELEMENT->IN_END_TAG [label="nextChar == '>'"];
    IN_ELEMENT_WHITESPACE->IN_ELEMENT_WHITESPACE [ label = "nextChar == \"\\ \\t\\n\""]
    IN_ELEMENT_WHITESPACE->IN_ATTRIBUTE_KEY [ label = "nextChar starts a name"]
    IN_ELEMENT_WHITESPACE->IN_END_ELEMENT [label="nextChar == '/'"]
    IN_END_ELEMENT->IN_END_TAG [label = "nextChar == '>'"];
    IN_END_TAG->IN_BEGIN_TAG [label="nextChar == '<'"];
    IN_END_TAG->IN_DATA [label="nextChar != '<'"]
    IN_DATA->IN_BEGIN_TAG [label="nextChar == '<'"];
    IN_DATA->IN_DATA_ENTITY [label="nextChar == '&'"];
    IN_DATA->IN_DONE [label = "nextChar == EOF"];
    IN_DATA_ENTITY->IN_DATA [label="nextChar == ';'"];
    IN_ATTRIBUTE_KEY->IN_BEGIN_ATTRIBUTE_VALUE [label = "nextChar == '='"]
    IN_BEGIN_ATTRIBUTE_VALUE->IN_STRING [label = "nextChar == '\"' || nextChar == '\'' "]
    IN_STRING->IN_END_ATTRIBUTE_VALUE [label = "nextChar == quote"]
    IN_STRING->IN_STRING_ENTITY [label = "nextChar == '&'"]
    IN_END_ATTRIBUTE_VALUE->IN_ELEMENT_WHITESPACE [label = "nextChar == ' '"]
    IN_END_ATTRIBUTE_VALUE->IN_END_ELEMENT [label = "nextChar == '/'"]
    IN_END_ATTRIBUTE_VALUE->IN_END_TAG [label = "nextChar == '>'"]
    IN_STRING_ENTITY->IN_STRING [label = "nextChar == ';'"]
    }

    To do something useful, derive from this class and override
    startElement(), endElement() and data().
*/
class SimpleSAXParser
{
public:
  /** A key="value" pair read from an opening tag.
      Attributes compare by key only, which is what parse() relies on to
      keep the attribute list sorted and to detect duplicates.
  */
  struct Attribute
  {
    std::string key;
    std::string value;

    Attribute(const std::string &iKey, const std::string &iValue)
      : key(iKey), value(iValue)
    {}

    bool operator<(const Attribute &attribute) const
    {
      return key < attribute.key;
    }
  };

  typedef std::vector<Attribute> Attributes;

  /** Exception thrown by the parser; carries a human readable message. */
  class ParserError
  {
  public:
    ParserError(const std::string &error)
      : m_error(error)
    {}

    /** @return the message as a C string, valid as long as this object is.
        Const so that it can be called on an error caught by const reference.
    */
    const char *error() const { return m_error.c_str(); }

  private:
    std::string m_error;
  };

  /** States of the parser; see the graph in the class comment.
      NOTE: debug_state_machine() indexes a table of names with these values,
      keep the two in the same order.
  */
  enum PARSER_STATES {
    IN_DOCUMENT,
    IN_BEGIN_TAG,
    IN_DONE,
    IN_BEGIN_ELEMENT,
    IN_ELEMENT_WHITESPACE,
    IN_END_ELEMENT,
    IN_ATTRIBUTE_KEY,
    IN_END_TAG,
    IN_DATA,
    IN_BEGIN_ATTRIBUTE_VALUE,
    IN_STRING,
    IN_END_ATTRIBUTE_VALUE,
    IN_STRING_ENTITY,
    IN_DATA_ENTITY
  };

  /** Prepare to parse @a f. The stream is kept by reference and its first
      character is read immediately as lookahead.
      Throws ParserError if the token buffer cannot be allocated.
  */
  SimpleSAXParser(std::istream &f)
    : m_in(f),
      m_bufferSize(1024),
      m_buffer(static_cast<char *>(malloc(m_bufferSize))),
      m_nextChar(m_in.get())
  {
    if (!m_buffer)
      throw ParserError("Unable to allocate the token buffer");
  }

  virtual ~SimpleSAXParser();

  void parse(void);

  /** Called by parse() once the opening tag of an element has been read;
      the attributes are sorted by key. Default implementation does nothing.
  */
  virtual void startElement(const std::string &/*name*/,
                            Attributes &/*attributes*/) {}
  /** Called by parse() when an element is closed. Does nothing by default. */
  virtual void endElement(const std::string &/*name*/) {}
  /** Called by parse() with the text accumulated between two tags, entities
      already replaced. Does nothing by default.
  */
  virtual void data(const std::string &/*data*/) {}

private:
  // The parser owns a malloc'ed buffer which fgettoken() may realloc, so a
  // copy would end up with a dangling pointer. Declared, never defined.
  SimpleSAXParser(const SimpleSAXParser &);
  SimpleSAXParser &operator=(const SimpleSAXParser &);

  std::string parseEntity(const std::string &entity);

  /** Read characters up to (not including) any of the characters in
      @a delim. The delimiter is left as the lookahead character.
  */
  std::string getToken(const char *delim)
  {
    fgettoken(m_in, &m_buffer, &m_bufferSize, delim, &m_nextChar);
    // fgettoken() may leave the buffer NULL when it runs out of memory.
    if (!m_buffer)
      throw ParserError("Out of memory while reading a token");
    return m_buffer;
  }

  /** Read characters up to @a delim and consume the delimiter as well, so
      that the lookahead is the character following it.
  */
  std::string getToken(const char delim)
  {
    char buf[2] = {delim, 0};
    fgettoken(m_in, &m_buffer, &m_bufferSize, buf, &m_nextChar);
    if (!m_buffer)
      throw ParserError("Out of memory while reading a token");
    m_nextChar = m_in.get();
    return m_buffer;
  }

  /** Consume the lookahead character if it is @a c.
      @return true if the character was consumed.
  */
  bool skipChar(int c)
  {
    if (m_nextChar != c)
      return false;
    m_nextChar = m_in.get();
    return true;
  }

  /** @return the lookahead character (EOF at end of input). */
  int nextChar(void) { return m_nextChar; }

  std::istream &m_in;
  size_t m_bufferSize;      // Current size of m_buffer, updated by fgettoken().
  char *m_buffer;           // Token buffer, allocated with malloc().
  int m_nextChar;           // One character of lookahead.
  std::vector<std::string> m_elementTags;  // Names of the open elements.
  Attributes m_attributes;  // Attributes of the element being read.
};
// NOTE: put in a .cc if this file is used in more than one place.
/** Helper function to handle entities, i.e. characters specified with
    the "&label;" syntax.
    Only the five entities predefined by XML are known: quot, apos, amp,
    lt and gt.
    @a entity the name of the entity, without the leading '&' and the
    trailing ';'.
    @return the text the entity stands for.
    Throws ParserError for any other name.
*/
std::string
SimpleSAXParser::parseEntity(const std::string &entity)
{
  if (entity == "quot")
    return "\"";
  if (entity == "apos")
    return "'";
  if (entity == "amp")
    return "&";
  if (entity == "lt")
    return "<";
  if (entity == "gt")
    return ">";
  throw ParserError("Unknown entity " + entity);
}
/** Print the name of @a state on std::cerr.
    Does nothing unless SIMPLE_SAX_PARSER_DEBUG is defined at compile time.
    The table of names must list the states in the same order as the
    SimpleSAXParser::PARSER_STATES enum.
*/
void
debug_state_machine(enum SimpleSAXParser::PARSER_STATES state)
{
#ifdef SIMPLE_SAX_PARSER_DEBUG
  // String literals are const: binding them to plain char * is not allowed
  // in C++11 and later.
  static const char *const debug_states[] = {
    "IN_DOCUMENT",
    "IN_BEGIN_TAG",
    "IN_DONE",
    "IN_BEGIN_ELEMENT",
    "IN_ELEMENT_WHITESPACE",
    "IN_END_ELEMENT",
    "IN_ATTRIBUTE_KEY",
    "IN_END_TAG",
    "IN_DATA",
    "IN_BEGIN_ATTRIBUTE_VALUE",
    "IN_STRING",
    "IN_END_ATTRIBUTE_VALUE",
    "IN_STRING_ENTITY",
    "IN_DATA_ENTITY"
  };
  const size_t known = sizeof(debug_states) / sizeof(debug_states[0]);
  // Never index past the table, even if the enum grows without it.
  if (static_cast<size_t>(state) < known)
    std::cerr << debug_states[state] << std::endl;
  else
    std::cerr << "UNKNOWN_STATE(" << static_cast<int>(state) << ")" << std::endl;
#else
  (void) state; // Unused when debugging is disabled.
#endif
}
/** Runs the state machine of the parser, invoking startElement(),
setAttribute(), endElement(), data() virtual methods as approppriate.
In order have the parser doing something usefull you need to derive from
it and specialize the above mentioned virtual methods.
Default implementation is in any case useful to check syntax.
*/
void
SimpleSAXParser::parse(void)
{
enum PARSER_STATES state = IN_DOCUMENT;
// Current delimiters for strings in attributes.
char stringDelims[] = "\"&";
std::string attributeName;
std::string attributeValue;
std::string tmp;
std::string currentData;
while (state != IN_DONE)
{
debug_state_machine(state);
switch(state)
{
// FIXME: IN_DOCUMENT should check the dtd...
case IN_DOCUMENT:
state = IN_DATA;
if (skipChar('<'))
state = IN_BEGIN_TAG;
break;
case IN_BEGIN_TAG:
if (nextChar() >= 'A' && nextChar() <= 'z')
state = IN_BEGIN_ELEMENT;
else if (skipChar('/'))
state = IN_END_ELEMENT;
else
throw ParserError("Bad tag");
break;
case IN_BEGIN_ELEMENT:
m_attributes.clear();
m_elementTags.push_back(getToken(" />"));
if (nextChar() == ' ')
state = IN_ELEMENT_WHITESPACE;
else if (skipChar('/'))
state = IN_END_ELEMENT;
else if (skipChar('>'))
{
startElement(m_elementTags.back(), m_attributes);
state = IN_END_TAG;
}
else
throw ParserError("Bad element.");
break;
case IN_ELEMENT_WHITESPACE:
while(skipChar(' ') || skipChar('\n') || skipChar('\t'))
{}
if (nextChar() >= 'A' && nextChar() <= 'z')
state=IN_ATTRIBUTE_KEY;
else if (nextChar() == '/')
state = IN_END_ELEMENT;
else
throw ParserError("Syntax error in element" + m_elementTags.back());
break;
case IN_ATTRIBUTE_KEY:
attributeName = getToken('=');
state = IN_BEGIN_ATTRIBUTE_VALUE;
break;
case IN_BEGIN_ATTRIBUTE_VALUE:
if (skipChar('"'))
{
state = IN_STRING;
attributeValue.clear();
stringDelims[0] = '\"';
}
else if (skipChar('\''))
{
state = IN_STRING;
attributeValue.clear();
stringDelims[0] = '\'';
}
else
throw ParserError("Expecting quotes.");
break;
case IN_STRING:
attributeValue += getToken(stringDelims);
if (skipChar(stringDelims[0]))
{
// Save the attributes in order, replacing those that are
// specified more than once.
Attribute attr(attributeName, attributeValue);
Attributes::iterator i = std::lower_bound(m_attributes.begin(),
m_attributes.end(),
attr);
if (i != m_attributes.end() && i->key == attr.key)
throw ParserError("Attribute " + i->key + " defined more than once");
m_attributes.insert(i, attr);
state = IN_END_ATTRIBUTE_VALUE;
}
else if (skipChar(stringDelims[1]))
state = IN_STRING_ENTITY;
else
throw ParserError("Unexpected end of input at " + attributeValue);
break;
case IN_END_ATTRIBUTE_VALUE:
getToken(" />");
if (nextChar() == ' ')
state = IN_ELEMENT_WHITESPACE;
else if (skipChar('/'))
state = IN_END_ELEMENT;
else if (skipChar('>'))
{
startElement(m_elementTags.back(), m_attributes);
state = IN_END_TAG;
}
break;
case IN_END_ELEMENT:
tmp = getToken('>');
if (!tmp.empty() && tmp != m_elementTags.back())
throw ParserError("Non-matching closing element "
+ tmp + " for " + attributeValue);
endElement(tmp);
m_elementTags.pop_back();
state = IN_END_TAG;
break;
case IN_END_TAG:
if (nextChar() == EOF)
return;
else if (skipChar('<'))
state = IN_BEGIN_TAG;
else
state = IN_DATA;
break;
case IN_DATA:
currentData += getToken("<&");
if (skipChar('&'))
state = IN_DATA_ENTITY;
else if (skipChar('<'))
{
data(currentData);
currentData.clear();
state = IN_BEGIN_TAG;
}
else if (nextChar() == EOF)
{
data(currentData);
return;
}
else
throw ParserError("Unexpected end of input in element " + m_elementTags.back() + currentData);
break;
case IN_DATA_ENTITY:
currentData += parseEntity(getToken(';'));
state = IN_DATA;
break;
case IN_STRING_ENTITY:
attributeValue += parseEntity(getToken(';'));
state = IN_STRING;
break;
case IN_DONE:
return;
}
}
}
/** Releases the token buffer, which the constructor allocates with malloc()
    and fgettoken() may have resized with realloc().
*/
SimpleSAXParser::~SimpleSAXParser()
{
  std::free(m_buffer);
}
/** Tell whether @a c is one of the characters of @a separators.
    As in strchr(), the terminating 0 of @a separators matches too, so a 0
    byte is always considered a separator.
*/
static bool
isTokenSeparator(const char *separators, int c)
{
  if (static_cast<int>(separators[0]) == c)
    return true;
  // Only look past the first separator if there is one, otherwise
  // separators + 1 would point past the end of an empty string.
  return separators[0] != '\0' && std::strchr(separators + 1, c) != 0;
}

/** Helper function which gets a token delimited by any of the characters in
    @a separators from the stream @a in and writes it, 0 terminated, in the
    buffer found in @a buffer.
    Notice that if the token is larger than @a maxSize, the buffer is
    reallocated and @a maxSize is updated to the new size.
    The trailing separator is not put in the token: it is stored in
    @a firstChar, ready to be used as first character of the next token.

    @a in the input stream to be parsed.
    @a buffer a pointer to the buffer where to put the tokens. It must have
    been obtained from malloc() / realloc(), or be NULL, in which case it is
    allocated here.
    @a maxSize a pointer to the size of the buffer. Notice that in case the
    buffer is reallocated to have more space, maxSize is updated with the new
    size.
    @a separators the characters which end a token.
    @a firstChar on input, the first character of the token: notice that it
    must have been read from the stream by the caller! If it is EOF or a
    separator the token is empty and nothing is read from the stream.
    On output, the separator (or EOF) which ended the token.
    @return true if a (possibly empty) token ended by a separator was read;
    false if the end of the stream was reached first, in which case the
    buffer contains what was read, or if memory could not be allocated, in
    which case the buffer is still valid and 0 terminated, the token is
    truncated and @a firstChar is left untouched.
*/
bool
fgettoken(std::istream &in, char **buffer, size_t *maxSize, const char *separators,
          int *firstChar)
{
  // Make sure there is room for at least the terminator.
  if (!*buffer || *maxSize == 0)
  {
    char *fresh = static_cast<char *>(std::realloc(*buffer, 1024));
    if (!fresh)
      return false;
    *buffer = fresh;
    *maxSize = 1024;
  }

  // If the passed first character is EOF or a separator, return an empty
  // token, otherwise use it as first character of the buffer.
  if (*firstChar == EOF || isTokenSeparator(separators, *firstChar))
  {
    (*buffer)[0] = 0;
    return true;
  }
  (*buffer)[0] = static_cast<char>(*firstChar);

  size_t i = 1;
  while (true)
  {
    if (i >= *maxSize)
    {
      // Keep the old pointer and size until realloc() is known to have
      // worked: on failure the old block is still allocated and owned by
      // the caller.
      char *grown = static_cast<char *>(std::realloc(*buffer, *maxSize + 1024));
      if (!grown)
      {
        (*buffer)[*maxSize - 1] = 0;
        return false;
      }
      *buffer = grown;
      *maxSize += 1024;
    }

    const int c = in.get();
    if (c == EOF || isTokenSeparator(separators, c))
    {
      (*buffer)[i] = 0;
      *firstChar = c;
      return c != EOF;
    }
    (*buffer)[i++] = static_cast<char>(c);
  }
}
// End of gist (GitHub page footer removed).