-
-
Save jcheng5/5d0cd021d5109631b6f2 to your computer and use it in GitHub Desktop.
sql string tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "sqldelim.hpp" | |
#include <assert.h> | |
#include <iostream> | |
// Builds a quote specification from a caller-chosen tag, the characters that
// open and close the quoted region, the escape character, and the doubling
// flag. The flag arrives as a char; any non-zero value turns doubling on.
QuoteSpec::QuoteSpec(int tag, char startChar, char endChar, char escapeChar, char doubleEscape)
    : tag(tag),
      startChar(startChar),
      endChar(endChar),
      escapeChar(escapeChar),
      doubleEscape(doubleEscape != 0) {
}
// Builds a comment specification from a caller-chosen tag, the strings that
// open and close the comment, and a flag saying whether the closing string
// must be present.
CommentSpec::CommentSpec(int tag, const std::string& startStr, const std::string& endStr,
                         bool endStrRequired)
    : tag(tag),
      startStr(startStr),
      endStr(endStr),
      endStrRequired(endStrRequired) {
}
namespace { | |
// Reports whether the character range [begin, end) starts with `prefix`.
//
// T must support subtraction and `begin + i` (random-access iterator or
// pointer). An empty prefix never matches. A range shorter than the prefix,
// or a reversed range (end before begin), yields false without reading any
// element.
template <class T>
bool hasPrefix(T begin, T end, const std::string& prefix) {
  const std::string::size_type needed = prefix.size();
  if (needed == 0) {
    return false;
  }
  // Check the sign before converting: comparing the signed distance directly
  // against an unsigned size would turn a negative distance into a huge
  // positive value and let the loop below read outside the range.
  if (end - begin < 0) {
    return false;
  }
  if (static_cast<std::string::size_type>(end - begin) < needed) {
    return false;
  }
  for (std::string::size_type i = 0; i < needed; ++i) {
    if (*(begin + i) != prefix[i]) {
      return false;
    }
  }
  return true;
}
template <class T> | |
bool findQuoteEnd(T begin, T end, const QuoteSpec& quoteSpec, T *regionEnd) { | |
T pos = begin; | |
// Assert that the quoted region starts with the character we | |
// expect. If not, we were called in error. | |
assert(*pos == quoteSpec.startChar); | |
pos++; | |
const int STATE_NORMAL = 0; | |
const int STATE_ESCAPE = 1; | |
const int STATE_DOUBLE = 2; | |
int state = STATE_NORMAL; | |
for (; pos != end; pos++) { | |
switch (state) { | |
case STATE_NORMAL: | |
if (*pos == quoteSpec.endChar) { | |
if (quoteSpec.doubleEscape) { | |
state = STATE_DOUBLE; | |
} else { | |
*regionEnd = pos + 1; | |
return true; | |
} | |
} else if (*pos == quoteSpec.escapeChar) { | |
state = STATE_ESCAPE; | |
} | |
break; | |
case STATE_ESCAPE: | |
state = STATE_NORMAL; | |
break; | |
case STATE_DOUBLE: | |
if (*pos == quoteSpec.endChar) { | |
state = STATE_NORMAL; | |
} else { | |
*regionEnd = pos; | |
return true; | |
} | |
break; | |
default: | |
assert(false); | |
return false; | |
} | |
} | |
if (state == STATE_DOUBLE) { | |
*regionEnd = end; | |
return true; | |
} | |
return false; | |
} | |
template <class T> | |
bool findCommentEnd(T begin, T end, const CommentSpec& commentSpec, T *regionEnd) { | |
for (; begin != end; begin++) { | |
if (hasPrefix(begin, end, commentSpec.endStr)) { | |
*regionEnd = begin + commentSpec.endStr.size(); | |
return true; | |
} | |
} | |
if (commentSpec.endStrRequired) { | |
return false; | |
} else { | |
*regionEnd = end; | |
return true; | |
} | |
} | |
size_t whichQuote(char c, const QuoteSpecs& quoteSpecs) { | |
for (size_t i = 0; i < quoteSpecs.size(); i++) { | |
if (c == quoteSpecs[i].startChar) { | |
return i; | |
} | |
} | |
return -1; | |
} | |
template <class T> | |
size_t whichComment(T begin, T end, const CommentSpecs& commentSpecs) { | |
for (size_t i = 0; i < commentSpecs.size(); i++) { | |
if (hasPrefix(begin, end, commentSpecs[i].startStr)) { | |
return i; | |
} | |
} | |
return -1; | |
} | |
} | |
// Splits `query` into its quoted and commented regions.
//
// The query is scanned left to right. At each position outside a region the
// character is first tested against every quote spec, then the remaining text
// against every comment spec; the first spec that matches opens a region.
// Each region found is recorded with the tag of the spec that matched, its
// start offset and its length (delimiters included), and scanning resumes
// right after it.
//
// Returns a successful ParseResult holding the regions in order of
// appearance, or a failed ParseResult with the message
// "Unterminated literal" / "Unterminated comment" and the offset at which
// the unterminated region starts.
ParseResult parseQuery(const std::string& query, const QuoteSpecs& quoteSpecs,
                       const CommentSpecs& commentSpecs) {
  // Value whichQuote/whichComment return when no spec matches.
  const size_t NOT_FOUND = static_cast<size_t>(-1);
  const std::string::const_iterator queryBegin = query.begin();
  const std::string::const_iterator queryEnd = query.end();

  Regions regions;
  for (std::string::const_iterator it = queryBegin; it != queryEnd; ++it) {
    int tag = 0;
    std::string::const_iterator regionEnd;

    const size_t qi = whichQuote(*it, quoteSpecs);
    if (qi != NOT_FOUND) {
      // It's a quoted region; find where it ends.
      if (!findQuoteEnd(it, queryEnd, quoteSpecs[qi], &regionEnd)) {
        return ParseResult("Unterminated literal",
                           static_cast<size_t>(it - queryBegin));
      }
      tag = quoteSpecs[qi].tag;
    } else {
      // Only look for a comment when this position did not open a quote, so
      // the closing character of a quoted region is never re-examined.
      const size_t ci = whichComment(it, queryEnd, commentSpecs);
      if (ci == NOT_FOUND) {
        continue;
      }
      // Search for the terminator after the opening delimiter; otherwise the
      // two can overlap and "/*/" would be accepted as a complete comment.
      // whichComment has verified the delimiter is present, so the offset
      // stays within the query.
      const std::string::const_iterator bodyBegin =
          it + commentSpecs[ci].startStr.size();
      if (!findCommentEnd(bodyBegin, queryEnd, commentSpecs[ci], &regionEnd)) {
        return ParseResult("Unterminated comment",
                           static_cast<size_t>(it - queryBegin));
      }
      tag = commentSpecs[ci].tag;
    }

    Region region;
    region.tag = tag;
    region.startOffset = static_cast<int>(it - queryBegin);
    region.length = static_cast<int>(regionEnd - it);
    regions.push_back(region);

    // Leave `it` on the region's last character; the loop increment then
    // moves to the first character after the region.
    it += region.length - 1;
  }
  return ParseResult(regions);
}
int main() { | |
QuoteSpecs quoteSpecs; | |
quoteSpecs.push_back(QuoteSpec(1, '\'', '\'', '\\', true)); | |
quoteSpecs.push_back(QuoteSpec(2, '"', '"', '\\', true)); | |
CommentSpecs commentSpecs; | |
commentSpecs.push_back(CommentSpec(3, "/*", "*/", true)); | |
commentSpecs.push_back(CommentSpec(4, "--", "\n", false)); | |
commentSpecs.push_back(CommentSpec(5, "#", "\n", false)); | |
while (true) { | |
std::string query; | |
std::cout << "Query: "; | |
std::getline(std::cin, query); | |
if (query.size() == 0) { | |
return 0; | |
} | |
ParseResult result = parseQuery(query, quoteSpecs, commentSpecs); | |
if (!result.success) { | |
std::cout << "ERROR: " << result.errorMessage << " : " << result.errorOffset << "\n"; | |
} else { | |
Regions regions = result.regions; | |
for (Regions::const_iterator it = regions.begin(); | |
it != regions.end(); | |
it++) { | |
std::cout << "Region: " << query.substr(it->startOffset, it->length) << "\n"; | |
} | |
} | |
} | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <vector> | |
// Describes one kind of quoted region.
struct QuoteSpec {
  // Caller-chosen identifier for this kind of region.
  int tag;
  // Character that opens the region.
  char startChar;
  // Character that closes the region.
  char endChar;
  // Escape character inside the region.
  char escapeChar;
  // Doubling flag for endChar.
  bool doubleEscape;

  // Note: the doubling flag is taken as a char here although the member is a bool.
  QuoteSpec(int tag, char startChar, char endChar, char escapeChar, char doubleEscape);
};
// Describes one kind of comment.
struct CommentSpec {
  // Caller-chosen identifier for this kind of region.
  int tag;
  // Text that opens the comment.
  std::string startStr;
  // Text that closes the comment.
  std::string endStr;
  // Whether the closing text must be present.
  bool endStrRequired;

  CommentSpec(int tag, const std::string& startStr, const std::string& endStr, bool endStrRequired);
};
// One region of a query, identified by a tag and a span of characters.
// Plain aggregate: members are not initialized unless the caller sets them.
struct Region {
  // Identifier associated with this region.
  int tag;
  // Offset of the region's first character.
  int startOffset;
  // Number of characters in the region.
  int length;
};
typedef std::vector<QuoteSpec> QuoteSpecs; | |
typedef std::vector<CommentSpec> CommentSpecs; | |
typedef std::vector<Region> Regions; | |
struct ParseResult { | |
bool success; | |
Regions regions; | |
std::string errorMessage; | |
size_t errorOffset; | |
ParseResult(const std::string& error, size_t pos) : | |
success(false), errorMessage(error), errorOffset(pos) { | |
} | |
ParseResult(const Regions& regions) : | |
success(true), regions(regions) { | |
} | |
}; | |
// Parses `query` into regions according to the given quote and comment
// specifications. The comment list defaults to empty, so two-argument calls
// keep compiling and resolve to the same three-parameter function.
ParseResult parseQuery(const std::string& query, const QuoteSpecs& quoteSpecs,
                       const CommentSpecs& commentSpecs = CommentSpecs());
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment