Skip to content

Instantly share code, notes, and snippets.

@cpdt
Last active February 6, 2020 11:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cpdt/5a05c3db8b7a91961b506136fd1f77b0 to your computer and use it in GitHub Desktop.
Save cpdt/5a05c3db8b7a91961b506136fd1f77b0 to your computer and use it in GitHub Desktop.
A simple and pretty fast "flat INI" parser - for when you just need to read a list of key/value pairs that can have escape sequences
#include "flatparser.h"
// Parsing utils
// Returns true for the two characters treated as horizontal whitespace: space and tab.
// Newlines are deliberately excluded because they terminate a line.
static bool is_whitespace(char c) {
    switch (c) {
        case ' ':
        case '\t':
            return true;
        default:
            return false;
    }
}
// Returns true when `text` points at a line terminator: the NUL that ends the
// buffer, a bare LF, or the CR of a CRLF pair. A CR that is not followed by LF
// is treated as an ordinary character.
static bool is_end_of_line(const char *text) {
    switch (text[0]) {
        case '\0':
        case '\n':
            return true;
        case '\r':
            // Only peek at the next character once we know the current one is not NUL.
            return text[1] == '\n';
        default:
            return false;
    }
}
// Advances past any run of spaces/tabs starting at `text` and returns a pointer
// to the first character that is not whitespace (possibly the terminating NUL).
static const char* skip_whitespace(const char *text) {
    const char* cursor = text;
    for (; is_whitespace(*cursor); ++cursor) {
        // nothing to do - the loop header performs the scan
    }
    return cursor;
}
// Scans forward from `text` and returns a pointer to the first position that
// is_end_of_line() accepts (LF, the CR of a CRLF pair, or the terminating NUL).
static const char* find_end_of_line(const char *text) {
    const char* cursor = text;
    for (; !is_end_of_line(cursor); ++cursor) {
        // nothing to do - the loop header performs the scan
    }
    return cursor;
}
// Parses exactly `num_digits` hexadecimal digits (upper or lower case) starting
// at `text` into `out`, most significant digit first.
//
// Returns true when every digit was valid. On the first invalid character
// (including the NUL terminator) parsing stops and false is returned; `out`
// then holds the value accumulated from the digits read so far.
static bool read_hex(const char* text, size_t num_digits, unsigned& out) {
    out = 0;
    for (size_t remaining = num_digits; remaining > 0; --remaining, ++text) {
        const char c = *text;
        unsigned nibble;
        if ('0' <= c && c <= '9') {
            nibble = static_cast<unsigned>(c - '0');
        } else if ('A' <= c && c <= 'F') {
            nibble = static_cast<unsigned>(c - 'A' + 10);
        } else if ('a' <= c && c <= 'f') {
            nibble = static_cast<unsigned>(c - 'a' + 10);
        } else {
            return false;
        }
        // The low four bits are zero after the shift, so OR-ing is the same as adding.
        out = (out << 4) | nibble;
    }
    return true;
}
// Appends the UTF-8 encoding of the code point `ch` to `out`.
// Layout follows the table at https://en.wikipedia.org/wiki/UTF-8#description :
//   below 0x80     -> 1 byte   0xxxxxxx
//   below 0x800    -> 2 bytes  110xxxxx 10xxxxxx
//   below 0x10000  -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
//   otherwise      -> 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// Only the low 21 bits of `ch` are used in the 4-byte form. No validation of
// the code point (range, surrogates) is performed.
static void write_utf8_char(unsigned ch, std::string& out) {
    // Builds a continuation byte (10xxxxxx) from the six bits starting at `shift`.
    const auto continuation = [ch](unsigned shift) {
        return static_cast<char>(0x80u + ((ch >> shift) & 0x3Fu));
    };

    if (ch < 0x80u) {
        out.push_back(static_cast<char>(ch));
        return;
    }
    if (ch < 0x800u) {
        out.push_back(static_cast<char>(0xC0u + ((ch >> 6) & 0x1Fu)));
        out.push_back(continuation(0));
        return;
    }
    if (ch < 0x10000u) {
        out.push_back(static_cast<char>(0xE0u + ((ch >> 12) & 0x0Fu)));
        out.push_back(continuation(6));
        out.push_back(continuation(0));
        return;
    }
    out.push_back(static_cast<char>(0xF0u + ((ch >> 18) & 0x07u)));
    out.push_back(continuation(12));
    out.push_back(continuation(6));
    out.push_back(continuation(0));
}
// Decodes one escape sequence and appends the result to `out`.
//
// `text` must point at the character immediately after the backslash. The
// return value points at the first character that follows the escape sequence.
//
// Supported sequences:
//   \b \f \n \r \t   the usual control characters
//   \<LF>, \<CR><LF> line continuation: consumes the line break, emits nothing
//   \xHH             code point given by two hex digits, emitted as UTF-8
//   \uHHHH           code point given by four hex digits, emitted as UTF-8
//   anything else    the character itself (so \\ \" \= \; produce the literal)
//
// If the hex digits of \x or \u are malformed, the 'x'/'u' is consumed, nothing
// is emitted, and the following characters are parsed as ordinary text.
//
// If the backslash is the last character of the input, nothing is emitted and
// the returned pointer refers to the terminating NUL (never past it).
static const char* process_escape_sequence(const char* text, std::string& out) {
    const char id_char = *text;
    if (id_char == 0) {
        // Dangling backslash at end of input. Do NOT advance: stepping over the
        // terminator would make the caller keep scanning past the buffer.
        return text;
    }
    text++;

    switch (id_char) {
        case 'b':
            out.push_back('\b');
            break;
        case 'f':
            out.push_back('\f');
            break;
        case 'n':
            out.push_back('\n');
            break;
        case 'r':
            out.push_back('\r');
            break;
        case 't':
            out.push_back('\t');
            break;
        case '\n':
            // Line continuation: produce no characters but consume the newline
            break;
        case '\r':
            if (*text == '\n') {
                // CRLF line continuation: consume both characters, matching the
                // line endings that is_end_of_line() recognises.
                text++;
            } else {
                // A lone CR is not a line ending here; keep it as a literal.
                out.push_back(id_char);
            }
            break;
        case 'x': {
            // Hexadecimal escape sequence - the next 2 characters are the character in base 16
            unsigned char_code;
            if (!read_hex(text, 2, char_code)) {
                break;
            }
            write_utf8_char(char_code, out);
            text += 2;
            break;
        }
        case 'u': {
            // Unicode escape sequence - the next 4 characters are the character in base 16
            unsigned char_code;
            if (!read_hex(text, 4, char_code)) {
                break;
            }
            write_utf8_char(char_code, out);
            text += 4;
            break;
        }
        default:
            // Unknown escapes stand for the escaped character itself.
            out.push_back(id_char);
            break;
    }
    return text;
}
// Reads text starting at `text` into `out`, decoding backslash escape sequences
// along the way.
//
// Reading stops at the first unescaped position that is either an end of line
// or for which `is_ended` returns true. The returned pointer refers to that
// stop position; the stop character itself is not consumed or copied.
static const char* read_escapable(const char* text, std::string& out, bool (&is_ended)(const char*)) {
    for (;;) {
        // Scan one literal chunk: everything up to a backslash or a stop position.
        const char* cursor = text;
        for (; *cursor != '\\'; ++cursor) {
            if (is_end_of_line(cursor) || is_ended(cursor)) {
                break;
            }
        }

        // Copy the chunk in one go rather than character by character.
        out.append(text, static_cast<size_t>(cursor - text));

        if (*cursor != '\\') {
            // Stopped on a terminator rather than an escape - we are done.
            return cursor;
        }

        // Decode the escape and resume scanning right after it.
        text = process_escape_sequence(cursor + 1, out);
    }
}
// Terminator predicate for keys: the key is over once only whitespace separates
// `text` from an '='. This keeps trailing whitespace out of the key.
static bool has_key_ended(const char* text) {
    const char* next_visible = skip_whitespace(text);
    return next_visible[0] == '=';
}
// Terminator predicate for quoted values: the value is over at the closing
// double quote.
static bool has_quoted_value_ended(const char* text) {
    const char closing_quote = '"';
    return text[0] == closing_quote;
}
// Terminator predicate for unquoted values: the value is over once only
// whitespace remains before the end of the line. This trims trailing
// whitespace from the value.
static bool has_unqoted_value_ended(const char* text) {
    return is_end_of_line(skip_whitespace(text));
}
// Parses a single line beginning at `text` and, if it is a well-formed
// key/value pair, reports it through `cb` (passing `ctx` along unchanged).
//
// A line is one of:
//   ; comment              - ignored
//   key="quoted value"     - value runs up to the closing quote
//   key=unquoted value     - value runs to the end of the line, trailing whitespace trimmed
// Escape sequences are decoded in both keys and values.
//
// Returns a pointer to where parsing stopped, which is somewhere within the
// current line; the caller is responsible for advancing to the next line.
static const char* parse_line(const char* text, OnRowCb& cb, void* ctx) {
    // Leading indentation is not significant.
    const char* cursor = skip_whitespace(text);

    if (*cursor == ';') {
        // Comment line - nothing to report.
        return cursor;
    }

    // The key runs until the '=' (ignoring whitespace right before it).
    std::string key;
    cursor = read_escapable(cursor, key, has_key_ended);
    cursor = skip_whitespace(cursor);

    if (*cursor != '=') {
        // We hit a newline or the end of input before finding '=', so this
        // line has no value and cannot be reported.
        return cursor;
    }

    // Step over the '=' and any whitespace in front of the value.
    cursor = skip_whitespace(cursor + 1);

    std::string value;
    const bool is_quoted = (*cursor == '"');
    if (is_quoted) {
        // Skip the opening quote; the closing quote ends the value.
        cursor = read_escapable(cursor + 1, value, has_quoted_value_ended);
    } else {
        cursor = read_escapable(cursor, value, has_unqoted_value_ended);
    }

    // Hand ownership of both strings to the callback.
    cb(ctx, std::move(key), std::move(value));
    return cursor;
}
// Parses the NUL-terminated buffer `text` one line at a time, invoking `cb`
// with `ctx` for every key/value pair found. Comment lines and lines without
// an '=' produce no callback.
void parse_flat_ini(const char *text, OnRowCb& cb, void* ctx) {
    const char* cursor = text;
    while (*cursor != '\0') {
        // Parse whatever this line holds, then move to its terminator.
        cursor = find_end_of_line(parse_line(cursor, cb, ctx));
        if (*cursor == '\0') {
            break;
        }
        // Step over one terminator character to reach the next line.
        ++cursor;
    }
}
#pragma once
#include <string>
// Signature of the function called for every key/value pair the parser finds.
//   ctx   - the pointer that was given to parse_flat_ini, passed through unchanged
//   key   - the key with escape sequences already decoded
//   value - the value with escape sequences already decoded
// Both strings are passed as rvalues, so the callback may move from them.
using OnRowCb = void (void* ctx, std::string&& key, std::string&& value);
// Parses the NUL-terminated "flat INI" buffer `text` line by line and calls `cb`
// with `ctx` once for each key/value line. Lines starting with ';' are comments;
// lines that contain no '=' are skipped.
void parse_flat_ini(const char* text, OnRowCb& cb, void* ctx);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment