Skip to content

Instantly share code, notes, and snippets.

@klemens-morgenstern
Created June 8, 2019 07:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save klemens-morgenstern/cd3b1ea42e0d8438b6ae11a31cf3d268 to your computer and use it in GitHub Desktop.
Save klemens-morgenstern/cd3b1ea42e0d8438b6ae11a31cf3d268 to your computer and use it in GitHub Desktop.
ctlex
#ifndef CTLEX_HPP
#define CTLEX_HPP
#include <ctre.hpp>
#include <algorithm>
#include <array>
#include <cstddef>
#include <iterator>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <tuple>
#include <utility>
namespace ctlex
{
/// Exception type of the lexer.
///
/// It can be constructed from everything std::runtime_error accepts and,
/// in addition, from a std::string_view.
class lexer_error : public std::runtime_error
{
public:
    // Make the std::runtime_error constructors available on lexer_error.
    using std::runtime_error::runtime_error;

    /// Builds the error from a string view; the viewed characters are copied
    /// into the exception, so the view does not need to outlive it.
    lexer_error(const std::string_view & sv)
        : std::runtime_error(std::string(sv))
    {
    }
};
/// Compile-time description of one token kind: an identifier, the regular
/// expression that recognises it and a flag telling the lexer to skip matches.
///
/// @tparam SizeId     extent of the character array the id is constructed from
/// @tparam SizeRegex  extent of the character array the regex is constructed from
template<std::size_t SizeId, std::size_t SizeRegex>
struct token
{
    ctll::fixed_string<SizeId> id;        ///< name of the token
    ctll::fixed_string<SizeRegex> regex;  ///< pattern matching the token
    bool ignore = false;                  ///< true if matches are to be skipped

    /// The id as a view over its UTF-32 characters.
    constexpr std::u32string_view id_view() const {return {id.begin(), id.size()};}

    constexpr token(const token & tk) noexcept : id(tk.id), regex(tk.regex), ignore(tk.ignore) {}
    constexpr token& operator=(const token & tk) noexcept
    {
        id = tk.id;
        regex = tk.regex;
        ignore = tk.ignore;
        return *this;
    }

    /// Creates a regular (not ignored) token from two character arrays.
    // The mem-initializers follow the declaration order of the members,
    // which is the order the members are initialised in anyway.
    template<typename IdChar, typename RegexChar>
    constexpr token(const IdChar (&id)[SizeId], const RegexChar (&regex)[SizeRegex]) noexcept
        : id(id), regex(regex)
    {
    }

    /// Creates an ignored token; pass std::ignore as the third argument.
    template<typename IdChar, typename RegexChar>
    constexpr token(const IdChar (&id)[SizeId], const RegexChar (&regex)[SizeRegex], decltype(std::ignore)) noexcept
        : id(id), regex(regex), ignore(true)
    {
    }

    // Tokens are ordered and compared by their id only; regex and ignore flag
    // do not take part in the comparison.
    template<std::size_t RhsSizeId, std::size_t RhsSizeRegex>
    constexpr bool operator<(const token<RhsSizeId, RhsSizeRegex> & rhs) const {return this->id < rhs.id;}
    template<std::size_t RhsSizeId, std::size_t RhsSizeRegex>
    constexpr bool operator==(const token<RhsSizeId, RhsSizeRegex> & rhs) const {return this->id == rhs.id;}
    template<std::size_t RhsSizeId, std::size_t RhsSizeRegex>
    constexpr bool operator>(const token<RhsSizeId, RhsSizeRegex> & rhs) const {return this->id > rhs.id;}
};
/// Empty tag type. It is not referenced anywhere else in the visible part of
/// this file. NOTE(review): presumably meant to mark an unrecognised token --
/// confirm the intended use.
struct invalid_token_tag{};
/// One lexed token: the id of the token kind that matched plus the matched
/// part of the input.
///
/// @tparam char_type  character type of the lexed input
template<typename char_type>
struct token_result
{
    std::u32string_view id;                   ///< id of the matching token kind
    std::basic_string_view<char_type> value;  ///< matched slice of the input

    /// True if this result was produced by the token kind named Id,
    /// e.g. `result.is<"foo">()`.
    template<ctll::fixed_string Id>
    constexpr bool is() const {return id == std::u32string_view(Id.begin(), Id.size());}

    /// Same check as above for an id that is passed as a function argument.
    template<std::size_t Size>
    constexpr bool is(const ctll::fixed_string<Size> & id) const {return this->id == std::u32string_view(id.begin(), id.size());}

    /// Creates a result with an empty id and an empty value.
    constexpr token_result() noexcept {}
    constexpr token_result(std::u32string_view id, std::basic_string_view<char_type> value) noexcept : id(id), value(value)
    {
    }

    // Results are ordered by the position at which their match starts; the
    // operands are expected to view the same input. Neither the id nor the
    // length of the match takes part in the comparison.
    constexpr bool operator<(const token_result & rhs) const {return this->value.begin() < rhs.value.begin();}
    constexpr bool operator==(const token_result & rhs) const {return this->value.begin() == rhs.value.begin();}
    constexpr bool operator>(const token_result & rhs) const {return this->value.begin() > rhs.value.begin();}
};
template<token ... Tokens>
constexpr inline auto build_regex()
{
//the way we build it: (regex1)|(regex2)|(regex3) -> meaning SumSize + SizeOfElem * 2 + (SizeOfElem - 1) + '$'
constexpr auto SumSize = ( 0 + ... + Tokens.regex.size());
constexpr auto SizeOfElem = sizeof...(Tokens);
char32_t res[SumSize + (SizeOfElem * 4) - 1] = {};
constexpr std::array<std::u32string_view, SizeOfElem> input = { std::u32string_view(Tokens.regex.begin(), Tokens.regex.size())... };
auto idx = 0u;
res[idx++] = U'^';
res[idx++] = U'(';
for (auto & in : input)
{
//copy should be constexpr, isn't yet though...sooo, second loop aye
//std::copy(std::begin(in), std::end(in), &res[idx]);
for (auto idx_ = 0u; idx_ < in.size(); idx_ ++)
res[idx++] = in[idx_];
if (&in == (input.end() - 1))
break;
res[idx++] = U')';
res[idx++] = U'|';
res[idx++] = U'^';
res[idx++] = U'(';
}
res[idx] = U')';
return ctll::fixed_string<SumSize + (SizeOfElem * 4) - 1>(res);
}
/// Translates a match of the combined regex into a token result.
///
/// The n-th capture corresponds to the n-th token. The first capture that
/// took part in the match decides which token was found.
///
/// @param res  match result of the combined regex
/// @return the token result paired with the ignore flag of its token, or
///         std::nullopt if none of the captures matched
template<auto... Tokens, typename Iterator, typename... Captures>
constexpr auto get_resulting_token(const ctre::regex_results<Iterator, Captures...> & res)
    -> std::optional<std::pair<token_result<typename std::iterator_traits<Iterator>::value_type>, bool>>
{
    static_assert(sizeof...(Tokens) == sizeof...(Captures));
    using char_type = typename std::iterator_traits<Iterator>::value_type;
    using view_type = std::basic_string_view<char_type>;
    constexpr std::size_t count = sizeof...(Captures);

    // An unmatched capture becomes an empty optional, a matched one its view.
    constexpr auto to_optional_view = [](auto && capture) constexpr -> std::optional<view_type>
    {
        if (!capture)
            return std::nullopt;
        return capture.to_view();
    };

    const std::array<std::optional<view_type>, count> matched = {to_optional_view(res.template get<Captures::template storage<Iterator>::get_id()>())...};
    const std::array<std::pair<std::u32string_view, bool>, count> descriptors = {std::make_pair(Tokens.id_view(), Tokens.ignore)...};

    for (std::size_t pos = 0; pos < count; ++pos)
    {
        if (matched[pos])
            return std::make_pair(token_result<char_type>(descriptors[pos].first, *matched[pos]), descriptors[pos].second);
    }
    return std::nullopt;
}
template<auto Searcher, auto... Tokens, typename Iterator>
constexpr auto get_next_token(Iterator itr, Iterator end)
{
using char_type = typename std::iterator_traits<Iterator>::value_type;
auto match = Searcher(itr, end);
auto res = get_resulting_token<Tokens...>(match);
return res;
}
template<typename Iterator, auto Searcher, auto... Tokens>
struct token_iterator
{
constexpr token_iterator(Iterator begin, Iterator end) : _itr(begin), _end(end)
{
_seek_next();
}
constexpr token_iterator(const token_iterator &) = default;
constexpr token_iterator &operator=(const token_iterator &) = default;
using char_type = typename std::iterator_traits<Iterator>::value_type;
using value_type = token_result<char_type>;
using reference = value_type &;
using pointer = value_type *;
using difference_type = std::size_t;
using iterator_category = std::forward_iterator_tag;
constexpr reference operator*() {return *_current;}
constexpr reference operator*() const {return *_current;}
constexpr auto operator->() {return _current;}
constexpr auto operator->() const {return _current;}
constexpr operator bool() const {return _current.has_value();}
constexpr bool valid() const {return _current.has_value();}
constexpr auto current_position() {return _itr;}
constexpr auto end_position() {return _end;}
constexpr token_iterator& operator++()
{
_itr += _current ? _current->value.size() : 1u;
_seek_next();
return *this;
}
constexpr token_iterator operator++(int)
{
const auto res = *this;
_itr += _current ? _current->value.size() : 1u;
_seek_next();
return res;
}
constexpr bool eoi() const {return _itr == _end;}
constexpr bool operator< (const token_iterator& rhs) {return this->_itr < rhs._itr;}
constexpr bool operator==(const token_iterator& rhs) {return this->_itr == rhs._itr;}
constexpr bool operator> (const token_iterator& rhs) {return this->_itr > rhs._itr;}
constexpr bool operator!=(const token_iterator& rhs) {return this->_itr != rhs._itr;}
private:
constexpr void _seek_next()
{
while (_itr != _end)
{
const auto next_token = get_next_token<Searcher, Tokens...>(_itr, _end);
if (!next_token) //invalid token
{
_current = value_type({},{});
break; //not found
}
if (!next_token->second) //token valid, but not ignored
{
_current = std::optional(next_token->first);
return;
}
else
_itr += next_token->first.value.size();
}
}
std::optional<value_type> _current;
Iterator _itr;
Iterator _end;
};
/// Range of tokens over the character range [begin, end).
///
/// begin() and end() return token_iterators, so the range can be used in a
/// range-based for loop.
///
/// @tparam Iterator  iterator of the lexed input
/// @tparam Tokens    the token descriptions
template<typename Iterator, auto... Tokens>
struct token_range
{
    using char_type = typename std::iterator_traits<Iterator>::value_type;
    using value_type = token_result<char_type>;
    using reference = value_type &;

    /// ctre::search instantiated with the combined pattern of all tokens.
    constexpr static auto searcher = ctre::search<build_regex<Tokens...>()>;
    using iterator = token_iterator<Iterator, searcher, Tokens...>;

    // Takes a const reference so that const ranges and temporaries can be copied.
    constexpr token_range(const token_range&) = default;
    constexpr token_range(Iterator begin, Iterator end) : _begin(begin), _end(end) {}

    /// Iterator positioned on the first token that is not ignored.
    constexpr iterator begin() const {return iterator(_begin, _end);}
    /// Iterator positioned at the end of the input.
    constexpr iterator end() const {return iterator( _end, _end);}

    // Two ranges are equal if they cover the same part of the input.
    constexpr bool operator==(const token_range& rhs) const {return (_begin == rhs._begin) && (_end == rhs._end);}
    constexpr bool operator!=(const token_range& rhs) const {return !(*this == rhs);}
private:
    Iterator _begin;
    Iterator _end;
};
template<token ... Tokens>
constexpr auto tokenize(const std::string_view &sv)
{
using iterator = std::string_view::const_iterator ;
using range = token_range<iterator, Tokens...>;
return range(sv.begin(), sv.end());
}
}
/// Compile-time tests. They live in this header only because the gist is a
/// single file; all names below are declared at global scope.
// View of length 4 over the literal "foo", i.e. including its terminating
// NUL. It is not referenced again in the visible part of the file.
constexpr auto sv = std::string_view("foo", 4);
// Token construction: tk1 and tk2 are regular tokens, tk3 is an ignored one.
// tk3 is named "ws" although its pattern matches identifier characters; the
// assertion on rx below pins exactly that pattern.
constexpr auto tk1 = ctlex::token("foo", "fo+");
constexpr auto tk2 = ctlex::token("bar", "bar");
constexpr auto tk3 = ctlex::token("ws", "[a-zA-Z0-9_]+", std::ignore);
// Tokens have to be copyable in constant expressions.
constexpr auto tk_ = tk1;
constexpr ctlex::token tk_cp2 = tk1;
constexpr ctlex::token tk_cp3(tk1);
// Names the range type for the three tokens; the alias is not used further.
using tr = ctlex::token_range<char* , tk1, tk2, tk3>;
// build_regex anchors every pattern and wraps it in its own capture group.
constexpr auto rx = ctlex::build_regex<tk1, tk2, tk3>();
static_assert(std::u32string_view(rx.begin(), rx.size()) == U"^(fo+)|^(bar)|^([a-zA-Z0-9_]+)");
// Checks of ctre itself with a pattern of that shape: only the capture of
// the alternative matching at the start of the input is set.
constexpr auto match = ctre::search<"^(fo+)|^(bar)|^(\\s+)">("foo bar");
static_assert(match. get<1>());
static_assert(match. get<1>().to_view() == "foo");
static_assert(!match.get<2>());
static_assert(!match.get<3>());
constexpr auto match1 = ctre::search<"^(fo+)|^(bar)|^(\\s+)">(" bar");
static_assert(!match1.get<1>());
static_assert(!match1.get<2>());
static_assert( match1.get<3>());
static_assert( match1.get<3>().to_view() == " ");
constexpr auto match2 = ctre::search<"^(fo+)|^(bar)|^(\\s+)">("bar");
static_assert(!match2. get<1>());
static_assert( match2.get<2>());
static_assert( match2.get<2>().to_view() == "bar");
static_assert(!match2.get<3>());
// End-to-end: "fooo bar" lexes into the tokens foo("fooo") and bar("bar");
// the whitespace between them is matched by the ignored "ws" token.
constexpr auto seq = ctlex::tokenize<ctlex::token("foo", "fo+"),
ctlex::token("bar", "bar"),
ctlex::token("ws", "\\s+", std::ignore)>("fooo bar");
static_assert(seq.begin());
constexpr auto start = *seq.begin();
static_assert(start.is<"foo">());
static_assert(start.value == "fooo");
constexpr auto next = std::next(seq.begin());
static_assert(next);
static_assert(next->is<"bar">());
static_assert(next->value == "bar");
// Advancing twice from the beginning reaches the end of the range.
constexpr auto eend = std::next(seq.begin(), 2);
static_assert(seq.end() == eend);
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment