Created
June 8, 2019 07:45
-
-
Save klemens-morgenstern/cd3b1ea42e0d8438b6ae11a31cf3d268 to your computer and use it in GitHub Desktop.
ctlex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef CTLEX_HPP
#define CTLEX_HPP

#include <algorithm>
#include <array>
#include <cstddef>
#include <iterator>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <tuple>
#include <utility>

#include <ctre.hpp>
namespace ctlex
{
/// Exception type for lexer failures; behaves like std::runtime_error and
/// additionally accepts a std::string_view as the message.
class lexer_error : public std::runtime_error
{
public:
    // Re-use the std::runtime_error constructors (std::string / const char*).
    using std::runtime_error::runtime_error;

    /// Builds the error from a view; the viewed characters are copied into the
    /// exception, so the view does not have to outlive it.
    lexer_error(std::string_view sv) : std::runtime_error(std::string(sv)) {}
};
template<std::size_t SizeId, std::size_t SizeRegex> | |
struct token | |
{ | |
ctll::fixed_string<SizeId> id; | |
ctll::fixed_string<SizeRegex> regex; | |
bool ignore = false; | |
constexpr std::u32string_view id_view() const {return {id.begin(), id.size()};} | |
constexpr token(const token & tk) noexcept : id(tk.id), regex(tk.regex), ignore(tk.ignore) {} | |
constexpr token& operator=(const token & tk) noexcept | |
{ | |
id = tk.id; | |
regex = tk.regex; | |
ignore= tk.ignore; | |
return *this; | |
} | |
template<typename IdChar, typename RegexChar> | |
constexpr token(const IdChar (&id)[SizeId], const RegexChar (®ex)[SizeRegex]) noexcept : regex(regex), id(id) | |
{ | |
} | |
template<typename IdChar, typename RegexChar> | |
constexpr token(const IdChar (&id)[SizeId], const RegexChar (®ex)[SizeRegex], decltype(std::ignore)) noexcept : regex(regex), id(id), ignore(true) | |
{ | |
} | |
template<std::size_t RhsSizeId, std::size_t RhsSizeRegex> | |
constexpr bool operator<(const token<RhsSizeId, RhsSizeRegex> & rhs) {return this->id < rhs.id;} | |
template<std::size_t RhsSizeId, std::size_t RhsSizeRegex> | |
constexpr bool operator==(const token<RhsSizeId, RhsSizeRegex> & rhs) {return this->id == rhs.id;} | |
template<std::size_t RhsSizeId, std::size_t RhsSizeRegex> | |
constexpr bool operator>(const token<RhsSizeId, RhsSizeRegex> & rhs) {return this->id > id;} | |
}; | |
/// Empty tag type.
/// NOTE(review): presumably meant to mark an unrecognised token; nothing in the
/// code shown here refers to it.
struct invalid_token_tag {};
template<typename char_type> | |
struct token_result | |
{ | |
std::u32string_view id; | |
std::basic_string_view<char_type> value; | |
template<ctll::fixed_string Id> constexpr bool is() const {return id == std::u32string_view(Id.begin(), Id.size());} | |
template<std::size_t Size> constexpr bool is(const ctll::fixed_string<Size> & id) const {return this->id == std::u32string_view(id.begin(), id.size());} | |
constexpr token_result() noexcept {}; | |
constexpr token_result(std::u32string_view id, std::basic_string_view<char_type> value) noexcept : id(id), value(value) | |
{ | |
} | |
template<std::size_t RhsSizeId, std::size_t RhsSizeRegex> | |
constexpr bool operator<(const token<RhsSizeId, RhsSizeRegex> & rhs) {return this->value.begin() < rhs.value.begin();} | |
template<std::size_t RhsSizeId, std::size_t RhsSizeRegex> | |
constexpr bool operator==(const token<RhsSizeId, RhsSizeRegex> & rhs) {return this->value.begin() == rhs.value.begin();} | |
template<std::size_t RhsSizeId, std::size_t RhsSizeRegex> | |
constexpr bool operator>(const token<RhsSizeId, RhsSizeRegex> & rhs) {return this->value.begin() > value.begin();} | |
}; | |
template<token ... Tokens> | |
constexpr inline auto build_regex() | |
{ | |
//the way we build it: (regex1)|(regex2)|(regex3) -> meaning SumSize + SizeOfElem * 2 + (SizeOfElem - 1) + '$' | |
constexpr auto SumSize = ( 0 + ... + Tokens.regex.size()); | |
constexpr auto SizeOfElem = sizeof...(Tokens); | |
char32_t res[SumSize + (SizeOfElem * 4) - 1] = {}; | |
constexpr std::array<std::u32string_view, SizeOfElem> input = { std::u32string_view(Tokens.regex.begin(), Tokens.regex.size())... }; | |
auto idx = 0u; | |
res[idx++] = U'^'; | |
res[idx++] = U'('; | |
for (auto & in : input) | |
{ | |
//copy should be constexpr, isn't yet though...sooo, second loop aye | |
//std::copy(std::begin(in), std::end(in), &res[idx]); | |
for (auto idx_ = 0u; idx_ < in.size(); idx_ ++) | |
res[idx++] = in[idx_]; | |
if (&in == (input.end() - 1)) | |
break; | |
res[idx++] = U')'; | |
res[idx++] = U'|'; | |
res[idx++] = U'^'; | |
res[idx++] = U'('; | |
} | |
res[idx] = U')'; | |
return ctll::fixed_string<SumSize + (SizeOfElem * 4) - 1>(res); | |
} | |
template<auto... Tokens, typename Iterator, typename... Captures> | |
constexpr auto get_resulting_token(const ctre::regex_results<Iterator, Captures...> & res) | |
-> std::optional<std::pair<token_result<typename std::iterator_traits<Iterator>::value_type>, bool>> | |
{ | |
static_assert(sizeof...(Tokens) == sizeof...(Captures)); | |
using char_type = typename std::iterator_traits<Iterator>::value_type; | |
constexpr auto transform_cap = [](auto && val) constexpr -> std::optional<std::basic_string_view<char_type>> | |
{ | |
if (val) | |
return val.to_view(); | |
else | |
return std::nullopt; | |
}; | |
std::array<std::optional<std::basic_string_view<char_type>>, sizeof...(Captures)> caps = {transform_cap(res.template get<Captures::template storage<Iterator>::get_id()>())...}; | |
using char_type = typename std::iterator_traits<Iterator>::value_type; | |
std::array<std::pair<std::u32string_view, bool>, sizeof...(Tokens)> toks = {std::make_pair(Tokens.id_view(), Tokens.ignore)...}; | |
auto idx = 0u; | |
for (; idx < sizeof...(Captures); idx++) | |
if (caps[idx]) | |
break; | |
if (idx == sizeof...(Captures)) | |
return std::nullopt; | |
auto val = *caps[idx]; | |
return std::make_pair(token_result<char_type>(toks[idx].first, val), toks[idx].second); | |
} | |
template<auto Searcher, auto... Tokens, typename Iterator> | |
constexpr auto get_next_token(Iterator itr, Iterator end) | |
{ | |
using char_type = typename std::iterator_traits<Iterator>::value_type; | |
auto match = Searcher(itr, end); | |
auto res = get_resulting_token<Tokens...>(match); | |
return res; | |
} | |
template<typename Iterator, auto Searcher, auto... Tokens> | |
struct token_iterator | |
{ | |
constexpr token_iterator(Iterator begin, Iterator end) : _itr(begin), _end(end) | |
{ | |
_seek_next(); | |
} | |
constexpr token_iterator(const token_iterator &) = default; | |
constexpr token_iterator &operator=(const token_iterator &) = default; | |
using char_type = typename std::iterator_traits<Iterator>::value_type; | |
using value_type = token_result<char_type>; | |
using reference = value_type &; | |
using pointer = value_type *; | |
using difference_type = std::size_t; | |
using iterator_category = std::forward_iterator_tag; | |
constexpr reference operator*() {return *_current;} | |
constexpr reference operator*() const {return *_current;} | |
constexpr auto operator->() {return _current;} | |
constexpr auto operator->() const {return _current;} | |
constexpr operator bool() const {return _current.has_value();} | |
constexpr bool valid() const {return _current.has_value();} | |
constexpr auto current_position() {return _itr;} | |
constexpr auto end_position() {return _end;} | |
constexpr token_iterator& operator++() | |
{ | |
_itr += _current ? _current->value.size() : 1u; | |
_seek_next(); | |
return *this; | |
} | |
constexpr token_iterator operator++(int) | |
{ | |
const auto res = *this; | |
_itr += _current ? _current->value.size() : 1u; | |
_seek_next(); | |
return res; | |
} | |
constexpr bool eoi() const {return _itr == _end;} | |
constexpr bool operator< (const token_iterator& rhs) {return this->_itr < rhs._itr;} | |
constexpr bool operator==(const token_iterator& rhs) {return this->_itr == rhs._itr;} | |
constexpr bool operator> (const token_iterator& rhs) {return this->_itr > rhs._itr;} | |
constexpr bool operator!=(const token_iterator& rhs) {return this->_itr != rhs._itr;} | |
private: | |
constexpr void _seek_next() | |
{ | |
while (_itr != _end) | |
{ | |
const auto next_token = get_next_token<Searcher, Tokens...>(_itr, _end); | |
if (!next_token) //invalid token | |
{ | |
_current = value_type({},{}); | |
break; //not found | |
} | |
if (!next_token->second) //token valid, but not ignored | |
{ | |
_current = std::optional(next_token->first); | |
return; | |
} | |
else | |
_itr += next_token->first.value.size(); | |
} | |
} | |
std::optional<value_type> _current; | |
Iterator _itr; | |
Iterator _end; | |
}; | |
template<typename Iterator, auto... Tokens> | |
struct token_range | |
{ | |
using char_type = typename std::iterator_traits<Iterator>::value_type; | |
using value_type = token_result<char_type>; | |
using reference = value_type &; | |
constexpr static auto searcher = ctre::search<build_regex<Tokens...>()>; | |
using iterator = token_iterator<Iterator, searcher, Tokens...>; | |
constexpr token_range(token_range&) = default; | |
constexpr token_range(Iterator begin, Iterator end) : _begin(begin), _end(end) {} | |
constexpr iterator begin() const {return iterator(_begin, _end);} | |
constexpr iterator end() const {return iterator( _end, _end);} | |
constexpr bool operator==(const token_range& rhs) const {return (_begin == rhs._begin) && (_end == rhs._end);} | |
constexpr bool operator!=(const token_range& rhs) const {return (_begin != rhs._begin) || (_end != rhs._end);} | |
private: | |
Iterator _begin; | |
Iterator _end; | |
}; | |
template<token ... Tokens> | |
constexpr auto tokenize(const std::string_view &sv) | |
{ | |
using iterator = std::string_view::const_iterator ; | |
using range = token_range<iterator, Tokens...>; | |
return range(sv.begin(), sv.end()); | |
} | |
} // namespace ctlex
///SOME TESTS, only here because gist.. | |
constexpr auto sv = std::string_view("foo", 4); | |
constexpr auto tk1 = ctlex::token("foo", "fo+"); | |
constexpr auto tk2 = ctlex::token("bar", "bar"); | |
constexpr auto tk3 = ctlex::token("ws", "[a-zA-Z0-9_]+", std::ignore); | |
constexpr auto tk_ = tk1; | |
constexpr ctlex::token tk_cp2 = tk1; | |
constexpr ctlex::token tk_cp3(tk1); | |
using tr = ctlex::token_range<char* , tk1, tk2, tk3>; | |
constexpr auto rx = ctlex::build_regex<tk1, tk2, tk3>(); | |
static_assert(std::u32string_view(rx.begin(), rx.size()) == U"^(fo+)|^(bar)|^([a-zA-Z0-9_]+)"); | |
constexpr auto match = ctre::search<"^(fo+)|^(bar)|^(\\s+)">("foo bar"); | |
static_assert(match. get<1>()); | |
static_assert(match. get<1>().to_view() == "foo"); | |
static_assert(!match.get<2>()); | |
static_assert(!match.get<3>()); | |
constexpr auto match1 = ctre::search<"^(fo+)|^(bar)|^(\\s+)">(" bar"); | |
static_assert(!match1.get<1>()); | |
static_assert(!match1.get<2>()); | |
static_assert( match1.get<3>()); | |
static_assert( match1.get<3>().to_view() == " "); | |
constexpr auto match2 = ctre::search<"^(fo+)|^(bar)|^(\\s+)">("bar"); | |
static_assert(!match2. get<1>()); | |
static_assert( match2.get<2>()); | |
static_assert( match2.get<2>().to_view() == "bar"); | |
static_assert(!match2.get<3>()); | |
constexpr auto seq = ctlex::tokenize<ctlex::token("foo", "fo+"), | |
ctlex::token("bar", "bar"), | |
ctlex::token("ws", "\\s+", std::ignore)>("fooo bar"); | |
static_assert(seq.begin()); | |
constexpr auto start = *seq.begin(); | |
static_assert(start.is<"foo">()); | |
static_assert(start.value == "fooo"); | |
constexpr auto next = std::next(seq.begin()); | |
static_assert(next); | |
static_assert(next->is<"bar">()); | |
static_assert(next->value == "bar"); | |
constexpr auto eend = std::next(seq.begin(), 2); | |
static_assert(seq.end() == eend); | |
#endif // CTLEX_HPP
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment