Skip to content

Instantly share code, notes, and snippets.

@sehe
Last active July 8, 2020 11:07
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sehe/8678988 to your computer and use it in GitHub Desktop.
Save sehe/8678988 to your computer and use it in GitHub Desktop.
converting regular expressions to dot graphs (with boost spirit)
*.o
a.out
test
.*.swp
*~
*.png
tags
#pragma once
#include <set>
#include <vector>
#include <string>
#include <boost/tuple/tuple.hpp> // for charset
#include <boost/tuple/tuple_comparison.hpp>
#include <boost/variant.hpp> // for tree nodes
#include <boost/optional.hpp> // for multiplicity upperbound
namespace ast
{
struct multiplicity
{
unsigned minoccurs;
boost::optional<unsigned> maxoccurs;
bool greedy;
multiplicity(unsigned minoccurs = 1, boost::optional<unsigned> maxoccurs = 1)
: minoccurs(minoccurs), maxoccurs(maxoccurs), greedy(true)
{ }
bool unbounded() const { return !maxoccurs; }
bool repeating() const { return !maxoccurs || *maxoccurs > 1; }
};
struct charset
{
bool negated;
using range = boost::tuple<char, char>; // from, till
using element = boost::variant<char, range>;
std::set<element> elements;
// TODO: single set for loose elements, simplify() method
};
struct start_of_match {};
struct end_of_match {};
struct any_char {};
struct group;
typedef boost::variant< // unquantified expression
start_of_match,
end_of_match,
any_char,
charset,
std::string, // literal
boost::recursive_wrapper<group> // sub expression
> simple;
struct atom // quantified simple expression
{
simple expr;
multiplicity mult;
};
using sequence = std::vector<atom>;
using alternative = std::vector<sequence>;
using regex = boost::variant<atom, sequence, alternative>;
struct group {
alternative root;
group() = default;
group(alternative root) : root(std::move(root)) { }
};
}
#include "ast.hpp"
#include "parser.hpp"
#include <set>
#include <map>
#include <sstream>
#include <functional>
static std::string multiplicity_text(ast::multiplicity const& m) {
std::ostringstream os;
//
if (m.minoccurs == 0 && m.unbounded()) os << '*' ;
else if (m.minoccurs == 1 && m.unbounded()) os << '+' ;
else if (m.unbounded()) os << '{' << m.minoccurs << ",}" ;
// cannot be unbounded beyond this point
else if (m.minoccurs == 1 && *m.maxoccurs == 1) { ; }
else if (m.minoccurs == *m.maxoccurs) os << '{' << m.minoccurs << '}' ;
else if (m.minoccurs == 0 && *m.maxoccurs == 1) os << '?' ;
else if (m.minoccurs == 0) os << "{," << *m.maxoccurs << '}' ;
else os << '{' << m.minoccurs << "," << *m.maxoccurs << "}" ;
// non-greedyness
if (m.repeating() && !m.greedy)
os << "?";
return os.str();
}
static std::ostream& escape_into(std::ostream& os, char v, bool escape_dquote = false) {
switch(v)
{
case '\\': return os << "\\\\";
case '\n': return os << "\\n";
case '\t': return os << "\\t";
case '\r': return os << "\\r";
case '\b': return os << "\\b";
case '\0': return os << "\\0";
case '"': return os << (escape_dquote? "\\\"" : "\"");
default:
return os << v; // TODO more escapes
}
}
static std::ostream& escape_into(std::ostream& os, std::string const& v, bool escape_dquote = false) {
for(auto ch : v)
escape_into(os, ch, escape_dquote);
return os;
}
struct regex_tostring : boost::static_visitor<>
{
std::ostream& os;
int id = 0;
regex_tostring(std::ostream& os) : os(os) {}
~regex_tostring() { os << "\n"; }
void operator()(ast::alternative const & a) const {
for (size_t i = 0; i<a.size(); ++i)
{
if (i) os << '|';
(*this)(a[i]);
}
}
void operator()(ast::atom const & v) const {
boost::apply_visitor(*this, v.expr);
(*this)(v.mult);
}
void operator()(ast::start_of_match const & v) const { os << '^'; }
void operator()(ast::end_of_match const & v) const { os << '$'; }
void operator()(ast::any_char const & v) const { os << '.'; }
void operator()(ast::group const & v) const { os << '('; (*this)(v.root); os << ')'; }
void operator()(char const v) const { escape_into(os, v); }
void operator()(std::string const & v) const { escape_into(os, v); }
void operator()(std::vector<ast::atom> const & v) const { for(auto& atom : v) (*this)(atom); }
void operator()(ast::multiplicity const & m) const { os << multiplicity_text(m); }
void operator()(ast::charset const & v) const {
os << '[';
if (v.negated) os << '^';
for(auto& el : v.elements) boost::apply_visitor(*this, el);
os << ']';
}
void operator()(ast::charset::range const & v) const {
using std::get;
escape_into(os, get<0>(v)) << "-";
escape_into(os, get<1>(v));
}
};
struct regex_todigraph : boost::static_visitor<std::string>
{
std::ostream& os;
regex_todigraph(std::ostream& os, std::string const& label) : os(os)
{
static int graph_id = 0;
os << "subgraph cluster_regex" << std::to_string(++graph_id) << " {\n";
os << "style=\"dashed\";\n";
os << "node[fontname=\"times,italic\"];\n";
os << "label=\"";
escape_into(os, label, true) << "\";\n\n";
}
~regex_todigraph()
{
os << "}\n";
}
private:
using attributes = std::map<std::string, std::string>;
static attributes merge(attributes const& a, attributes const& b)
{
attributes c = a;
for (auto& e : b)
c.insert(e);
return c;
}
static attributes box() { return { { "shape", "box" } }; }
static attributes diamond() { return { { "shape", "diamond" }, { "style", "rounded," } }; }
static attributes sequence() { return { { "shape", "Mrecord" } }; }
static attributes special() { return { { "shape", "box" }, { "style", "filled," }, { "fillcolor", "gray" }, { "fontsize", "10" } }; }
static attributes literal() { return { { "fontname", "Courier" } }; }
std::string emit_node(std::string const& caption, ast::multiplicity mult=ast::multiplicity(), attributes attrs = attributes()) const
{
static int id = 0;
auto const thisnode = "node" + std::to_string(id++);
auto const quantifier_text = multiplicity_text(mult);
attrs.emplace("fontname", "Times,italic");
attrs.emplace("label", caption + quantifier_text);
if (!mult.minoccurs)
attrs["style"] += "dotted,";
os << thisnode << "[";
for(auto& attr : attrs)
{
escape_into(os, attr.first, true);
os << "=\"";
escape_into(os, attr.second, true) << "\",";
}
os << "];\n";
if (mult.repeating())
{
// add self-referential edge for repeat (I know, it isn't a state chart...)
os << thisnode << " -> " << thisnode
<< " [label=\"" << quantifier_text << "\""
<< (mult.greedy? ",color=red,arrowhead=dot" : ",color=blue,arrowhead=odot")
<< "];\n";
}
return thisnode;
}
std::string emit_node(std::string const& caption, attributes const& style) const
{
return emit_node(caption, ast::multiplicity(), style);
}
void emit_vertices(std::string const& parent, std::set<std::string> const& children) const {
os << parent << " -> {";
std::copy(begin(children), end(children), std::ostream_iterator<std::string>(os, "; "));
os << "}\n";
}
public:
std::string operator()(ast::alternative const& a) const {
if (a.size()>1)
{
std::set<std::string> children;
for (auto& branch: a)
children.insert((*this)(branch));
std::string const thisnode = emit_node("alternative", diamond());
emit_vertices(thisnode, children);
return thisnode;
} else
{
return (*this)(a[0]);
}
}
std::string operator()(ast::start_of_match const& v, ast::multiplicity mult = {}) const {
return emit_node("start-of-match", mult, special());
}
std::string operator()(ast::end_of_match const& v, ast::multiplicity mult = {}) const {
return emit_node("end-of-match", mult, special());
}
std::string operator()(ast::any_char const& v, ast::multiplicity mult = {}) const {
return emit_node("any", mult, special());
}
std::string operator()(ast::group const& v, ast::multiplicity mult = {}) const {
{
auto child = (*this)(v.root);
std::string const thisnode = emit_node("group", mult);
emit_vertices(thisnode, { child });
return thisnode;
}
}
std::string operator()(ast::charset const& v, ast::multiplicity mult = {}) const {
std::set<std::string> children;
for (auto& el: v.elements)
children.insert(boost::apply_visitor(*this, el));
std::string const thisnode = emit_node(
v.negated?"negated-charset":"charset",
mult,
{ { "fontcolor", v.negated?"red":"blue" } });
emit_vertices(thisnode, children);
return thisnode;
}
std::string operator()(char v) const {
std::ostringstream os;
os << "'";
escape_into(os, v) << "'";
return emit_node(os.str(), merge(box(), literal()));
}
std::string operator()(ast::charset::range const& v) const {
std::ostringstream os;
using std::get;
os << "'";
escape_into(os, get<0>(v)) << "…";
escape_into(os, get<1>(v)) << "'";
return emit_node(os.str(), merge(box(), literal()));
}
std::string operator()(std::string const& v, ast::multiplicity mult = {}) const {
std::ostringstream os;
os << (v.length()==1?"'":"\"");
escape_into(os, v);
os << (v.length()==1?"'":"\"");
return emit_node(os.str(), mult, merge(box(), literal()));
}
std::string operator()(std::vector<ast::atom> const& v, ast::multiplicity mult = {}) const {
if (v.size()>1)
{
std::set<std::string> children;
for (auto& atom: v)
children.insert((*this)(atom));
std::string const thisnode = emit_node("sequence", mult, sequence());
emit_vertices(thisnode, children);
return thisnode;
} else
{
for (auto& atom: v)
return (*this)(atom);
}
return "";
}
std::string operator()(ast::atom const& atom) const {
#if 1
// simplification that hides the 'atom' intermediate node and applies
// `multiplicity` decoration directly on the simple node or sub group
return boost::apply_visitor(std::bind(std::ref(*this), std::placeholders::_1, atom.mult), atom.expr);
#else
// the "classical approach" emits the child and links it to an "atom" node
auto child = boost::apply_visitor(*this, atom.expr);
if (atom.mult == ast::multiplicity())
return child;
auto anode = emit_node("atom", atom.mult);
emit_vertices(anode, { child });
return anode;
#endif
}
};
void check_roundtrip(ast::regex tree, std::string input)
{
std::ostringstream os;
regex_tostring str(os);
boost::apply_visitor(str, tree);
if (os.str() != input)
std::cerr << "WARNING: '" << input << "' -> '" << os.str() << "'\n";
}
int main()
{
std::cout << "digraph common {\n";
for (std::string pattern: {
"abc?",
"ab+c",
"(ab)+c",
"[^-a\\-f-z\"\\]aaaa-]?",
"abc|d",
"a?",
".*?(a|b){,9}?",
"(XYZ)|(123)",
})
{
std::cout << "// ================= " << pattern << " ========\n";
ast::regex tree;
if (doParse(pattern, tree))
{
check_roundtrip(tree, pattern);
regex_todigraph printer(std::cout, pattern);
boost::apply_visitor(printer, tree);
}
}
std::cout << "}\n";
}
all:test
CPPFLAGS+=-std=c++0x -Wall -pedantic
CPPFLAGS+=-g -O0
CPPFLAGS+=-isystem ~/custom/boost/
# CPPFLAGS+=-fopenmp
# CPPFLAGS+=-march=native
# LDFLAGS+=-L ~/custom/boost/stage/lib/ -Wl,-rpath,/home/sehe/custom/boost/stage/lib
# LDFLAGS+=-lboost_system -lboost_regex -lboost_thread -lpthread -lboost_iostreams -lboost_serialization
# CXX=/usr/lib/gcc-snapshot/bin/g++
# CC=/usr/lib/gcc-snapshot/bin/gcc
# CXX=clang++
# CC=clang
%.o: %.cpp ast.hpp
$(CXX) $(CPPFLAGS) $< -c -o $@
test: main.o parser.o
$(CXX) $(CPPFLAGS) $^ -o $@ $(LDFLAGS)
// #define BOOST_SPIRIT_DEBUG
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include "parser.hpp"
#include "ast.hpp"
#include <boost/fusion/adapted.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
BOOST_FUSION_ADAPT_STRUCT(ast::charset,
(bool, negated)
(std::set<ast::charset::element>, elements))
BOOST_FUSION_ADAPT_STRUCT(ast::atom,
(ast::simple, expr)
(ast::multiplicity, mult))
#if defined(BOOST_SPIRIT_DEBUG)
namespace ast { // DEBUG facilities
template <typename T> std::ostream& operator<<(std::ostream& os, T const& v) {
using std::operator<<;
return os << std::string(typeid(v).name());
}
}
#endif
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
template <typename It>
struct parser : qi::grammar<It, ast::alternative()>
{
parser() : parser::base_type(alternative)
{
using namespace qi;
using phx::construct;
using ast::multiplicity;
alternative = sequence % '|';
sequence = *atom;
simple =
(group)
| (charset)
| ('.' >> qi::attr(ast::any_char()))
| ('^' >> qi::attr(ast::start_of_match()))
| ('$' >> qi::attr(ast::end_of_match()))
// optimize literal tree nodes by grouping unquantified literal chars
| (as_string [ +(literal >> !char_("{?+*")) ])
| (as_string [ literal ]) // lone char/escape + explicit_quantifier
;
atom = (simple >> quantifier); // quantifier may be implicit
explicit_quantifier =
// bounded ranges:
lit('?') [ _val = construct<multiplicity>( 0, 1) ]
| ('{' >> uint_ >> '}' ) [ _val = construct<multiplicity>(_1, _1) ]
// repeating ranges can be marked non-greedy:
| (
lit('+') [ _val = construct<multiplicity>( 1, boost::none) ]
| lit('*') [ _val = construct<multiplicity>( 0, boost::none) ]
| ('{' >> uint_ >> ",}") [ _val = construct<multiplicity>(_1, boost::none) ]
| ('{' >> uint_ >> "," >> uint_ >> '}') [ _val = construct<multiplicity>(_1, _2) ]
| ("{," >> uint_ >> '}' ) [ _val = construct<multiplicity>( 0, _1) ]
) >> -lit('?') [ phx::bind(&multiplicity::greedy, _val) = false ]
;
quantifier = explicit_quantifier | attr(ast::multiplicity());
charset = '['
>> (lit('^') >> attr(true) | attr(false)) // negated
>> *(range | charset_el)
> ']'
;
range = charset_el >> '-' >> charset_el;
group = '(' >> alternative >> ')';
literal = unescape | ~char_("\\+*?.^$|{()") ;
unescape = ('\\' > char_);
// helper to optionally unescape waiting for raw ']'
charset_el = !lit(']') >> (unescape|char_);
BOOST_SPIRIT_DEBUG_NODES(
(alternative) (sequence)
(simple) (atom)
(explicit_quantifier) (quantifier)
(charset) (charset_el) (range) (group) (literal) (unescape)
)
}
private:
qi::rule<It, ast::alternative()> alternative;
qi::rule<It, ast::sequence()> sequence;
qi::rule<It, ast::atom()> atom;
qi::rule<It, ast::simple()> simple;
qi::rule<It, ast::multiplicity()> explicit_quantifier, quantifier;
qi::rule<It, ast::charset()> charset;
qi::rule<It, ast::charset::range()> range;
qi::rule<It, ast::group()> group;
qi::rule<It, char()> literal, unescape, charset_el;
};
bool doParse(const std::string& input, ast::regex& data)
{
typedef std::string::const_iterator It;
static const parser<It> p;
try
{
auto f(begin(input)), l(end(input));
bool ok = qi::parse(f,l,p,data);
if (!ok)
std::cerr << "parse failed: '" << std::string(f,l) << "'\n";
if (f!=l) std::cerr << "trailing unparsed: '" << std::string(f,l) << "'\n";
return ok;
} catch(const qi::expectation_failure<It>& e)
{
std::string frag(e.first, e.last);
std::cerr << e.what() << " at '" << frag << "'\n";
return false;
}
}
#pragma once
#include "ast.hpp"
#include <string>
bool doParse(const std::string& input, ast::regex& data);
@tomalakgeretkal
Copy link

I think you should rename the gist.

@sehe
Copy link
Author

sehe commented Feb 21, 2014

@tomalakgeretkal anything specific in mind?

@dieterziehe
Copy link

Hello, thank you for your article on stackoverflow with the example
AST_of_a_regular_expression_string. One thing, what I didn't understand is :
BOOST_FUSION_ADAPT_STRUCT
I commented it out an I extended struct charset_t and struct atom_t
with constructors like yours in struct group_t

I could compile, but the program went wrong
(check_roundtrip failed). I didn't understand the whole magic from
BOOST_FUSION_ADAPT_STRUCT in your case boost/spirit.

Do you know, what exactly BOOST_FUSION_ADAPT_STRUCT
generate (which constructors with which content) ?

Would be nice, If you could answer

Dieter Ziehe, Germany, Berlin

I find boost interesting, but at least 10 times more difficult than normal C++11/14/17/20

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment