Skip to content

Instantly share code, notes, and snippets.

@OrbitZore
Last active April 13, 2022 15:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save OrbitZore/7b7ed7d660bd08102922a42f13a0ea15 to your computer and use it in GitHub Desktop.
Save OrbitZore/7b7ed7d660bd08102922a42f13a0ea15 to your computer and use it in GitHub Desktop.
regex to NFA,NFA to DFA,simplify DFA and match strings
/*
Please use C++20
*/
#include <bits/stdc++.h>
using namespace std;
struct FAST_IO {
FAST_IO() {
ios_base::sync_with_stdio(false);
cin.tie(NULL);
}
} __fast_io;
#if __cplusplus < 201402L
template <class T, class U = T>
T exchange(T& obj, U&& new_value) {
T old_value = move(obj);
obj = forward<U>(new_value);
return old_value;
}
#endif
#define cons(a, ...) a = typename decay<decltype(a)>::type(__VA_ARGS__)
using INT = int;
#define x first
#define y second
//#define int long long
#define pb push_back
#define eb emplace_back
#define all(a) (a).begin(), (a).end()
auto& _ = std::ignore;
using ll = long long;
template <class T>
using vec = vector<T>;
template <bool B, class T = void>
using enableif_t = typename enable_if<B, T>::type;
#define DEF_COULD(name, exp) \
template <class U> \
struct name { \
template <class T> \
constexpr static auto is(int i) -> decltype(exp, true) { \
return true; \
} \
template <class T> \
constexpr static bool is(...) { \
return false; \
} \
static const bool value = is<U>(1); \
};
#define DEF_CAN(name, exp) DEF_COULD(can##name, exp)
#define ENABLE(T, name) enableif_t<can##name<T>::value>(1)
#define ENABLEN(T, name) enableif_t<!can##name<T>::value>(1)
#define FOR_TUPLE enableif_t<i != tuple_size<T>::value>(1)
#define END_TUPLE enableif_t<i == tuple_size<T>::value>(1)
#define FOR_TUPLET(T) enableif_t<i != tuple_size<T>::value>(1)
#define END_TUPLET(T) enableif_t<i == tuple_size<T>::value>(1)
#define DEF_INF(name, exp) \
constexpr struct { \
template <class T> \
constexpr operator T() const { \
return numeric_limits<T>::exp(); \
} \
} name;
DEF_CAN(Out, (cout << *(T*)(0)))
DEF_CAN(For, begin(*(T*)(0)))
DEF_INF(INF, max) DEF_INF(MINF, min)
template <size_t i, class T>
auto operator>>(istream& is, T& r) -> decltype(END_TUPLE, is) {
return is;
}
template <size_t i = 0, class T>
auto operator>>(istream& is, T& r) -> decltype(FOR_TUPLE, is) {
is >> get<i>(r);
return operator>><i + 1>(is, r);
}
template <size_t i, class... Args>
auto operator>>(istream& is, const tuple<Args&...>& r)
-> decltype(END_TUPLET(tuple<Args&...>), is) {
return is;
}
template <size_t i = 0, class... Args>
auto operator>>(istream& is, const tuple<Args&...>& r)
-> decltype(FOR_TUPLET(tuple<Args&...>), is) {
is >> get<i>(r);
return operator>><i + 1>(is, r);
}
template <class T>
auto __format(ostream& os, const char* c, const T& cv)
-> decltype(ENABLE(T, Out), c + 1);
template <size_t i, class T>
auto __format(ostream& os, const char* c, const T& cv)
-> decltype(ENABLEN(T, For), END_TUPLE, c + 1);
template <size_t i = 0, class T>
auto __format(ostream& os, const char* c, const T& cv)
-> decltype(ENABLEN(T, For), FOR_TUPLE, c + 1);
template <class T>
auto __format(ostream& os, const char* c, const T& cv)
-> decltype(ENABLEN(T, Out), ENABLE(T, For), c + 1);
template <class T>
auto __format(ostream& os, const char* c, const T& cv)
-> decltype(ENABLE(T, Out), c + 1) {
os << cv;
while (*c != '}') c++;
return c + 1;
}
template <size_t i, class T>
auto __format(ostream& os, const char* c, const T& cv)
-> decltype(ENABLEN(T, For), END_TUPLE, c + 1) {
return c;
}
template <size_t i, class T>
auto __format(ostream& os, const char* c, const T& cv)
-> decltype(ENABLEN(T, For), FOR_TUPLE, c + 1) {
while (*c != '{') os << *c++;
c = __format(os, c, get<i>(cv));
return __format<i + 1>(os, c, cv);
}
template <class T>
auto __format(ostream& os, const char* c, const T& cv)
-> decltype(ENABLEN(T, Out), ENABLE(T, For), c + 1) {
const char* ct = c + 1;
if (cv.size() == 0) {
int b = 1;
while (1) {
if (*ct == '}') b--;
if (*ct == '{') b++;
if (!b) break;
ct++;
}
} else {
for (auto& i : cv) {
const char* cc = c + 1;
while (*cc != '{') os << *cc++;
cc = __format(os, cc, i);
while (*cc != '}') os << *cc++;
ct = cc;
}
}
return ct + 1;
}
void _format(ostream& os, const char* c) {
while (*c != '{' && *c != '\0') os << *c++;
}
template <class T, class... Args>
void _format(ostream& os, const char* c, const T& a, const Args&... rest) {
while (*c != '{' && *c != '\0') os << *c++;
if (*c == '{') c = __format(os, c, a);
_format(os, c, rest...);
}
template <class... Args>
string format(const char* c, const Args&... rest) {
ostringstream os;
_format(os, c, rest...);
return os.str();
}
template <class... Args>
ostream& print(const char* c, const Args&... rest) {
return _format(cout, c, rest...), cout;
}
template <class... Args>
ostream& println(const char* c, const Args&... rest) {
return print(c, rest...) << endl;
}
template <class T>
T cinget() {
T a;
cin >> a;
return a;
}
template <class keyT, template <class...> class containerT>
struct Node {
containerT<keyT, Node*> to;
};
template <class NodeT>
struct FA {
using node_type = NodeT;
list<NodeT> nodes;
template <class... Args>
NodeT* newnode(Args&&... args) {
nodes.emplace_back(forward<Args>(args)...);
return &nodes.back();
}
template <class... Args>
void resize(int n, Args... args) {
for (int i = 0; i < n; i++) nodes.emplace_back(args...);
}
NodeT* startnodeptr;
map<NodeT*, set<int>> endset;
};
using DFANode = Node<char, map>;
using NDFANode = Node<char, multimap>;
using NFANode = Node<string, multimap>;
using DFA = FA<DFANode>;
using NDFA = FA<NDFANode>;
using NFA = FA<NFANode>;
template <class W, class... IT, class funcT>
void BFS(funcT func, W ini, IT... iniargs) {
list<W> q{move(ini), move(iniargs)...};
while (q.size()) {
func(q, move(q.front()));
q.pop_front();
}
}
template <class T>
concept sizeable = requires(const T& a) {
size(a);
};
template <class T>
concept unsizeable = !sizeable<T>;
template <sizeable T>
bool checkEmpty(const T& a) {
return size(a) == 0;
}
template <unsizeable T>
bool checkEmpty(const T& a) {
return !a;
}
template <class FAT, typename NodeT = typename FAT::node_type>
map<NodeT*, set<NodeT*>> _closure(FAT& fat) {
map<NodeT*, set<NodeT*>> s;
for (auto& node : fat.nodes) {
auto& si = s[&node];
BFS(
[&](auto& q, NodeT* node) {
si.insert(node);
for (const auto& [key, nnodeptr] : node->to) {
if (!si.count(nnodeptr) && checkEmpty(key)) {
q.push_back(nnodeptr);
si.insert(nnodeptr);
}
}
},
&node);
}
return s;
}
DFA nfa2dfa(const NFA& nfa) {
NDFA ndfa;
{
// Step1 string 2 one char
map<const NFANode*, NDFANode*> m;
auto mget = [&](const NFANode* w) -> NDFANode* {
auto& mi = m[w];
if (mi == NULL) mi = ndfa.newnode();
return mi;
};
for (auto& nnode : nfa.nodes) {
NDFANode* nodeptr = mget(&nnode);
for (const auto& [str, nnodeptr] : nnode.to) {
NDFANode* nxt = mget(nnodeptr);
string_view strn = str;
while (strn.size() > 1) {
NDFANode* nnode = ndfa.newnode();
nnode->to.insert({strn.back(), exchange(nxt, nnode)});
strn.remove_suffix(1);
}
nodeptr->to.insert({strn.size() ? strn.front() : 0, nxt});
}
}
for (const auto& nodeptr : nfa.endset)
if (nodeptr.first && m[nodeptr.first])
ndfa.endset.insert({m[nodeptr.first], nodeptr.second});
ndfa.startnodeptr = m[nfa.startnodeptr];
}
// Step2 subset get DFN
DFA dfa;
auto closure = _closure(ndfa);
map<set<NDFANode*>, DFANode*> ndfa2dfa;
ndfa2dfa[closure[ndfa.startnodeptr]] = dfa.newnode();
BFS(
[&](auto& q, set<NDFANode*> subset) {
set<char> charset;
for (auto& node : subset)
for (const auto& [c, v] : node->to)
if (c) charset.insert(c);
for (char c : charset) {
set<NDFANode*> nsubset;
for (auto snode : subset) {
auto erange = snode->to.equal_range(c);
for (const auto& [c, v] :
ranges::subrange(erange.first, erange.second))
for (auto vc : closure[v]) nsubset.insert(vc);
}
if (!ndfa2dfa[nsubset]) {
ndfa2dfa[nsubset] = dfa.newnode();
q.push_back(nsubset);
}
ndfa2dfa[subset]->to[c] = ndfa2dfa[nsubset];
}
},
closure[ndfa.startnodeptr]);
for (auto& [subset, node] : ndfa2dfa) {
for (auto i : subset)
if (ndfa.endset.count(i)) {
dfa.endset[node].insert(all(ndfa.endset[i]));
}
}
dfa.startnodeptr = ndfa2dfa[closure[ndfa.startnodeptr]];
return dfa;
}
DFA simplify(const DFA& dfa) {
DFA ndfa;
// make set of equivalence point
// and map to ndfa point
map<set<const DFANode*>, DFANode*> si;
// dfa point set to ndfa num
{
// Prepare U
set<const DFANode*> s2;
for (auto& node : dfa.nodes)
if (!dfa.endset.contains(const_cast<DFANode*>(&node))) s2.insert(&node);
set<set<const DFANode*>> subsets{move(s2)};
{
set<const DFANode*> s;
for (const auto& [node, symbs] : dfa.endset) {
s.insert(node);
}
subsets.emplace(s);
}
auto getsetaddr = [&](const DFANode* x) -> const set<const DFANode*>* {
for (auto& subset : subsets)
if (subset.contains(x)) return &subset;
exit(0);
};
BFS(
[&](auto& q, const set<const DFANode*> subset) {
// Prepare charset
set<char> charset;
for (auto nodeptr : subset)
for (const auto& [c, v] : nodeptr->to)
if (c) charset.insert(c);
for (char c : charset) {
map<const set<const DFANode*>*, set<const DFANode*>> cat;
for (auto nodeptr : subset) {
if (auto i = nodeptr->to.find(c); i != nodeptr->to.end()) {
cat[getsetaddr(i->second)].insert(nodeptr);
} else {
cat[NULL].insert(nodeptr);
}
}
// partition
if (cat.size() >= 2) {
subsets.erase(subset);
for (const auto& [sp, s] : cat) {
subsets.insert(s);
q.push_back(move(s));
}
break;
}
}
},
*subsets.begin(), *(++subsets.begin()));
// print("{:{{} }\n}\n",subsets);
for (auto& s : subsets) si[s] = ndfa.newnode();
}
map<const DFANode*, DFANode*> imap; // dfa to ndfa point
for (const auto& [subset, nnodeptr] : si)
for (auto nodeptr : subset) imap[nodeptr] = nnodeptr;
for (const auto& [subset, nnodeptr] : si) {
set<char> charset;
for (auto nodeptr : subset)
for (const auto& [c, v] : nodeptr->to)
if (c) charset.insert(c);
for (char c : charset)
for (auto nodeptr : subset)
if (auto i = nodeptr->to.find(c); i != nodeptr->to.end()) {
nnodeptr->to.insert({c, imap[i->second]});
break;
}
}
for (const auto& [node, symset] : dfa.endset)
ndfa.endset[imap[node]].insert(all(symset));
ndfa.startnodeptr = imap[dfa.startnodeptr];
return ndfa;
}
template <class nodeT>
ostream& operator<<(ostream& os, const FA<nodeT>& fa) {
map<const nodeT*, int> w;
auto gi = [&](const nodeT* p) {
if (!w.count(p)) w[p] = w.size();
return w[p];
};
os << gi(fa.startnodeptr) << endl;
for (auto i : fa.endset) os << gi(i) << " ";
os << endl;
for (auto& node : fa.nodes) {
os << gi(&node) << " ";
for (auto& [key, nnodeptr] : node.to)
os << format("({}):{},", key, gi(nnodeptr));
os << "\n";
}
return os;
}
namespace RegexParse {
using regexExpr = string;
struct Node {
virtual ~Node() {}
};
using NodePtr = unique_ptr<Node>;
template <class T>
concept ReadableNodePtr = is_same_v<decay_t<decltype(*declval<T>())>, Node> &&
is_lvalue_reference_v<decltype(*declval<T>())>;
struct CharNode : public Node {
char c;
CharNode(char c) : c(c) {}
};
struct MulNode : public Node { // *
NodePtr S;
MulNode(NodePtr S) : S(move(S)) {}
};
struct AddNode : public Node { //+
NodePtr L, R;
AddNode(NodePtr L, NodePtr R) : L(move(L)), R(move(R)) {}
};
struct OrNode : public Node { //|
NodePtr L, R;
OrNode(NodePtr L, NodePtr R) : L(move(L)), R(move(R)) {}
};
} // namespace RegexParse
namespace RegexParse::Regex {
using RegexParse::NodePtr;
void printNode(Node* a, int space = 0, ostream& os = cout) {
for (int i = 0; i < space; i++) os << "| ";
if (auto node = dynamic_cast<CharNode*>(a)) {
os << "|--"
<< "char"
<< ":" << node->c << endl;
} else if (auto node = dynamic_cast<MulNode*>(a)) {
os << "|--"
<< "mul" << endl;
printNode(&*node->S, space + 1, os);
} else if (auto node = dynamic_cast<AddNode*>(a)) {
os << "|--"
<< "add" << endl;
printNode(&*node->R, space + 1, os);
printNode(&*node->L, space + 1, os);
} else if (auto node = dynamic_cast<OrNode*>(a)) {
os << "|--"
<< "or" << endl;
printNode(&*node->R, space + 1, os);
printNode(&*node->L, space + 1, os);
} else
assert(false);
}
unique_ptr<Node> to(string_view s) {
vector<unique_ptr<Node>> q;
vector<char> ops;
auto apply_op = [&](char op) {
if (op == '*') {
auto R = move(q.back());
q.pop_back();
q.emplace_back(new MulNode{move(R)});
} else if (op == '|' || op == '+') {
auto R = move(q.back());
q.pop_back();
auto L = move(q.back());
q.pop_back();
if (op == '+') q.emplace_back(new AddNode{move(L), move(R)});
if (op == '|') q.emplace_back(new OrNode{move(L), move(R)});
} else {
assert(false);
}
};
bool lstisAlpha = false;
for (auto ii = s.begin(); ii != s.end(); ii++) {
#define i (*ii)
lstisAlpha = [&]() {
if (i == '*') {
apply_op('*');
return true;
} else if (i == '|') {
while (ops.size() && ops.back() == '+') {
apply_op('+');
ops.pop_back();
}
ops.push_back('|');
return false;
} else if (i == '(') {
if (lstisAlpha) ops.push_back('+');
ops.push_back(i);
return false;
} else if (i == ')') {
for (char c; (c = ops.back()) != '('; ops.pop_back()) {
apply_op(c);
}
ops.pop_back();
return true;
} else if (i == '.') {
if (lstisAlpha) ops.push_back('+');
NodePtr p{new CharNode(1)};
for (char c = 2; c >= 0; c++) {
p = NodePtr{new OrNode{move(p), NodePtr{new CharNode(c)}}};
}
q.emplace_back(move(p));
return true;
} else if (i == '[') {
if (lstisAlpha) ops.push_back('+');
NodePtr p{new CharNode(1)};
advance(ii, 1);
for (char lc; i != ']'; advance(ii, 1)) {
if (i == '-') {
advance(ii, 1);
for (char c = lc + 1; c <= i; c++) {
p = NodePtr{new OrNode{move(p), NodePtr{new CharNode(c)}}};
}
} else {
p = NodePtr{new OrNode{move(p), NodePtr{new CharNode(i)}}};
}
lc = i;
}
q.emplace_back(move(p));
return true;
} else if (i == '$') {
if (lstisAlpha) ops.push_back('+');
q.emplace_back(new CharNode(0));
return true;
} else if (i == '{') {
if (lstisAlpha) ops.push_back('+');
NodePtr p{new CharNode(2)};
bitset<128> b{};
advance(ii, 1);
for (char lc; i != '}'; advance(ii, 1)) {
if (i == '-') {
advance(ii, 1);
for (char c = lc + 1; c <= i; c++) {
b[c] = 1;
}
} else {
b[i] = 1;
}
lc = i;
}
for (char c = 3; c >= 0; c++)
if (!b[c]) {
p = NodePtr{new OrNode{move(p), NodePtr{new CharNode(c)}}};
}
q.emplace_back(move(p));
return true;
} else {
if (i == '\\') advance(ii, 1);
if (lstisAlpha) {
ops.push_back('+');
}
q.emplace_back(new CharNode(i));
return true;
}
}();
#undef i
}
while (ops.size()) {
apply_op(ops.back());
ops.pop_back();
}
return move(q.front());
}
// G,start vertex,end vertex
template <ReadableNodePtr Ptr>
NFA buildNFA(Ptr&& aa) {
Node* a = &*aa;
if (auto node = dynamic_cast<CharNode*>(a)) {
NFA nfa;
NFANode *l = nfa.newnode(), *r = nfa.newnode();
nfa.startnodeptr = l;
nfa.endset = {{r, {0}}};
l->to.insert({string{node->c}, r});
return nfa;
} else if (auto node = dynamic_cast<MulNode*>(a)) {
auto nfa = buildNFA(&*node->S);
NFANode *start = nfa.startnodeptr, *end = nfa.endset.begin()->first;
NFANode *nstart = nfa.newnode(), *nend = nfa.newnode();
nfa.startnodeptr = nstart;
nfa.endset = {{nend, {0}}};
nstart->to.insert({"", start});
end->to.insert({"", nend});
nstart->to.insert({"", nend});
nend->to.insert({"", nstart});
return nfa;
} else if (auto node = dynamic_cast<AddNode*>(a)) {
NFA nfa;
auto lnfa = buildNFA(&*node->L);
auto rnfa = buildNFA(&*node->R);
NFANode* lend = lnfa.endset.begin()->first;
NFANode* rend = rnfa.endset.begin()->first;
nfa.nodes.splice(nfa.nodes.end(), move(lnfa.nodes));
nfa.nodes.splice(nfa.nodes.end(), move(rnfa.nodes));
nfa.startnodeptr = lnfa.startnodeptr;
nfa.endset = {{rend, {0}}};
lend->to.insert({"", rnfa.startnodeptr});
return nfa;
} else if (auto node = dynamic_cast<OrNode*>(a)) {
NFA nfa;
auto lnfa = buildNFA(&*node->L);
auto rnfa = buildNFA(&*node->R);
NFANode* lend = lnfa.endset.begin()->first;
NFANode* rend = rnfa.endset.begin()->first;
nfa.nodes.splice(nfa.nodes.end(), move(lnfa.nodes));
nfa.nodes.splice(nfa.nodes.end(), move(rnfa.nodes));
auto nfabegin = nfa.newnode(), nfaend = nfa.newnode();
nfabegin->to.insert({"", lnfa.startnodeptr});
nfabegin->to.insert({"", rnfa.startnodeptr});
lend->to.insert({"", nfaend});
rend->to.insert({"", nfaend});
nfa.startnodeptr = nfabegin;
nfa.endset = {{nfaend, {0}}};
return nfa;
} else
assert(false);
}
} // namespace RegexParse::Regex
using namespace RegexParse::Regex;
bool match(const DFA& dfa, string_view s) {
const DFANode* nodeptr = dfa.startnodeptr;
while (s.size()) {
char c = s.front();
if (auto nodeptrptr = nodeptr->to.find(c);
nodeptrptr != nodeptr->to.end()) {
nodeptr = nodeptrptr->second;
} else {
return false;
}
s.remove_prefix(1);
}
return dfa.endset.contains(const_cast<DFANode*>(nodeptr));
}
optional<const set<int>> match_first(const DFA& dfa, string_view& s) {
const DFANode* nodeptr = dfa.startnodeptr;
const set<int>* sp = NULL;
while (s.size()) {
char c = s.front();
if (auto nodeptrptr = nodeptr->to.find(c);
nodeptrptr != nodeptr->to.end()) {
nodeptr = nodeptrptr->second;
} else {
break;
}
if (auto ans = dfa.endset.find(const_cast<DFANode*>(nodeptr));
ans != dfa.endset.end())
sp = &ans->second;
s.remove_prefix(1);
}
if (sp) return *sp;
return {};
}
int main() {
string str;
getline(cin, str);
NodePtr ptr = to(str);
printNode(&*ptr, 0, cerr);
NFA nfa = buildNFA(&*ptr);
cerr << nfa << endl << endl;
auto dfa = nfa2dfa(nfa);
cerr << dfa << endl << endl;
auto sdfa = simplify(dfa);
regex txt_regex{str};
while (getline(cin, str) && cin) {
bool f;
assert((f = match(dfa, str)) == match(sdfa, str) &&
f == regex_match(str, txt_regex));
cout << boolalpha << f << endl;
}
}
/*
(b*|a|c*)*|def
(b|ab)*
abcd
(a|b)*(aa|bb)(a|b)*
a(a|b)*
a(ab)*
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment