Skip to content

Instantly share code, notes, and snippets.

@unixod
Last active June 30, 2019 12:46
Show Gist options
  • Save unixod/368a70f0235a596d9f62b8f545512915 to your computer and use it in GitHub Desktop.
Save unixod/368a70f0235a596d9f62b8f545512915 to your computer and use it in GitHub Desktop.
NFA to DFA (a part of not yet published library). For the entry point see a function makeDfa at the end of the file.
/*
* Copyright (c) 2019 Eldar Zakirov
*
* Dear Reader,
*
* This software can be freely used for any purposes that don't go against
* norms of Islam. Hence, this software shouldn't be used to implement, fix
* or improve the solutions for riba-based banks and other organizations
* that are prohibited in Islam.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#ifndef RE2MEM_NFA_TO_DFA_H
#define RE2MEM_NFA_TO_DFA_H
#include <map>
#include <set>
#include <queue>
#include <functional>
#include "re2mem/nfa.h"
#include "re2mem/definitions.h"
namespace re2mem {
using NfaStateIdx = std::size_t;
// A single byte symbol decoder. Consumes one re2mem::Symbol
// and produces corresponding std::byte.
struct SingleByteSymbolDecoder {
using SymbolType = std::byte;
using ConsumedNumberOfConsecutiveStates = std::size_t;
static std::pair<SymbolType, ConsumedNumberOfConsecutiveStates>
decode(const Nfa& nfa, NfaStateIdx i)
{
assert(i < nfa.size());
return {std::get<re2mem::Symbol>(nfa[i]).ch, 1};
}
};
/// A lightweight reference to a subset of NFA states.
template<typename SymbolDecoder>
struct NfaStatesSubsetRef {
using SymbolType = typename SymbolDecoder::SymbolType;
NfaStatesSubsetRef(const Nfa& nfa, std::vector<NfaStateIdx> statesSubset = {})
: nfa_{nfa}
{
// If a set of states explicitly isn't specified, then construct
// the start set subset (epsilon closure over start set).
if (!nfa.empty() && statesSubset.empty()) {
statesSubset.push_back(0);
}
assert((nfa.empty() || !statesSubset.empty()) &&
"If NFA is empty, then the set of sets must be empty too"
"because only valid states are accepted by the constructor.");
std::vector<std::byte> auxData;
auxData.reserve(sizeof(std::size_t));
std::set<NfaStateIdx> visitedStates;
for (const auto& i : statesSubset) {
assert((i <= nfa.size()) &&
"NfaStatesSubsetRef must be initialized with set of valid states."
"If i == nfa.size() then i is final state, despite the fact that"
"physically it isn't present in nfa.");
// Skip already processed states.
if (visitedStates.insert(i).second == false) {
continue;
}
std::queue<NfaStateIdx> statesToVisit;
statesToVisit.push(i);
while (!statesToVisit.empty()) {
auto s = statesToVisit.front();
statesToVisit.pop();
// If state is final skip its further processing, just mark
// that currently built subset contains a final state. This
// mark is used in the informative public method 'isAnyFinal'.
if (s == nfa.size()) {
someOfTheStatesIsFinal_ = true;
continue;
}
assert(s < nfa.size());
utils::match(nfa[s],
[s, this](const re2mem::Symbol&) {
states_.push_back(s);
},
[](const re2mem::Range&) {
throw std::runtime_error{INTERNAL_ERROR "Not yet implemented."};
},
[s, &auxData, &visitedStates, &statesToVisit](re2mem::Alt<re2mem::Direction::Bakward> a) {
auto [_, notVisited] = visitedStates.insert(s+1);
if (notVisited) {
statesToVisit.push(s+1);
auxData.push_back(a.next);
auto offset = re2mem::utils::deserializeInteger(auxData);
auxData.clear();
statesToVisit.push(s-offset);
}
},
[s, &auxData, &visitedStates, &statesToVisit](re2mem::Alt<re2mem::Direction::Forward> a) {
auto [_, notVisited] = visitedStates.insert(s+1);
if (notVisited) {
statesToVisit.push(s+1);
auxData.push_back(a.next);
auto offset = re2mem::utils::deserializeInteger(auxData);
auxData.clear();
statesToVisit.push(s+offset);
}
},
[s, &auxData, &visitedStates, &statesToVisit](re2mem::Jump<re2mem::Direction::Bakward> j) {
auto [_, notVisited] = visitedStates.insert(s+1);
if (notVisited) {
auxData.push_back(j.next);
auto offset = re2mem::utils::deserializeInteger(auxData);
auxData.clear();
statesToVisit.push(s-offset);
}
},
[s, &auxData, &visitedStates, &statesToVisit](re2mem::Jump<re2mem::Direction::Forward> j) {
auto [_, notVisited] = visitedStates.insert(s+1);
if (notVisited) {
auxData.push_back(j.next);
auto offset = re2mem::utils::deserializeInteger(auxData);
auxData.clear();
statesToVisit.push(s+offset);
}
},
[&auxData](re2mem::Aux aux) {
auxData.push_back(aux.b);
}
);
}
}
}
std::map<SymbolType, NfaStatesSubsetRef> outgoingEdges() const
{
// Form an auxiliary map holding a set of outgoing edges.
std::map<SymbolType, std::vector<NfaStateIdx>> aux;
for (const auto& s : states_) {
assert(s < nfa_.size() &&
"All states must be valid (i.e. belong to NFA). The final state, due to "
"its ephemeral nature is represented by someOfTheStatesIsFinal_ flag.");
assert(std::holds_alternative<re2mem::Symbol>(nfa_[s]) &&
"Due to epsilon closure searching at construction time, categories of all"
"states must be re2mem::Symbol.");
// Read a symbol of transition.
auto [symbol, numberOfConsumedStates] = SymbolDecoder::decode(nfa_, s);
assert(numberOfConsumedStates > 0);
// Get an index of target state.
const auto t = s + numberOfConsumedStates;
aux[symbol].push_back(t);
}
// Form a result by simply taking a map gotten at previous step
// and turning its second element into an instance of NfaStatesSubsetRef.
std::map<SymbolType, NfaStatesSubsetRef> result;
for (auto& [s, v] : aux) {
[[maybe_unused]] auto [_, inserted] =
result.emplace(s, NfaStatesSubsetRef{nfa_, std::move(v)});
assert(inserted);
}
return result;
}
/// Check if any of references states is final.
bool isAnyFinal() const
{
return someOfTheStatesIsFinal_;
}
// For serving as a key in std::map.
bool operator < (const NfaStatesSubsetRef& oth) const noexcept
{
assert((&nfa_ == &oth.nfa_) && "The sebsets must belong to the same NFA.");
return states_ < oth.states_;
}
private:
const Nfa& nfa_;
std::vector<NfaStateIdx> states_; // Synthetic states aren't stored.
bool someOfTheStatesIsFinal_ = false;
};
template<typename SymbolDecoder>
inline NfaStatesSubsetRef<SymbolDecoder>& epsilonClosure(NfaStatesSubsetRef<SymbolDecoder>& statesSubset)
{
// No explicit searching of epsilon closure for generic version of
// NfaStatesSubsetRef is required. This is drawn by the fact that the
// searching of epsilone closure is done in constructor of the class.
return statesSubset;
}
/// @name DFA
/// @{
/// A reference to a DFA state.
using DfaStateIdx = std::size_t;
/// A DFA state.
template<typename Symbol>
struct DfaState {
std::map<Symbol, DfaStateIdx> transitions;
bool isFinal = false;
};
/// A DFA
template<typename Symbol>
using Dfa = std::vector<DfaState<Symbol>>;
/// @}
/// A DFA builder.
template<typename NfaStatesSubset>
struct DfaBuilder {
using SymbolType = typename NfaStatesSubset::SymbolType;
/// A lightweight reference to a certain DFA state.
class DfaStateRef {
public:
DfaStateRef(Dfa<SymbolType>& dfa, DfaStateIdx& dfaStateIdx, const NfaStatesSubset& nfaStatesSubsetRef) noexcept
: dfa_{dfa}, dfaStateIdx_{dfaStateIdx}, nfaStatesSubsetRef_{nfaStatesSubsetRef}
{}
// Prevent putting rvalues.
DfaStateRef(DfaStateIdx&& dfaStateRef, NfaStatesSubset&& nfaStatesSubsetRef) = delete;
/// Return a subset of NFA states corresponding to this DFA state.
const NfaStatesSubset& nfaStatesSubset() const noexcept
{
return nfaStatesSubsetRef_;
}
/// Add to a DFA a transition on @a s from this state to @a target state.
void addTransition(SymbolType s, DfaStateRef target)
{
assert((dfaStateIdx_ < dfa_.size()) && "The source state must exist in DFA.");
assert((target.dfaStateIdx_ < dfa_.size()) && "The source state must exist in DFA.");
assert((&dfa_ == &target.dfa_) && "Both source and target states must belong to the same DFA.");
dfa_[dfaStateIdx_].transitions[s] = target.dfaStateIdx_;
}
private:
Dfa<SymbolType>& dfa_;
DfaStateIdx& dfaStateIdx_;
const NfaStatesSubset& nfaStatesSubsetRef_;
};
/// Get a DFA state for specified subset of NFA states. If the DFA
/// doesn't contain the requested DFA state, it is first created
/// and then returned.
///
/// In case of a creation, if any state within @a nfaStatesSubset
/// is final, then the created DFA state is set to be final too.
///
/// @return A pair consisting of a requested DFA state and a bool
/// denoting whether of not the state is brand new (wasn't existed
/// before).
std::pair<DfaStateRef, bool> getOrAddDfaStateFor(NfaStatesSubset nfaStatesSubset)
{
const DfaStateIdx nextNonExistentState = dfa_.size();
auto [i, inserted] = m_.emplace(nfaStatesSubset, nextNonExistentState);
if (inserted) {
dfa_.emplace_back(DfaState<SymbolType>{});
dfa_.back().isFinal = i->first.isAnyFinal();
}
return {DfaStateRef{dfa_, i->second, i->first}, inserted};
}
/// Get resulting DFA.
Dfa<SymbolType> getDfa()
{
dfa_.shrink_to_fit();
return dfa_;
}
private:
/// A DFA
Dfa<SymbolType> dfa_;
/// An map between NFA states subsets and corresponding DFA states.
///
/// @note Since instances of DfaStateRef internaly store references
/// to elements of container, the container type must ensure the
/// validity of references to its elements in case of adding new
/// elements to the container.
///
/// @todo Think about using std::unordered_map.
std::map<NfaStatesSubset, DfaStateIdx> m_;
};
template<typename NfaStatesSubset>
inline auto makeDfa(NfaStatesSubset nfaStartState)
{
/// A builder of resulting DFA.
DfaBuilder<NfaStatesSubset> dfaBuilder;
/// A work list.
std::queue<typename decltype(dfaBuilder)::DfaStateRef> queue;
auto [dfaStartState, _] = dfaBuilder.getOrAddDfaStateFor(epsilonClosure(nfaStartState));
queue.push(dfaStartState);
while (!queue.empty()) {
auto dfaState = std::move(queue.front());
queue.pop();
for (auto [symbol, nfaTargetStates] : dfaState.nfaStatesSubset().outgoingEdges()) {
auto [dfaTargetState, created] = dfaBuilder.getOrAddDfaStateFor(epsilonClosure(nfaTargetStates));
dfaState.addTransition(symbol, dfaTargetState);
if (created) {
queue.push(dfaTargetState);
}
}
}
return dfaBuilder.getDfa();
}
} // namespace re2mem
#endif // RE2MEM_NFA_TO_DFA_H
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment