Created
November 30, 2016 02:28
-
-
Save daniel-j-h/35f2098804f660cb9a559e7d3c1311e9 to your computer and use it in GitHub Desktop.
Simple Wavelet Tree Experiments - https://en.wikipedia.org/wiki/Wavelet_Tree
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <vector> | |
#include <iterator> | |
#include <algorithm> | |
#include <unordered_set> | |
#include <iostream> | |
#include <ostream> | |
#include <sstream> | |
#include <mapbox/variant.hpp> | |
#include <boost/dynamic_bitset.hpp> | |
namespace detail { | |
struct Empty { }; | |
template <typename Symbol> | |
struct Leaf { | |
Symbol symbol; | |
}; | |
template <typename Symbol> | |
struct Node; | |
template <typename Symbol> | |
using TreeSumTy = mapbox::util::variant<Empty | |
, Leaf<Symbol> | |
, mapbox::util::recursive_wrapper<Node<Symbol>>>; | |
} // ns detail | |
template <typename Symbol> | |
struct WaveletTree : detail::TreeSumTy<Symbol> { | |
using detail::TreeSumTy<Symbol>::TreeSumTy; | |
using Empty = detail::Empty; | |
using Leaf = detail::Leaf<Symbol>; | |
using Node = detail::Node<Symbol>; | |
template <typename Iter> | |
static WaveletTree Encode(Iter first, Iter last); | |
template <typename Range> | |
static WaveletTree Encode(const Range& range); | |
}; | |
namespace detail { | |
template <typename Symbol> | |
struct Node { | |
using Symbols = std::basic_string<Symbol>; | |
using BitVector = boost::dynamic_bitset<std::uint64_t>; | |
Symbols input; // for visualization | |
BitVector subsets; | |
WaveletTree<Symbol> left; | |
WaveletTree<Symbol> right; | |
}; | |
} // ns detail | |
template <typename Symbol> | |
template <typename Iter> | |
WaveletTree<Symbol> WaveletTree<Symbol>::Encode(Iter first, Iter last) { | |
if (first == last) | |
return Empty{}; | |
if (std::all_of(first, last, [v = *first] (auto each) { return each == v; })) | |
return Leaf{*first}; | |
typename Node::Symbols symbols(first, last); | |
std::sort(begin(symbols), end(symbols)); | |
const auto it = std::unique(begin(symbols), end(symbols)); | |
symbols.erase(it, std::end(symbols)); | |
const auto n = symbols.size(); | |
using Set = std::unordered_set<Symbol>; | |
Set leftSymbols(begin(symbols), begin(symbols) + (n / 2)); | |
typename Node::BitVector subsets; | |
typename Node::Symbols left, right; | |
std::for_each(first, last, [&] (auto each) { | |
if (leftSymbols.count(each) > 0) { | |
subsets.push_back(false); | |
left.push_back(each); | |
} else { | |
subsets.push_back(true); | |
right.push_back(each); | |
} | |
}); | |
return Node{{first, last}, subsets, Encode(left), Encode(right)}; // recursive, rework | |
} | |
template <typename Symbol> | |
template <typename Range> | |
WaveletTree<Symbol> WaveletTree<Symbol>::Encode(const Range& range) { | |
using std::begin; | |
using std::end; | |
return Encode(begin(range), end(range)); | |
} | |
template <typename Symbol> | |
struct WaveletTreePrinter { | |
std::ostream& out; | |
std::size_t indent; | |
void operator()(const typename WaveletTree<Symbol>::Empty&) { | |
if (indent > 0) | |
out << std::string(indent, ' ') << "- "; | |
out << "<Empty>\n"; | |
} | |
void operator()(const typename WaveletTree<Symbol>::Leaf& leaf) { | |
if (indent > 0) | |
out << std::string(indent, ' ') << "- "; | |
out << "<Leaf: '" << leaf.symbol << "'>\n"; | |
} | |
void operator()(const typename WaveletTree<Symbol>::Node& node) { | |
if (indent > 0) | |
out << std::string(indent, ' ') << "- "; | |
const auto bits = [] (const auto& bitvec) { | |
std::stringstream stream; | |
stream << bitvec; | |
auto fmt = stream.str(); | |
std::reverse(begin(fmt), end(fmt)); | |
return fmt; | |
}; | |
out << "<Node: '" << node.input << "' : '" << bits(node.subsets) << "'>\n"; | |
Show(node.left, out, indent > 0 ? indent + 4 : 2); | |
Show(node.right, out, indent > 0 ? indent + 4 : 2); | |
} | |
}; | |
template <typename Symbol> | |
void Show(const WaveletTree<Symbol>& wt, std::ostream& out = std::cout, std::size_t indent = 0) { | |
WaveletTreePrinter<Symbol> printer{out, indent}; | |
apply_visitor(printer, wt); | |
} | |
int main() { | |
using Symbol = char; | |
using Wt = WaveletTree<Symbol>; | |
std::string input{"abracadabra"}; | |
const auto root = Wt::Encode(input); | |
Show(root); | |
} | |
// <Node: 'abracadabra' : '00101010010'> | |
// - <Node: 'abaaaba' : '0100010'> | |
// - <Leaf: 'a'> | |
// - <Leaf: 'b'> | |
// - <Node: 'rcdr' : '1011'> | |
// - <Leaf: 'c'> | |
// - <Node: 'rdr' : '101'> | |
// - <Leaf: 'd'> | |
// - <Leaf: 'r'> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This was a first try at a very high-level non-functional Wavelet tree construction. A more detailed explanation can be found in the literature. Some introductory material is available in these blog posts:
Side node: using Mapbox.Variant since Boost.Variant generates an ICE with GCC 6.2 and Boost 1.62 :sadpanda: