Skip to content

Instantly share code, notes, and snippets.

@daniel-j-h
Created November 30, 2016 02:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daniel-j-h/35f2098804f660cb9a559e7d3c1311e9 to your computer and use it in GitHub Desktop.
Save daniel-j-h/35f2098804f660cb9a559e7d3c1311e9 to your computer and use it in GitHub Desktop.
Simple Wavelet Tree Experiments - https://en.wikipedia.org/wiki/Wavelet_Tree
#include <string>
#include <vector>
#include <iterator>
#include <algorithm>
#include <unordered_set>
#include <iostream>
#include <ostream>
#include <sstream>
#include <mapbox/variant.hpp>
#include <boost/dynamic_bitset.hpp>
namespace detail {
struct Empty { };
template <typename Symbol>
struct Leaf {
Symbol symbol;
};
template <typename Symbol>
struct Node;
template <typename Symbol>
using TreeSumTy = mapbox::util::variant<Empty
, Leaf<Symbol>
, mapbox::util::recursive_wrapper<Node<Symbol>>>;
} // ns detail
template <typename Symbol>
struct WaveletTree : detail::TreeSumTy<Symbol> {
using detail::TreeSumTy<Symbol>::TreeSumTy;
using Empty = detail::Empty;
using Leaf = detail::Leaf<Symbol>;
using Node = detail::Node<Symbol>;
template <typename Iter>
static WaveletTree Encode(Iter first, Iter last);
template <typename Range>
static WaveletTree Encode(const Range& range);
};
namespace detail {
template <typename Symbol>
struct Node {
using Symbols = std::basic_string<Symbol>;
using BitVector = boost::dynamic_bitset<std::uint64_t>;
Symbols input; // for visualization
BitVector subsets;
WaveletTree<Symbol> left;
WaveletTree<Symbol> right;
};
} // ns detail
template <typename Symbol>
template <typename Iter>
WaveletTree<Symbol> WaveletTree<Symbol>::Encode(Iter first, Iter last) {
if (first == last)
return Empty{};
if (std::all_of(first, last, [v = *first] (auto each) { return each == v; }))
return Leaf{*first};
typename Node::Symbols symbols(first, last);
std::sort(begin(symbols), end(symbols));
const auto it = std::unique(begin(symbols), end(symbols));
symbols.erase(it, std::end(symbols));
const auto n = symbols.size();
using Set = std::unordered_set<Symbol>;
Set leftSymbols(begin(symbols), begin(symbols) + (n / 2));
typename Node::BitVector subsets;
typename Node::Symbols left, right;
std::for_each(first, last, [&] (auto each) {
if (leftSymbols.count(each) > 0) {
subsets.push_back(false);
left.push_back(each);
} else {
subsets.push_back(true);
right.push_back(each);
}
});
return Node{{first, last}, subsets, Encode(left), Encode(right)}; // recursive, rework
}
template <typename Symbol>
template <typename Range>
WaveletTree<Symbol> WaveletTree<Symbol>::Encode(const Range& range) {
using std::begin;
using std::end;
return Encode(begin(range), end(range));
}
template <typename Symbol>
struct WaveletTreePrinter {
std::ostream& out;
std::size_t indent;
void operator()(const typename WaveletTree<Symbol>::Empty&) {
if (indent > 0)
out << std::string(indent, ' ') << "- ";
out << "<Empty>\n";
}
void operator()(const typename WaveletTree<Symbol>::Leaf& leaf) {
if (indent > 0)
out << std::string(indent, ' ') << "- ";
out << "<Leaf: '" << leaf.symbol << "'>\n";
}
void operator()(const typename WaveletTree<Symbol>::Node& node) {
if (indent > 0)
out << std::string(indent, ' ') << "- ";
const auto bits = [] (const auto& bitvec) {
std::stringstream stream;
stream << bitvec;
auto fmt = stream.str();
std::reverse(begin(fmt), end(fmt));
return fmt;
};
out << "<Node: '" << node.input << "' : '" << bits(node.subsets) << "'>\n";
Show(node.left, out, indent > 0 ? indent + 4 : 2);
Show(node.right, out, indent > 0 ? indent + 4 : 2);
}
};
template <typename Symbol>
void Show(const WaveletTree<Symbol>& wt, std::ostream& out = std::cout, std::size_t indent = 0) {
WaveletTreePrinter<Symbol> printer{out, indent};
apply_visitor(printer, wt);
}
int main() {
using Symbol = char;
using Wt = WaveletTree<Symbol>;
std::string input{"abracadabra"};
const auto root = Wt::Encode(input);
Show(root);
}
// <Node: 'abracadabra' : '00101010010'>
// - <Node: 'abaaaba' : '0100010'>
// - <Leaf: 'a'>
// - <Leaf: 'b'>
// - <Node: 'rcdr' : '1011'>
// - <Leaf: 'c'>
// - <Node: 'rdr' : '101'>
// - <Leaf: 'd'>
// - <Leaf: 'r'>
@daniel-j-h
Copy link
Author

This was a first try at a very high-level non-functional Wavelet tree construction. A more detailed explanation can be found in the literature. Some introductory material is available in these blog posts:

Side node: using Mapbox.Variant since Boost.Variant generates an ICE with GCC 6.2 and Boost 1.62 :sadpanda:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment