Skip to content

Instantly share code, notes, and snippets.

@cdglove
Last active August 29, 2015 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cdglove/20e8b7034fabcf627cab to your computer and use it in GitHub Desktop.
Save cdglove/20e8b7034fabcf627cab to your computer and use it in GitHub Desktop.
// token.cpp
// Test harness for tokenizing
// g++ -I$BOOST_ROOT -O3 -std=c++11
#include <iostream>
#include <set>
#include <string>
#include <sstream>
#include <functional>
#include <boost/tokenizer.hpp>
#include <boost/range/algorithm/transform.hpp>
#include <boost/range/algorithm/find_if.hpp>
#include <boost/range/algorithm/copy.hpp>
#include <boost/algorithm/string/trim_all.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/xpressive/xpressive.hpp>
#include <cassert>
#include <iterator>
#include <regex>
void tokenize_csv(std::string raw_csv, std::set<std::string>& dest_set)
{
boost::tokenizer<boost::escaped_list_separator<char>> value_list(raw_csv);
boost::transform(
value_list,
std::inserter(dest_set, dest_set.begin()),
std::bind(boost::trim_all_copy<std::string>, std::placeholders::_1, std::locale())
);
dest_set.erase("");
}
void tokenize_csv_opt(std::string raw_csv, std::set<std::string>& dest_set)
{
boost::char_separator<char> sep(" \n\r\t,");
boost::tokenizer<boost::char_separator<char>> value_list(raw_csv, sep);
dest_set.insert(value_list.begin(), value_list.end());
}
// Hand-written CSV splitter: cuts `raw_csv` at every ',' and inserts each
// field, stripped of leading and trailing whitespace (space, CR, LF, tab),
// into `dest_set`. Fields that are empty or whitespace-only are skipped.
// Whitespace inside a field is left untouched.
//
// raw_csv   The comma-separated text to parse.
// dest_set  Receives the trimmed, non-empty fields.
void parse_csv(std::string raw_csv, std::set<std::string>& dest_set)
{
    static char const* const kBlanks = " \r\n\t";
    const std::string::size_type total = raw_csv.size();

    std::string::size_type field_begin = 0;
    while (field_begin < total)
    {
        // The field spans [field_begin, field_end); the last field runs to
        // the end of the input when no further comma exists.
        std::string::size_type field_end = raw_csv.find(',', field_begin);
        if (field_end == std::string::npos)
        {
            field_end = total;
        }

        // The search may run past the field (the comma itself is not blank),
        // so a hit at or beyond field_end means the field has no content.
        const std::string::size_type first =
            raw_csv.find_first_not_of(kBlanks, field_begin);
        if (first != std::string::npos && first < field_end)
        {
            // A non-blank character exists inside the field, so the backward
            // search from its last position cannot land before `first`.
            const std::string::size_type last =
                raw_csv.find_last_not_of(kBlanks, field_end - 1);
            dest_set.insert(raw_csv.substr(first, last - first + 1));
        }

        field_begin = field_end + 1;
    }
}
void split_csv(std::string raw_csv, std::set<std::string>& dest_set)
{
std::vector<boost::iterator_range<std::string::iterator>> result;
boost::algorithm::split(result, raw_csv, [](char c)
{
return c == ',';
});
boost::transform(result, std::inserter(dest_set, dest_set.begin()),
[](boost::iterator_range<std::string::iterator> r)
{
auto begin = r.begin();
auto end = r.end();
while(begin != end && std::isspace(*begin))
++begin;
while(begin != end && std::isspace(*(end - 1)))
--end;
return std::string(begin, end);
}
);
dest_set.erase("");
}
void xpressive_csv(std::string raw_csv, std::set<std::string>& dest_set)
{
using namespace boost::xpressive;
sregex re = +(+alnum);
std::transform(
sregex_iterator(raw_csv.begin(), raw_csv.end(), re),
sregex_iterator(),
std::inserter(dest_set, dest_set.begin()),
[](smatch const& match)
{
return match[0];
}
);
}
// Extracts values from `raw_csv` with std::regex: every match of `\w+` is
// inserted into `dest_set`. Anything the pattern does not match (commas,
// whitespace, punctuation) acts as a separator.
//
// raw_csv   The text to scan.
// dest_set  Receives the full text of each match.
void regex_csv(std::string raw_csv, std::set<std::string>& dest_set)
{
    const std::regex word_pattern("\\w+");
    const std::sregex_iterator no_more_matches;

    for (std::sregex_iterator hit(raw_csv.begin(), raw_csv.end(), word_pattern);
         hit != no_more_matches;
         ++hit)
    {
        dest_set.insert(hit->str());
    }
}
// Prints the command-line synopsis to standard output. The listed modes are
// exactly the strings that main() compares its argument against.
void print_usage()
{
    std::cout << "Usage: token <mode> [tokenize, tokenize-opt, parse, split, xpress, regex]" << '\n';
}
// Number of integer values written into the generated CSV test input.
static const int kNumValues = 10000000;

// Benchmark driver.
//
// Expects exactly one argument naming the splitting strategy to run. It then
// builds a CSV string holding the integers [0, kNumValues), each followed by
// a mix of whitespace and a comma, runs the chosen strategy over it and
// reports how many distinct values were collected.
//
// Returns 0 after a successful run and 1 when the argument count is wrong or
// the mode is not recognized (the usage text is printed in both cases).
int main(int argc, char** argv)
{
    // One row per supported mode: the command-line name, the function that
    // implements it and the labels used in the progress messages.
    struct Mode
    {
        char const* name;
        void (*run)(std::string, std::set<std::string>&);
        char const* starting;
        char const* finished;
    };
    static const Mode kModes[] = {
        {"tokenize",     tokenize_csv,     "Tokenizing ", "Tokenized "},
        {"tokenize-opt", tokenize_csv_opt, "Tokenizing ", "Tokenized "},
        {"parse",        parse_csv,        "Parsing ",    "Parsed "},
        {"split",        split_csv,        "Parsing ",    "Parsed "},
        {"xpress",       xpressive_csv,    "xpressing ",  "xpressed "},
        {"regex",        regex_csv,        "Matching ",   "Matched "},
    };

    if (argc != 2)
    {
        print_usage();
        return 1;
    }

    // Resolve the mode before generating any data so that a typo does not
    // cost the time and memory of building the full input first.
    const std::string mode = argv[1];
    const Mode* selected = nullptr;
    for (const Mode& candidate : kModes)
    {
        if (mode == candidate.name)
        {
            selected = &candidate;
            break;
        }
    }
    if (selected == nullptr)
    {
        print_usage();
        return 1;
    }

    // Generate some test data.
    std::stringstream csv;
    for (int i = 0; i < kNumValues; ++i)
    {
        csv << i << "\t,\n\t ";
    }

    std::set<std::string> values;
    std::cout << selected->starting << kNumValues << " csv values.\n";
    selected->run(csv.str(), values);
    std::cout << selected->finished << values.size() << " values.\n";

    // Debugging aid: when kNumValues is temporarily set to 10, dump the
    // collected values, one per line, for a visual check.
    if (kNumValues == 10)
    {
        boost::copy(values, std::ostream_iterator<std::string>(std::cout, "\n"));
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment