-
-
Save cdglove/20e8b7034fabcf627cab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// token.cpp
// Test harness for tokenizing
// g++ -I$BOOST_ROOT -O3 -std=c++11
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <iterator>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/trim_all.hpp>
#include <boost/range/algorithm/copy.hpp>
#include <boost/range/algorithm/find_if.hpp>
#include <boost/range/algorithm/transform.hpp>
#include <boost/tokenizer.hpp>
#include <boost/xpressive/xpressive.hpp>
void tokenize_csv(std::string raw_csv, std::set<std::string>& dest_set) | |
{ | |
boost::tokenizer<boost::escaped_list_separator<char>> value_list(raw_csv); | |
boost::transform( | |
value_list, | |
std::inserter(dest_set, dest_set.begin()), | |
std::bind(boost::trim_all_copy<std::string>, std::placeholders::_1, std::locale()) | |
); | |
dest_set.erase(""); | |
} | |
void tokenize_csv_opt(std::string raw_csv, std::set<std::string>& dest_set) | |
{ | |
boost::char_separator<char> sep(" \n\r\t,"); | |
boost::tokenizer<boost::char_separator<char>> value_list(raw_csv, sep); | |
dest_set.insert(value_list.begin(), value_list.end()); | |
} | |
// Extracts the distinct, non-empty values of a comma-separated string using
// only std::string searches.
//
// raw_csv:  the text to split on ','; taken by value.
// dest_set: receives each field with leading/trailing spaces, tabs, carriage
//           returns and newlines removed. Fields that are empty or consist
//           only of that whitespace are skipped. Existing contents are kept.
void parse_csv(std::string raw_csv, std::set<std::string>& dest_set)
{
    const char* const whitespace = " \r\n\t";
    const std::size_t length = raw_csv.size();

    std::size_t field_begin = 0;
    while (field_begin < length)
    {
        // The field runs up to the next comma, or to the end of the input.
        std::size_t field_end = raw_csv.find(',', field_begin);
        if (field_end == std::string::npos)
            field_end = length;

        // Trim inside raw_csv itself so that no temporary per-field string is
        // allocated. A comma is not whitespace, so for a blank field this
        // search stops at field_end (or returns npos for a blank last field).
        const std::size_t first = raw_csv.find_first_not_of(whitespace, field_begin);
        if (first != std::string::npos && first < field_end)
        {
            // raw_csv[first] is non-whitespace, so the backward search from
            // the field's last character always lands at or after `first`.
            const std::size_t last = raw_csv.find_last_not_of(whitespace, field_end - 1);
            dest_set.insert(raw_csv.substr(first, last - first + 1));
        }

        field_begin = field_end + 1;
    }
}
void split_csv(std::string raw_csv, std::set<std::string>& dest_set) | |
{ | |
std::vector<boost::iterator_range<std::string::iterator>> result; | |
boost::algorithm::split(result, raw_csv, [](char c) | |
{ | |
return c == ','; | |
}); | |
boost::transform(result, std::inserter(dest_set, dest_set.begin()), | |
[](boost::iterator_range<std::string::iterator> r) | |
{ | |
auto begin = r.begin(); | |
auto end = r.end(); | |
while(begin != end && std::isspace(*begin)) | |
++begin; | |
while(begin != end && std::isspace(*(end - 1))) | |
--end; | |
return std::string(begin, end); | |
} | |
); | |
dest_set.erase(""); | |
} | |
void xpressive_csv(std::string raw_csv, std::set<std::string>& dest_set) | |
{ | |
using namespace boost::xpressive; | |
sregex re = +(+alnum); | |
std::transform( | |
sregex_iterator(raw_csv.begin(), raw_csv.end(), re), | |
sregex_iterator(), | |
std::inserter(dest_set, dest_set.begin()), | |
[](smatch const& match) | |
{ | |
return match[0]; | |
} | |
); | |
} | |
// Extracts the distinct values of a string by collecting every match of the
// std::regex pattern \w+ (runs of word characters).
//
// raw_csv:  the text to scan; taken by value.
// dest_set: receives the text of each match. Existing contents are kept.
//
// NOTE: anything that is not a word character acts as a separator, so this is
// not a general CSV parser.
void regex_csv(std::string raw_csv, std::set<std::string>& dest_set)
{
    const std::regex word("\\w+");

    // A default-constructed sregex_iterator is the end-of-sequence marker.
    const std::sregex_iterator no_more_matches;
    for (std::sregex_iterator match(raw_csv.begin(), raw_csv.end(), word);
         match != no_more_matches;
         ++match)
    {
        dest_set.insert(match->str());
    }
}
// Prints the command-line synopsis to standard output.
// The list must name exactly the modes that main() accepts.
void print_usage()
{
    std::cout << "Usage: token <mode> [tokenize, tokenize-opt, parse, split, xpress, regex]" << std::endl;
}
static const int kNumValues = 10000000; | |
int main(int argc, char** argv) | |
{ | |
if(argc != 2) | |
{ | |
print_usage(); | |
return 0; | |
} | |
std::string mode = argv[1]; | |
// Generate some test data. | |
std::stringstream csv; | |
for(int i = 0; i < kNumValues; ++i) | |
{ | |
csv << i << "\t,\n\t "; | |
} | |
std::set<std::string> values; | |
if(mode == "tokenize") | |
{ | |
std::cout << "Tokenizing " << kNumValues << " csv values.\n"; | |
tokenize_csv(csv.str(), values); | |
std::cout << "Tokenized " << values.size() << " values.\n"; | |
} | |
else if(mode == "tokenize-opt") | |
{ | |
std::cout << "Tokenizing " << kNumValues << " csv values.\n"; | |
tokenize_csv_opt(csv.str(), values); | |
std::cout << "Tokenized " << values.size() << " values.\n"; | |
} | |
else if(mode == "parse") | |
{ | |
std::cout << "Parsing " << kNumValues << " csv values.\n"; | |
parse_csv(csv.str(), values); | |
std::cout << "Parsed " << values.size() << " values.\n"; | |
} | |
else if(mode == "split") | |
{ | |
std::cout << "Parsing " << kNumValues << " csv values.\n"; | |
split_csv(csv.str(), values); | |
std::cout << "Parsed " << values.size() << " values.\n"; | |
} | |
else if(mode == "xpress") | |
{ | |
std::cout << "xpressing " << kNumValues << " csv values.\n"; | |
xpressive_csv(csv.str(), values); | |
std::cout << "xpressed " << values.size() << " values.\n"; | |
} | |
else if(mode == "regex") | |
{ | |
std::cout << "xpressing " << kNumValues << " csv values.\n"; | |
regex_csv(csv.str(), values); | |
std::cout << "xpressed " << values.size() << " values.\n"; | |
} | |
else | |
{ | |
print_usage(); | |
} | |
if(kNumValues == 10) | |
{ | |
boost::copy(values, std::ostream_iterator<std::string>(std::cout)); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment