Skip to content

Instantly share code, notes, and snippets.

@artemklevtsov
Last active August 16, 2019 08:05
Show Gist options
  • Save artemklevtsov/299a452617185ad1325c6b98746f342c to your computer and use it in GitHub Desktop.
Save artemklevtsov/299a452617185ad1325c6b98746f342c to your computer and use it in GitHub Desktop.
Rcpp CRLF string split
// [[Rcpp::plugins(cpp17)]]
#include <Rcpp.h>
#include <regex>
// Split `s` on every "\r\n" with a std::regex token iterator and return the
// pieces as an R character vector.
//
// `s` must be a NUL-terminated C string (its length is taken with strlen).
// No filtering of empty tokens is done here: every token the iterator yields
// is appended to the result.
// [[Rcpp::export(rng=false)]]
Rcpp::CharacterVector str_split1(const char* s) {
    const std::regex crlf("\r\n");
    const char* const last = s + std::strlen(s);
    Rcpp::CharacterVector out;
    // Submatch index -1 selects the text *between* matches of the separator.
    for (std::cregex_token_iterator tok(s, last, crlf, -1), done; tok != done; ++tok) {
        out.push_back(*tok);
    }
    return out;
}
// Split `s` into lines with std::getline (i.e. on '\n'), drop one trailing
// '\r' from each line, skip lines that end up empty, and return the rest as
// an R character vector.
//
// `s` must be a NUL-terminated C string.
// [[Rcpp::export(rng=false)]]
Rcpp::CharacterVector str_split2(const char* s) {
    std::istringstream ss(s);
    std::string line;
    Rcpp::CharacterVector out;
    while (std::getline(ss, line)) {
        // getline yields an empty string for consecutive '\n'; looking at the
        // last character of an empty string is undefined, so check first.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        if (line.empty()) {
            continue;
        }
        out.push_back(line);
    }
    return out;
}
// Split `s` on the two-character separator "\r\n" using std::string::find and
// return the non-empty pieces as an R character vector.
//
// Empty pieces (between adjacent separators, or at either end of the input)
// are skipped. Text after the last separator is appended if it is non-empty.
// [[Rcpp::export(rng=false)]]
Rcpp::CharacterVector str_split3(const std::string& s) {
    static const std::string eol = "\r\n";
    const std::size_t total = s.size();
    Rcpp::CharacterVector out;
    std::string::size_type start = 0;
    for (std::string::size_type hit = s.find(eol, start);
         hit != std::string::npos;
         hit = s.find(eol, start)) {
        const std::string::size_type len = hit - start;
        if (len != 0) {
            out.push_back(s.substr(start, len));
        }
        // Resume scanning just past the separator that was found.
        start = hit + eol.size();
    }
    // Whatever follows the final separator is the last piece.
    if (start < total) {
        out.push_back(s.substr(start));
    }
    return out;
}
// Split `s` on the two-character separator "\r\n" using std::string_view and
// return the non-empty pieces as an R character vector.
//
// `s` must be a NUL-terminated C string. Empty pieces are skipped; text after
// the last separator is appended if it is non-empty.
// [[Rcpp::export(rng=false)]]
Rcpp::CharacterVector str_split4(const char* s) {
    // The view measures the string once; no separate strlen pass is needed.
    const std::string_view sv = s;
    constexpr std::string_view eol = "\r\n";
    Rcpp::CharacterVector out;
    std::string_view::size_type pos = 0;
    std::string_view::size_type next;
    while ((next = sv.find(eol, pos)) != std::string_view::npos) {
        const std::string_view line = sv.substr(pos, next - pos);
        pos = next + eol.size();
        if (line.empty()) {
            continue;
        }
        out.push_back(line);
    }
    // Here `next` is npos, so the tail must not be sized with `next - pos`;
    // take everything from `pos` to the end of the view instead.
    if (pos < sv.size()) {
        out.push_back(sv.substr(pos));
    }
    return out;
}
// Split `s` on the two-character separator "\r\n" using std::string::find and
// return the non-empty pieces as a std::vector<std::string>.
//
// Empty pieces (between adjacent separators, or at either end of the input)
// are skipped. Text after the last separator is appended if it is non-empty.
// [[Rcpp::export(rng=false)]]
std::vector<std::string> str_split5(const std::string& s) {
    static const std::string kEol = "\r\n";
    std::vector<std::string> pieces;
    std::string::size_type begin = 0;
    for (;;) {
        const std::string::size_type hit = s.find(kEol, begin);
        if (hit == std::string::npos) {
            break;
        }
        if (hit > begin) {
            pieces.push_back(s.substr(begin, hit - begin));
        }
        // Resume scanning just past the separator that was found.
        begin = hit + kEol.size();
    }
    // Whatever follows the final separator is the last piece.
    if (begin < s.size()) {
        pieces.push_back(s.substr(begin));
    }
    return pieces;
}
// Split `s` on the two-character separator "\r\n" using std::string_view and
// return the non-empty pieces as a std::vector<std::string>.
//
// `s` must be a NUL-terminated C string. Empty pieces are skipped; text after
// the last separator is appended if it is non-empty.
// [[Rcpp::export(rng=false)]]
std::vector<std::string> str_split6(const char* s) {
    // The view measures the string once; no separate strlen pass is needed.
    const std::string_view sv = s;
    constexpr std::string_view eol = "\r\n";
    std::vector<std::string> out;
    std::string_view::size_type pos = 0;
    std::string_view::size_type next;
    while ((next = sv.find(eol, pos)) != std::string_view::npos) {
        if (next != pos) {
            out.emplace_back(sv.substr(pos, next - pos));
        }
        pos = next + eol.size();
    }
    // Here `next` is npos, so the tail must not be sized with `next - pos`;
    // take everything from `pos` to the end of the view instead.
    if (pos < sv.size()) {
        out.emplace_back(sv.substr(pos));
    }
    return out;
}
// Split `s` into lines with std::getline (i.e. on '\n'), drop one trailing
// '\r' from each line, skip lines that end up empty, and return the rest as
// a std::vector<std::string>.
//
// `s` must be a NUL-terminated C string.
// [[Rcpp::export(rng=false)]]
std::vector<std::string> str_split7(const char* s) {
    std::istringstream ss(s);
    std::string line;
    std::vector<std::string> out;
    while (std::getline(ss, line)) {
        // getline yields an empty string for consecutive '\n'; looking at the
        // last character of an empty string is undefined, so check first.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        if (line.empty()) {
            continue;
        }
        out.push_back(line);
    }
    return out;
}
/***R
# Benchmark: base R strsplit() variants against the C++ splitters defined
# in this file, all applied to the same CRLF-separated input string.

# Number of lines in the test input.
n = 200
# One string made of n pieces ("<i> TEST TEST TEST") joined by "\r\n".
x = paste(seq_len(n), rep("TEST TEST TEST", n), collapse = "\r\n")
# The three strsplit() calls pass fixed = TRUE, fixed = FALSE, and
# fixed = FALSE with perl = TRUE (positional arguments), respectively.
# Each expression is evaluated 5000 times.
bench::mark(
strsplit(x, "\r\n", TRUE)[[1L]],
strsplit(x, "\r\n", FALSE)[[1L]],
strsplit(x, "\r\n", FALSE, TRUE)[[1L]],
str_split1(x),
str_split2(x),
str_split3(x),
str_split4(x),
str_split5(x),
str_split6(x),
str_split7(x),
iterations = 5000
)
*/
@artemklevtsov
Copy link
Author

artemklevtsov commented Aug 16, 2019

   expression                                 min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
   <bch:expr>                             <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>
 1 strsplit(x, "\r\n", TRUE)[[1L]]         49.7µs  50.6µs    19067.    1.61KB     0     5000     0
 2 strsplit(x, "\r\n", FALSE)[[1L]]       288.3µs 292.6µs     3307.    1.61KB     0     5000     0
 3 strsplit(x, "\r\n", FALSE, TRUE)[[1L]] 589.8µs 602.2µs     1608.    1.61KB     0     5000     0
 4 str_split1(x)                          198.5µs 203.9µs     4782.  164.59KB    12.5   4987    13
 5 str_split2(x)                          117.9µs   123µs     7706.  164.59KB    20.1   4987    13
 6 str_split3(x)                            118µs 123.1µs     7822.  164.59KB    20.4   4987    13
 7 str_split4(x)                            121µs 126.3µs     7508.  164.59KB    18.1   4988    12
 8 str_split5(x)                           25.1µs  26.6µs    35900.    1.61KB     0     5000     0
 9 str_split6(x)                           24.6µs    26µs    37115.    1.61KB     0     5000     0
10 str_split7(x)                           24.7µs  26.8µs    36099.    1.61KB     7.22  4999     1
# … with 5 more variables: total_time <bch:tm>, result <list>, memory <list>, time <list>, gc <list>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment