Skip to content

Instantly share code, notes, and snippets.

@artemklevtsov
Last active October 15, 2019 15:48
Show Gist options
  • Save artemklevtsov/cf969cdcdd410a99d5c90d76c9b0fb68 to your computer and use it in GitHub Desktop.
Save artemklevtsov/cf969cdcdd410a99d5c90d76c9b0fb68 to your computer and use it in GitHub Desktop.
utf8ToInt Rcpp implementation
// [[Rcpp::plugins(cpp17)]]
#include <Rcpp.h>
#include <codecvt>
using namespace Rcpp;
// Decode a UTF-8 byte string into its Unicode code points.
//
// @param x UTF-8 encoded bytes.
// @return One element per decoded code point, in input order; empty for an
//         empty input.
// @throws std::range_error if `x` cannot be decoded (std::wstring_convert is
//         constructed without an error string, so from_bytes throws on failure).
// [[Rcpp::export(rng=false)]]
std::vector<unsigned long> utf8_to_int(const std::string& x) {
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> cv;
  // Convert the whole std::string rather than x.c_str(): the C-string overload
  // stops at the first NUL byte and would silently drop everything after it.
  const auto str32 = cv.from_bytes(x);
  // The code-point count is known at this point, so build the result with its
  // exact size instead of guessing a capacity from the byte length.
  return std::vector<unsigned long>(str32.begin(), str32.end());
}
// Decode an R string into its Unicode code points, rejecting any input whose
// declared encoding is not UTF-8.
//
// @param x String to decode; its encoding mark must be CE_UTF8.
// @return One element per decoded code point, in input order.
// Calls stop() when the encoding mark is not CE_UTF8. Decoding failures
// surface as the exception thrown by std::wstring_convert::from_bytes.
// [[Rcpp::export(rng=false)]]
std::vector<unsigned long> utf8_to_int_safe(String x) {
  // NOTE(review): this tests the encoding *mark* only. Plain-ASCII R strings
  // are presumably not marked UTF-8 and would be rejected here -- confirm that
  // this is the intended contract.
  if (x.get_encoding() != CE_UTF8) {
    stop("'x' must be UTF-8 encoded string.");
  }
  const std::string_view sv(x.get_cstring());
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> cv;
  // Hand over an explicit [begin, end) range so the length already measured
  // by the string_view is reused instead of re-scanning for the terminator.
  const auto str32 = cv.from_bytes(sv.data(), sv.data() + sv.size());
  // Build the result with the exact code-point count rather than a guessed
  // capacity derived from the byte length.
  return std::vector<unsigned long>(str32.begin(), str32.end());
}
/*** R
# Benchmark the Rcpp utf8_to_int() defined above against base R's utf8ToInt()
# on the same short Cyrillic string.
bench::mark(
utf8_to_int("Привет"),
utf8ToInt("Привет")
)
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment