Skip to content

Instantly share code, notes, and snippets.

@alenstarx
Created April 14, 2016 03:22
Show Gist options
  • Save alenstarx/5aae2d7ca59e473fdcd0f7f118543b37 to your computer and use it in GitHub Desktop.
Save alenstarx/5aae2d7ca59e473fdcd0f7f118543b37 to your computer and use it in GitHub Desktop.
#ifndef UTIL_STRING_UTILS_HPP
#define UTIL_STRING_UTILS_HPP
// Trim leading whitespace from `s` in place.
// Returns a reference to `s` so calls can be chained.
static inline std::string &string_ltrim(std::string &s) {
    // The predicate takes unsigned char: handing a negative char to
    // std::isspace is undefined behaviour. A lambda replaces
    // std::ptr_fun/std::not1, which were removed in C++17.
    s.erase(s.begin(),
            std::find_if(s.begin(), s.end(),
                         [](unsigned char ch) { return !std::isspace(ch); }));
    return s;
}
// Trim trailing whitespace from `s` in place.
// Returns a reference to `s` so calls can be chained.
static inline std::string &string_rtrim(std::string &s) {
    // Search backwards for the last non-space character; .base() converts the
    // reverse iterator into the forward iterator just past that character.
    // The predicate takes unsigned char because a negative char passed to
    // std::isspace is undefined behaviour.
    s.erase(std::find_if(s.rbegin(), s.rend(),
                         [](unsigned char ch) { return !std::isspace(ch); }).base(),
            s.end());
    return s;
}
// Trim whitespace from both ends of `s` in place (right side first, then
// left) and return a reference to `s`.
static inline std::string &string_trim(std::string &s) {
    std::string &right_trimmed = string_rtrim(s);
    return string_ltrim(right_trimmed);
}
// Replace the FIRST occurrence of `from` inside `str` with `to`.
// `str` is modified in place; a copy of the resulting string is returned.
// When `from` does not occur, `str` is left untouched.
// An empty `from` matches at position 0, so `to` is inserted at the front.
//
// Declared `inline` because this definition lives in a header: without it,
// including the header from two translation units violates the ODR.
inline std::string string_replace(std::string& str, const std::string& from, const std::string& to) {
    const std::string::size_type start_pos = str.find(from);
    if (start_pos != std::string::npos) {
        str.replace(start_pos, from.length(), to);
    }
    return str;
}
// Split `src` on every occurrence of the (possibly multi-character) separator
// `separate_character`. Empty pieces are dropped, so leading, trailing and
// consecutive separators produce no entries.
//
// An empty separator cannot split anything: the whole of `src` is returned as
// a single piece (or nothing if `src` is empty). Without this guard
// find("", pos) matches at `pos` forever and the loop never terminates.
//
// Declared `inline` because the definition lives in a header.
inline std::vector<std::string> string_split(const std::string& src, const std::string& separate_character)
{
    std::vector<std::string> strs;
    if (separate_character.empty()) {
        if (!src.empty()) {
            strs.push_back(src);
        }
        return strs;
    }

    // Positions stay in size_type: narrowing find()'s result to int breaks on
    // very long inputs and relies on npos converting to -1.
    std::string::size_type last_position = 0;
    std::string::size_type index = 0;
    while ((index = src.find(separate_character, last_position)) != std::string::npos) {
        if (index > last_position) {
            strs.push_back(src.substr(last_position, index - last_position));
        }
        last_position = index + separate_character.size();
    }
    // Whatever follows the last separator (if anything) is the final piece.
    if (last_position < src.size()) {
        strs.push_back(src.substr(last_position));
    }
    return strs;
}
// Return a copy of `src` in which every non-overlapping occurrence of
// `org_str` is replaced by `rep_str`, scanning left to right.
// An empty `org_str` matches nothing, so `src` is returned unchanged.
//
// The previous split-and-join implementation appended `rep_str` after the
// last piece (even when there was no match at all) and lost matches at the
// start of the string or next to each other. This version copies the text
// between matches verbatim instead.
//
// Declared `inline` because the definition lives in a header.
inline std::string string_replace_all(const std::string &src, const std::string& org_str, const std::string& rep_str)
{
    if (org_str.empty()) {
        return src;
    }
    std::string target;
    target.reserve(src.size());
    std::string::size_type cursor = 0;
    std::string::size_type hit = 0;
    while ((hit = src.find(org_str, cursor)) != std::string::npos) {
        target.append(src, cursor, hit - cursor);  // text before the match
        target += rep_str;
        cursor = hit + org_str.size();             // resume after the match
    }
    target.append(src, cursor, std::string::npos); // tail after the last match
    return target;
}
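// Optional dependency: UTF8-CPP (utfcpp), http://sourceforge.net/projects/utfcpp/
// Define USE_UTFCPP to enable is_valid_utf8() and starts_with_bom().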
#ifdef USE_UTFCPP
#include <utf8.h>
#endif
#ifdef _WINDOWS
#include <windows.h>
#endif
#include <algorithm>
#include <cctype>
#include <clocale>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>
namespace string_utils {
template <typename Str> struct tokenizer;
// Return a copy of `str` with every character passed through std::toupper
// (the conversion follows the current C locale).
template <typename Str>
static Str to_upper(const Str& str)
{
    typedef typename Str::value_type char_type;
    typedef typename std::make_unsigned<char_type>::type uchar_type;
    Str temp(str);
    // Go through the unsigned counterpart of the character type: passing a
    // negative char (any byte >= 0x80 where char is signed) to toupper is
    // undefined behaviour.
    std::transform(temp.begin(), temp.end(), temp.begin(),
                   [](char_type c) {
                       return static_cast<char_type>(std::toupper(static_cast<uchar_type>(c)));
                   });
    return temp;
}
// Return a copy of `str` with every character passed through std::tolower
// (the conversion follows the current C locale).
template <typename Str>
static Str to_lower(const Str& str)
{
    typedef typename Str::value_type char_type;
    typedef typename std::make_unsigned<char_type>::type uchar_type;
    Str temp(str);
    // Go through the unsigned counterpart of the character type: passing a
    // negative char (any byte >= 0x80 where char is signed) to tolower is
    // undefined behaviour.
    std::transform(temp.begin(), temp.end(), temp.begin(),
                   [](char_type c) {
                       return static_cast<char_type>(std::tolower(static_cast<uchar_type>(c)));
                   });
    return temp;
}
// Return a copy of `str` with leading whitespace (per std::isspace) removed.
// A string made only of whitespace yields an empty string.
//
// The earlier version used the loop iterator outside the `for` that declared
// it (so it could not compile) and never returned a value.
template <typename Str>
static Str trim_left(const Str& str)
{
    typedef typename Str::value_type char_type;
    typedef typename std::make_unsigned<char_type>::type uchar_type;
    typename Str::const_iterator first = str.begin();
    // Cast through the unsigned character type: a negative char passed to
    // std::isspace is undefined behaviour.
    while (first != str.end() && std::isspace(static_cast<uchar_type>(*first))) {
        ++first;
    }
    return Str(first, str.end());
}
// Return a copy of `str` with trailing whitespace (per std::isspace) removed.
// A string made only of whitespace yields an empty string.
//
// Safe for empty input: the earlier loop started at `end() - 1`, which is
// undefined behaviour when the string has no characters.
template <typename Str>
static Str trim_right(const Str& str)
{
    typedef typename Str::value_type char_type;
    typedef typename std::make_unsigned<char_type>::type uchar_type;
    typename Str::const_iterator last = str.end();
    // `last` is one past the final character kept. The cast avoids handing a
    // negative char to std::isspace (undefined behaviour).
    while (last != str.begin() && std::isspace(static_cast<uchar_type>(*(last - 1)))) {
        --last;
    }
    return Str(str.begin(), last);
}
// Return a copy of `str` with whitespace removed from both ends
// (left side first, then right).
template <typename Str>
static Str trim(const Str& str)
{
    return trim_right(trim_left(str));
}
// True when `value` begins with `starting`, compared element by element.
// An empty `starting` always matches.
template <typename Str>
static bool starts_with(Str const & value, Str const & starting)
{
    if (value.size() < starting.size()) {
        return false;
    }
    typename Str::const_iterator in_value = value.begin();
    for (typename Str::const_iterator in_prefix = starting.begin();
         in_prefix != starting.end(); ++in_prefix, ++in_value) {
        if (!(*in_prefix == *in_value)) {
            return false;
        }
    }
    return true;
}
// True when `value` finishes with `ending`, compared element by element
// from the back. An empty `ending` always matches.
template <typename Str>
static bool ends_with(Str const & value, Str const & ending)
{
    if (value.size() < ending.size()) {
        return false;
    }
    typename Str::const_reverse_iterator in_value = value.rbegin();
    for (typename Str::const_reverse_iterator in_suffix = ending.rbegin();
         in_suffix != ending.rend(); ++in_suffix, ++in_value) {
        if (!(*in_suffix == *in_value)) {
            return false;
        }
    }
    return true;
}
// Case-insensitive equality: both strings are lower-cased with to_lower()
// and the results compared with operator==.
template <typename Str>
static bool equals_ignore_case(const Str& str1, const Str& str2)
{
    const Str lowered_first = to_lower(str1);
    const Str lowered_second = to_lower(str2);
    return lowered_first == lowered_second;
}
// Parse a value of type T from `str` using stream extraction (operator>>).
// Call as from_string<T>(text).
//
// The result starts out value-initialised (0 for arithmetic types). If
// extraction cannot even begin, for example on an empty or all-whitespace
// string, that value is returned instead of an indeterminate one.
template <typename T, typename Str>
static T from_string(const Str& str)
{
    T obj = T();
    std::basic_istringstream<typename Str::value_type> temp(str);
    temp >> obj;
    return obj;
}
// Parse a bool written as text ("true" / "false", via std::boolalpha).
// Selected when from_string is called without explicit template arguments.
//
// The result starts as false, so an empty or all-whitespace input returns
// false rather than reading an uninitialised bool.
template <typename Str>
static bool from_string(const Str& str)
{
    bool obj = false;
    std::basic_istringstream<typename Str::value_type> temp(str);
    temp >> std::boolalpha >> obj;
    return obj;
}
// Parse a value of type T from the hexadecimal text in `str` (std::hex
// stream extraction). Call as from_hex_string<T>(text).
//
// The result starts out value-initialised, so an empty or all-whitespace
// input returns T() (0 for integers) instead of an indeterminate value.
template <typename T, typename Str>
static T from_hex_string(const Str& str)
{
    T obj = T();
    std::basic_istringstream<typename Str::value_type> temp(str);
    temp >> std::hex >> obj;
    return obj;
}
// Format `var` with operator<< into a string of type Str.
// Call as to_string<Str>(value).
template <typename Str, typename T>
static Str to_string(const T& var)
{
    typedef std::basic_ostringstream<typename Str::value_type> stream_type;
    stream_type formatter;
    formatter << var;
    return formatter.str();
}
// Format a bool as the text "true" or "false" (std::boolalpha) into a
// string of type Str. Call as to_string<Str>(flag).
template <typename Str>
static Str to_string(bool var)
{
    typedef std::basic_ostringstream<typename Str::value_type> stream_type;
    stream_type formatter;
    formatter << std::boolalpha;
    formatter << var;
    return formatter.str();
}
// Format `var` in hexadecimal (std::hex) into a string of type Str.
// When `width` is positive the output is left-padded with '0' up to that
// many characters; longer values are not truncated. A non-positive `width`
// disables padding.
template <typename Str, typename T>
static Str to_hex_string(const T& var, int width)
{
    typedef typename Str::value_type char_type;
    std::basic_ostringstream<char_type> formatter;
    if (width <= 0) {
        formatter << std::hex << var;
    } else {
        formatter << std::hex << std::setfill<char_type>('0') << std::setw(width) << var;
    }
    return formatter.str();
}
template <typename Str>
static std::vector<Str> split(Str const& str, Str const& delimiters)
{
std::vector<Str> ss;
tokenizer<Str> token(str, delimiters);
while (token.next_token())
{
ss.push_back(token.get_token());
}
return ss;
}
// Convert a multibyte string to a wide string.
//
// With _WINDOWS defined the input is decoded as UTF-8 through
// MultiByteToWideChar(CP_UTF8). Otherwise std::mbstowcs is used, which
// decodes according to the current C locale (LC_CTYPE); the input is only
// treated as UTF-8 when that locale uses UTF-8, and conversion stops at the
// first NUL byte.
//
// Returns an empty wstring for empty input or when the conversion fails.
static std::wstring utf8_to_unicode(std::string const& str)
{
    if (str.empty())
    {
        return std::wstring();
    }
    const char* source = str.c_str();
#ifdef _WINDOWS
    const int source_len = static_cast<int>(str.length());
    const int needed = MultiByteToWideChar(CP_UTF8, 0, source, source_len, 0, 0);
    if (needed <= 0)
    {
        return std::wstring();
    }
    std::wstring dest(static_cast<size_t>(needed), L'\0');
    const int written = MultiByteToWideChar(CP_UTF8, 0, source, source_len, &dest[0], needed);
    if (written <= 0)
    {
        return std::wstring();
    }
    dest.resize(static_cast<size_t>(written));
    return dest;
#else
    // mbstowcs reports an invalid sequence as (size_t)-1. That value is never
    // "<= 0", so it has to be tested explicitly; otherwise it would be used
    // as the buffer size below.
    const size_t conversion_failed = static_cast<size_t>(-1);
    const size_t needed = std::mbstowcs(nullptr, source, 0);
    if (needed == conversion_failed || needed == 0)
    {
        return std::wstring();
    }
    std::wstring dest(needed, L'\0');
    const size_t written = std::mbstowcs(&dest[0], source, needed);
    if (written == conversion_failed || written == 0)
    {
        return std::wstring();
    }
    dest.resize(written);
    return dest;
#endif
}
// Convert a wide string to a multibyte string.
//
// With _WINDOWS defined the output is UTF-8, produced by
// WideCharToMultiByte(CP_UTF8). Otherwise std::wcstombs is used, which
// encodes according to the current C locale (LC_CTYPE); the output is only
// UTF-8 when that locale uses UTF-8, and conversion stops at the first
// L'\0'.
//
// Returns an empty string for empty input or when the conversion fails.
static std::string unicode_to_utf8(std::wstring const& str)
{
    if (str.empty())
    {
        return std::string();
    }
    const wchar_t * source = str.c_str();
#ifdef _WINDOWS
    const int source_len = static_cast<int>(str.length());
    const int needed = WideCharToMultiByte(CP_UTF8, 0, source, source_len, 0, 0, 0, 0);
    if (needed <= 0)
    {
        return std::string();
    }
    std::string dest(static_cast<size_t>(needed), '\0');
    const int written = WideCharToMultiByte(CP_UTF8, 0, source, source_len, &dest[0], needed, 0, 0);
    if (written <= 0)
    {
        return std::string();
    }
    dest.resize(static_cast<size_t>(written));
    return dest;
#else
    // wcstombs reports an unconvertible character as (size_t)-1, which is
    // never "<= 0" and therefore has to be tested explicitly.
    const size_t conversion_failed = static_cast<size_t>(-1);
    const size_t needed = std::wcstombs(nullptr, source, 0);
    if (needed == conversion_failed || needed == 0)
    {
        return std::string();
    }
    std::string dest(needed, '\0');
    // The limit is a BYTE count for the destination. Passing the number of
    // wide characters here truncates (or empties) any multibyte output.
    const size_t written = std::wcstombs(&dest[0], source, needed);
    if (written == conversion_failed || written == 0)
    {
        return std::string();
    }
    dest.resize(written);
    return dest;
#endif
}
// Convert a string in the platform's narrow encoding to a wide string.
//
// With _WINDOWS defined the input is decoded with the system ANSI code page
// through MultiByteToWideChar(CP_ACP). Otherwise std::mbstowcs is used,
// which decodes according to the current C locale (LC_CTYPE) and stops at
// the first NUL byte.
//
// Returns an empty wstring for empty input or when the conversion fails.
static std::wstring ansi_to_unicode(std::string const& str)
{
    if (str.empty())
    {
        return std::wstring();
    }
    const char* source = str.c_str();
#ifdef _WINDOWS
    const int source_len = static_cast<int>(str.length());
    const int needed = MultiByteToWideChar(CP_ACP, 0, source, source_len, 0, 0);
    if (needed <= 0)
    {
        return std::wstring();
    }
    std::wstring dest(static_cast<size_t>(needed), L'\0');
    const int written = MultiByteToWideChar(CP_ACP, 0, source, source_len, &dest[0], needed);
    if (written <= 0)
    {
        return std::wstring();
    }
    dest.resize(static_cast<size_t>(written));
    return dest;
#else
    // mbstowcs reports an invalid sequence as (size_t)-1. That value is never
    // "<= 0", so it has to be tested explicitly; otherwise it would be used
    // as the buffer size below.
    const size_t conversion_failed = static_cast<size_t>(-1);
    const size_t needed = std::mbstowcs(nullptr, source, 0);
    if (needed == conversion_failed || needed == 0)
    {
        return std::wstring();
    }
    std::wstring dest(needed, L'\0');
    const size_t written = std::mbstowcs(&dest[0], source, needed);
    if (written == conversion_failed || written == 0)
    {
        return std::wstring();
    }
    dest.resize(written);
    return dest;
#endif
}
// Convert a wide string to the platform's narrow encoding.
//
// With _WINDOWS defined the output uses the system ANSI code page, produced
// by WideCharToMultiByte(CP_ACP). Otherwise std::wcstombs is used, which
// encodes according to the current C locale (LC_CTYPE) and stops at the
// first L'\0'.
//
// Returns an empty string for empty input or when the conversion fails.
static std::string unicode_to_ansi(std::wstring const& str)
{
    if (str.empty())
    {
        return std::string();
    }
    const wchar_t * source = str.c_str();
#ifdef _WINDOWS
    const int source_len = static_cast<int>(str.length());
    const int needed = WideCharToMultiByte(CP_ACP, 0, source, source_len, 0, 0, 0, 0);
    if (needed <= 0)
    {
        return std::string();
    }
    std::string dest(static_cast<size_t>(needed), '\0');
    const int written = WideCharToMultiByte(CP_ACP, 0, source, source_len, &dest[0], needed, 0, 0);
    if (written <= 0)
    {
        return std::string();
    }
    dest.resize(static_cast<size_t>(written));
    return dest;
#else
    // wcstombs reports an unconvertible character as (size_t)-1, which is
    // never "<= 0" and therefore has to be tested explicitly.
    const size_t conversion_failed = static_cast<size_t>(-1);
    const size_t needed = std::wcstombs(nullptr, source, 0);
    if (needed == conversion_failed || needed == 0)
    {
        return std::string();
    }
    std::string dest(needed, '\0');
    // The limit is a BYTE count for the destination. Passing the number of
    // wide characters here truncates (or empties) any multibyte output.
    const size_t written = std::wcstombs(&dest[0], source, needed);
    if (written == conversion_failed || written == 0)
    {
        return std::string();
    }
    dest.resize(written);
    return dest;
#endif
}
static std::string utf8_to_gbk(std::string const& str)
{
std::wstring temp = utf8_to_unicode(str);
#ifdef _WINDOWS
#else
setlocale(LC_ALL, "zh_CN.GBK")
#endif
std::string dest = unicode_to_ansi(temp);
#ifdef _WINDOWS
#else
setlocale(LC_ALL, "")
#endif
return dest;
}
static std::string gbk_to_utf8(std::string const& str)
{
#ifdef _WINDOWS
#else
setlocale(LC_ALL, "zh_CN.GBK")
#endif
std::wstring temp = ansi_to_unicode(str);
#ifdef _WINDOWS
#else
setlocale(LC_ALL, "")
#endif
std::string dest = unicode_to_utf8(temp);
return dest;
}
// Convert UTF-8 text for local use: when _WINDOWS is defined this delegates
// to utf8_to_gbk(); otherwise the input is returned as an unchanged copy.
static std::string utf8_to_locale(std::string const& str)
{
#ifndef _WINDOWS
    return str;
#else
    return utf8_to_gbk(str);
#endif
}
// Convert locally encoded text to UTF-8: when _WINDOWS is defined this
// delegates to gbk_to_utf8(); otherwise the input is returned as an
// unchanged copy.
static std::string locale_to_utf8(std::string const& str)
{
#ifndef _WINDOWS
    return str;
#else
    return gbk_to_utf8(str);
#endif
}
#ifdef USE_UTFCPP
// Forward the whole byte range of `str` to utf8::is_valid() from the utfcpp
// library and return its verdict.
static bool is_valid_utf8(std::string const& str)
{
    const std::string::const_iterator first = str.begin();
    const std::string::const_iterator last = str.end();
    return utf8::is_valid(first, last);
}
// Forward the whole byte range of `str` to utf8::starts_with_bom() from the
// utfcpp library and return its verdict.
static bool starts_with_bom(std::string const&str)
{
    const std::string::const_iterator first = str.begin();
    const std::string::const_iterator last = str.end();
    return utf8::starts_with_bom(first, last);
}
#endif
// Decode standard-alphabet base64 text from `input` into `output`.
//
// `output` is cleared first. Returns false as soon as a byte outside the
// base64 alphabet is met, or when a group consists of a single character; in
// that case `output` holds whatever was decoded up to that point. A '=' in
// the third or fourth position of a group ends decoding successfully, and a
// final group of two or three characters without padding is accepted.
static bool base64_decode(std::string const &input, std::string & output)
{
    static const char nop = -1;
    // Maps a byte value (0-255) to its 6-bit base64 value, or `nop`.
    static const char decoding_data[] = {
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop, 62, nop,nop,nop, 63,
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61,nop,nop, nop,nop,nop,nop,
        nop, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,nop, nop,nop,nop,nop,
        nop,26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,nop, nop,nop,nop,nop,
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
        nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop
    };
    const std::string::size_type input_length = input.size();
    const char * input_ptr = input.data();
    // Every 4 input characters produce at most 3 output bytes.
    output.clear();
    output.reserve((input_length / 4 + 1) * 3);
    // For each 4-character group, take the 6-bit value of every character
    // and repack the 24 bits into 3 bytes.
    for (std::string::size_type i = 0; i < input_length; i++) {
        // The table is indexed through unsigned char: with a signed char a
        // byte >= 0x80 would become a negative index and read out of bounds.
        const char base64code0 = decoding_data[static_cast<unsigned char>(input_ptr[i])];
        if (base64code0 == nop) // non base64 character
            return false;
        if (!(++i < input_length)) // at least two characters are needed for the first byte
            return false;
        const char base64code1 = decoding_data[static_cast<unsigned char>(input_ptr[i])];
        if (base64code1 == nop) // non base64 character
            return false;
        output += static_cast<char>((base64code0 << 2) | ((base64code1 >> 4) & 0x3));

        char base64code2 = 0;
        if (++i < input_length) {
            if (input_ptr[i] == '=') // padding, end of input
                return true;
            base64code2 = decoding_data[static_cast<unsigned char>(input_ptr[i])];
            if (base64code2 == nop) // non base64 character
                return false;
            output += static_cast<char>(((base64code1 << 4) & 0xf0) | ((base64code2 >> 2) & 0x0f));
        }
        if (++i < input_length) {
            if (input_ptr[i] == '=') // padding, end of input
                return true;
            const char base64code3 = decoding_data[static_cast<unsigned char>(input_ptr[i])];
            if (base64code3 == nop) // non base64 character
                return false;
            output += static_cast<char>(((base64code2 << 6) & 0xc0) | base64code3);
        }
    }
    return true;
}
// Encode the bytes of `input` as base64 (standard alphabet, '=' padding)
// into `output`. `output` is cleared first. Always returns true.
static bool base64_encode(std::string const &input, std::string & output)
{
    static const char encoding_data[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    const std::string::size_type input_length = input.size();
    // Work on unsigned bytes: shifting a negative signed char left is
    // undefined behaviour before C++20.
    const unsigned char * bytes = reinterpret_cast<const unsigned char *>(input.data());
    // Every 3 input bytes (rounded up) become 4 output characters.
    output.clear();
    output.reserve(((input_length + 2) / 3) * 4);

    // Complete 3-byte groups: 24 bits split into four 6-bit indices.
    std::string::size_type i = 0;
    for (; i + 2 < input_length; i += 3) {
        const unsigned int b0 = bytes[i];
        const unsigned int b1 = bytes[i + 1];
        const unsigned int b2 = bytes[i + 2];
        output += encoding_data[b0 >> 2];
        output += encoding_data[((b0 << 4) & 0x3f) | (b1 >> 4)];
        output += encoding_data[((b1 << 2) & 0x3f) | (b2 >> 6)];
        output += encoding_data[b2 & 0x3f];
    }

    // Trailing 1 or 2 bytes: emit the available bits and pad with '='.
    const std::string::size_type remaining = input_length - i;
    if (remaining == 1) {
        const unsigned int b0 = bytes[i];
        output += encoding_data[b0 >> 2];
        output += encoding_data[(b0 << 4) & 0x3f];
        output += '=';
        output += '=';
    } else if (remaining == 2) {
        const unsigned int b0 = bytes[i];
        const unsigned int b1 = bytes[i + 1];
        output += encoding_data[b0 >> 2];
        output += encoding_data[((b0 << 4) & 0x3f) | (b1 >> 4)];
        output += encoding_data[(b1 << 2) & 0x3f];
        output += '=';
    }
    return true;
}
// Decode an application/x-www-form-urlencoded style string:
//   '+'   becomes a space,
//   "%XY" becomes the byte with hexadecimal value XY,
//   anything else is copied through unchanged.
//
// A '%' that is not followed by exactly two hexadecimal digits, and the
// escape "%00", are not decoded: the '%' and the characters after it are
// copied through literally.
//
// Both digits are validated with std::isxdigit before conversion. strtol on
// its own accepts signs, leading blanks and partial matches, so inputs such
// as "%+1" or "%4G" used to be silently decoded to control characters.
static std::string url_decode(const std::string& str)
{
    std::string result;
    result.reserve(str.size());
    for (std::string::size_type pos = 0; pos < str.size(); ++pos) {
        const char ch = str[pos];
        if (ch == '+') {
            // convert to space character
            result += ' ';
        } else if (ch == '%'
                   && pos + 2 < str.size()
                   && std::isxdigit(static_cast<unsigned char>(str[pos + 1]))
                   && std::isxdigit(static_cast<unsigned char>(str[pos + 2]))) {
            const char hex_pair[3] = { str[pos + 1], str[pos + 2], '\0' };
            const char decoded_char = static_cast<char>(std::strtol(hex_pair, nullptr, 16));
            if (decoded_char == '\0') {
                // "%00" is rejected: emit the '%' and let the loop copy the
                // two digits as ordinary characters.
                result += '%';
            } else {
                result += decoded_char;
                pos += 2; // skip the two digits just consumed
            }
        } else {
            // ordinary character, or a '%' without a valid escape after it
            result += ch;
        }
    }
    return result;
}
// Percent-encode `str` for use in a URL.
// Characters with a code above 32 and below 127 are copied unchanged unless
// they belong to the explicit escape list below; every other byte is written
// as '%' followed by two upper-case hexadecimal digits.
// The escape list follows
// http://www.blooberry.com/indexdot/html/topics/urlencoding.htm
static std::string url_encode(const std::string& str)
{
    static const char hex_digits[] = "0123456789ABCDEF";
    std::string result;
    result.reserve(str.size());
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
        const char ch = *it;
        bool needs_escape;
        switch (ch) {
        case ' ':
        case '$': case '&': case '+': case ',': case '/': case ':':
        case ';': case '=': case '?': case '@': case '"': case '<':
        case '>': case '#': case '%': case '{': case '}': case '|':
        case '\\': case '^': case '~': case '[': case ']': case '`':
            needs_escape = true;
            break;
        default:
            // everything outside the printable range is escaped as well
            needs_escape = !(ch > 32 && ch < 127);
            break;
        }
        if (!needs_escape) {
            result += ch;
            continue;
        }
        const unsigned char byte = static_cast<unsigned char>(ch);
        result += '%';
        result += hex_digits[byte >> 4];
        result += hex_digits[byte & 0x0F];
    }
    return result;
}
// Escape `str` for inclusion in XML text.
//   - & < > " ' are written as the predefined entities.
//   - Other bytes 0x20-0x7F, plus tab, LF and CR, are copied unchanged.
//   - A lead byte 0xC2-0xDF, 0xE0-0xEF or 0xF0-0xF4 followed by the right
//     number (1, 2 or 3) of continuation bytes 0x80-0xBF is copied unchanged.
//   - Any other byte is replaced by U+FFFD (EF BF BD) and scanning resumes
//     at the next byte.
// See http://en.wikipedia.org/wiki/UTF-8 and
// http://www.w3.org/TR/REC-xml/#charsets
static std::string xml_encode(const std::string& str)
{
    static const char replacement_char[] = "\xef\xbf\xbd"; // U+FFFD in UTF-8
    std::string result;
    result.reserve(str.size() + 20); // Assume ~5 characters converted (length increases)
    const unsigned char *cursor = reinterpret_cast<const unsigned char*>(str.c_str());
    const unsigned char *const limit = cursor + str.size();
    for (; cursor < limit; ++cursor) {
        const unsigned char lead = *cursor;

        // Plain ASCII (including the three permitted control characters).
        if ((lead >= 0x20 && lead <= 0x7F) || lead == 0x9 || lead == 0xa || lead == 0xd) {
            if (lead == '&') {
                result += "&amp;";
            } else if (lead == '<') {
                result += "&lt;";
            } else if (lead == '>') {
                result += "&gt;";
            } else if (lead == '\"') {
                result += "&quot;";
            } else if (lead == '\'') {
                result += "&apos;";
            } else {
                result += lead;
            }
            continue;
        }

        // Number of continuation bytes the lead byte calls for (0 = invalid lead).
        int expected_tail = 0;
        if (lead >= 0xC2 && lead <= 0xDF) {
            expected_tail = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            expected_tail = 2;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            expected_tail = 3;
        }

        // Count continuation bytes, stopping at the first mismatch so that
        // nothing beyond the string's terminating NUL is ever read.
        int found_tail = 0;
        while (found_tail < expected_tail
               && cursor[found_tail + 1] >= 0x80 && cursor[found_tail + 1] <= 0xBF) {
            ++found_tail;
        }

        if (expected_tail == 0 || found_tail != expected_tail) {
            // insert replacement char; only the offending byte is skipped
            result += replacement_char;
            continue;
        }

        // Copy the complete multi-byte sequence and step over its tail.
        for (int k = 0; k <= expected_tail; ++k) {
            result += cursor[k];
        }
        cursor += expected_tail;
    }
    return result;
}
// Splits a string into tokens separated by any of a set of delimiter
// characters. The tokenizer keeps its own copy of the input.
//
// Usage:
//     tokenizer<std::string> t("a b,c", " ,");
//     while (t.next_token()) use(t.get_token());
//
// Runs of delimiters are skipped, so no empty tokens are produced.
template <typename Str>
struct tokenizer
{
    // Tokenize `str` using a single space as the delimiter.
    // The delimiter is built from Str::value_type so that wide string types
    // work too (a narrow " " literal cannot initialise a std::wstring).
    // Left implicit on purpose so existing call sites keep compiling.
    tokenizer(Str const& str)
        : _offset(0), _string(str),
          _delimiters(1, static_cast<typename Str::value_type>(' '))
    {}
    // Tokenize `str`; every character of `delimiters` is a separator.
    tokenizer(Str const& str, Str const& delimiters)
        : _offset(0), _string(str), _delimiters(delimiters)
    {}
    // Advance to the next token using the delimiters given at construction.
    // Returns false when no token is left.
    bool next_token()
    {
        return next_token(_delimiters);
    }
    // Advance to the next token using `delimiters` for this call only.
    // Returns true and stores the token (see get_token()) when one is found;
    // returns false and moves to the end of the input otherwise.
    bool next_token(Str const& delimiters)
    {
        // Skip leading delimiters.
        size_t i = _string.find_first_not_of(delimiters, _offset);
        if (i == Str::npos)
        {
            _offset = _string.length();
            return false;
        }
        // The token runs up to the next delimiter or the end of the input.
        size_t j = _string.find_first_of(delimiters, i);
        if (j == Str::npos) {
            _token = _string.substr(i);
            _offset = _string.length();
            return true;
        }
        _token = _string.substr(i, j - i);
        _offset = j;
        return true;
    }
    // The token found by the most recent successful next_token() call.
    const Str get_token() const
    {
        return _token;
    }
    // Restart scanning from the beginning of the input. The last token is
    // left as it was.
    void reset()
    {
        _offset = 0;
    }
    // Members are initialised in this declaration order; the constructor
    // initialiser lists above follow the same order.
    size_t _offset;     // position where the next scan starts
    const Str _string;  // copy of the input being tokenized
    Str _token;         // most recently extracted token
    Str _delimiters;    // default delimiter set
};
}
#endif // UTIL_STRING_UTILS_HPP
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment