Skip to content

Instantly share code, notes, and snippets.

@maksis
Created May 25, 2017 16:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maksis/92ad04f525d69043283350675d04f160 to your computer and use it in GitHub Desktop.
Save maksis/92ad04f525d69043283350675d04f160 to your computer and use it in GitHub Desktop.
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>
#include <sys/time.h>
double get_current_time() {
struct timeval t;
gettimeofday(&t, 0);
return t.tv_sec + t.tv_usec*1e-6;
}
std::string random_string(size_t length) {
auto randchar = []() -> char
{
const char charset[] =
"0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz";
const size_t max_index = (sizeof(charset) - 1);
return charset[ rand() % max_index ];
};
std::string str(length,0);
std::generate_n( str.begin(), length, randchar );
return str;
}
int utf8ToWc(const char* str, wchar_t& c) {
const auto c0 = static_cast<uint8_t>(str[0]);
const auto bytes = 2 + !!(c0 & 0x20) + ((c0 & 0x30) == 0x30);
if ((c0 & 0xc0) == 0xc0) { // 11xx xxxx
// # bytes of leading 1's; check for 0 next
const auto check_bit = 1 << (7 - bytes);
if (c0 & check_bit)
return -1;
c = (check_bit - 1) & c0;
// 2-4 total, or 1-3 additional, bytes
// Can't run off end of str so long as has sub-0x80-terminator
for (auto i = 1; i < bytes; ++i) {
const auto ci = static_cast<uint8_t>(str[i]);
if ((ci & 0xc0) != 0x80)
return -i;
c = (c << 6) | (ci & 0x3f);
}
// Invalid UTF-8 code points
if (c > 0x10ffff || (c >= 0xd800 && c <= 0xdfff)) {
// "REPLACEMENT CHARACTER": used to replace an incoming character
// whose value is unknown or unrepresentable in Unicode
c = 0xfffd;
return -bytes;
}
return bytes;
} else if ((c0 & 0x80) == 0) { // 0xxx xxxx
c = static_cast<unsigned char>(str[0]);
return 1;
} else { // 10xx xxxx
return -1;
}
}
int compareCounter = 0;
int noCaseSort(const char* a, const char* b) noexcept {
while (*a) {
wchar_t ca = 0, cb = 0;
int na = utf8ToWc(a, ca);
int nb = utf8ToWc(b, cb);
ca = towlower(ca);
cb = towlower(cb);
compareCounter++;
if (ca != cb) {
return (int)ca - (int)cb;
}
a += abs(na);
b += abs(nb);
}
wchar_t ca = 0, cb = 0;
utf8ToWc(a, ca);
utf8ToWc(b, cb);
compareCounter++;
return (int)towlower(ca) - (int)towlower(cb);
}
struct Sort {
bool operator()(const std::string& a, const std::string& b) const noexcept {
return noCaseSort(a.c_str(), b.c_str()) < 0;
}
};
#define COUNT 1000000
#define STR_LEN 90
int main() {
std::vector<std::string> input;
for (int i = 0; i < COUNT; i++) {
input.push_back(random_string(STR_LEN));
}
std::set<std::string, Sort> stringSet;
double startTime = get_current_time();
for (const auto& s: input) {
stringSet.insert(s);
}
double endTime = get_current_time();
printf("Completed in %fs, compare count %d\n", endTime - startTime, compareCounter);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment