Created
May 25, 2017 16:18
-
-
Save maksis/92ad04f525d69043283350675d04f160 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <iostream> | |
#include <set> | |
#include <string> | |
#include <vector> | |
#include <sys/time.h> | |
double get_current_time() { | |
struct timeval t; | |
gettimeofday(&t, 0); | |
return t.tv_sec + t.tv_usec*1e-6; | |
} | |
std::string random_string(size_t length) { | |
auto randchar = []() -> char | |
{ | |
const char charset[] = | |
"0123456789" | |
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" | |
"abcdefghijklmnopqrstuvwxyz"; | |
const size_t max_index = (sizeof(charset) - 1); | |
return charset[ rand() % max_index ]; | |
}; | |
std::string str(length,0); | |
std::generate_n( str.begin(), length, randchar ); | |
return str; | |
} | |
int utf8ToWc(const char* str, wchar_t& c) { | |
const auto c0 = static_cast<uint8_t>(str[0]); | |
const auto bytes = 2 + !!(c0 & 0x20) + ((c0 & 0x30) == 0x30); | |
if ((c0 & 0xc0) == 0xc0) { // 11xx xxxx | |
// # bytes of leading 1's; check for 0 next | |
const auto check_bit = 1 << (7 - bytes); | |
if (c0 & check_bit) | |
return -1; | |
c = (check_bit - 1) & c0; | |
// 2-4 total, or 1-3 additional, bytes | |
// Can't run off end of str so long as has sub-0x80-terminator | |
for (auto i = 1; i < bytes; ++i) { | |
const auto ci = static_cast<uint8_t>(str[i]); | |
if ((ci & 0xc0) != 0x80) | |
return -i; | |
c = (c << 6) | (ci & 0x3f); | |
} | |
// Invalid UTF-8 code points | |
if (c > 0x10ffff || (c >= 0xd800 && c <= 0xdfff)) { | |
// "REPLACEMENT CHARACTER": used to replace an incoming character | |
// whose value is unknown or unrepresentable in Unicode | |
c = 0xfffd; | |
return -bytes; | |
} | |
return bytes; | |
} else if ((c0 & 0x80) == 0) { // 0xxx xxxx | |
c = static_cast<unsigned char>(str[0]); | |
return 1; | |
} else { // 10xx xxxx | |
return -1; | |
} | |
} | |
int compareCounter = 0; | |
int noCaseSort(const char* a, const char* b) noexcept { | |
while (*a) { | |
wchar_t ca = 0, cb = 0; | |
int na = utf8ToWc(a, ca); | |
int nb = utf8ToWc(b, cb); | |
ca = towlower(ca); | |
cb = towlower(cb); | |
compareCounter++; | |
if (ca != cb) { | |
return (int)ca - (int)cb; | |
} | |
a += abs(na); | |
b += abs(nb); | |
} | |
wchar_t ca = 0, cb = 0; | |
utf8ToWc(a, ca); | |
utf8ToWc(b, cb); | |
compareCounter++; | |
return (int)towlower(ca) - (int)towlower(cb); | |
} | |
struct Sort { | |
bool operator()(const std::string& a, const std::string& b) const noexcept { | |
return noCaseSort(a.c_str(), b.c_str()) < 0; | |
} | |
}; | |
#define COUNT 1000000 | |
#define STR_LEN 90 | |
int main() { | |
std::vector<std::string> input; | |
for (int i = 0; i < COUNT; i++) { | |
input.push_back(random_string(STR_LEN)); | |
} | |
std::set<std::string, Sort> stringSet; | |
double startTime = get_current_time(); | |
for (const auto& s: input) { | |
stringSet.insert(s); | |
} | |
double endTime = get_current_time(); | |
printf("Completed in %fs, compare count %d\n", endTime - startTime, compareCounter); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment