maksis/lowercase_benchmark.cpp

## lowercase_benchmark.cpp
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

#include <sys/time.h>


double get_current_time() {
    struct timeval t;
    gettimeofday(&t, 0);
    return t.tv_sec + t.tv_usec*1e-6;
}


std::string random_string(size_t length) {
    auto randchar = []() -> char
    {
        const char charset[] =
        "0123456789"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz";

        const size_t max_index = (sizeof(charset) - 1);
        return charset[ rand() % max_index ];
    };
    std::string str(length,0);
    std::generate_n( str.begin(), length, randchar );
    return str;
}

int utf8ToWc(const char* str, wchar_t& c) {
    const auto c0 = static_cast<uint8_t>(str[0]);
    const auto bytes = 2 + !!(c0 & 0x20) + ((c0 & 0x30) == 0x30);

    if ((c0 & 0xc0) == 0xc0) {                  // 11xx xxxx
                                                // # bytes of leading 1's; check for 0 next
        const auto check_bit = 1 << (7 - bytes);
        if (c0 & check_bit)
            return -1;

        c = (check_bit - 1) & c0;

        // 2-4 total, or 1-3 additional, bytes
        // Can't run off end of str so long as has sub-0x80-terminator
        for (auto i = 1; i < bytes; ++i) {
            const auto ci = static_cast<uint8_t>(str[i]);
            if ((ci & 0xc0) != 0x80)
                return -i;
            c = (c << 6) | (ci & 0x3f);
        }

        // Invalid UTF-8 code points
        if (c > 0x10ffff || (c >= 0xd800 && c <= 0xdfff)) {
            // "REPLACEMENT CHARACTER": used to replace an incoming character
            // whose value is unknown or unrepresentable in Unicode
            c = 0xfffd;
            return -bytes;
        }

        return bytes;
    } else if ((c0 & 0x80) == 0) {             // 0xxx xxxx
        c = static_cast<unsigned char>(str[0]);
        return 1;
    } else {                                   // 10xx xxxx
        return -1;
    }
}

int compareCounter = 0;

int noCaseSort(const char* a, const char* b) noexcept {
    while (*a) {
        wchar_t ca = 0, cb = 0;
        int na = utf8ToWc(a, ca);
        int nb = utf8ToWc(b, cb);
        ca = towlower(ca);
        cb = towlower(cb);

        compareCounter++;
        if (ca != cb) {
            return (int)ca - (int)cb;
        }
        a += abs(na);
        b += abs(nb);
    }

    wchar_t ca = 0, cb = 0;
    utf8ToWc(a, ca);
    utf8ToWc(b, cb);
    compareCounter++;

    return (int)towlower(ca) - (int)towlower(cb);
}

struct Sort {
    bool operator()(const std::string& a, const std::string& b) const noexcept {
        return noCaseSort(a.c_str(), b.c_str()) < 0;
    }
};

#define COUNT 1000000
#define STR_LEN 90

int main() {
    std::vector<std::string> input;
    for (int i = 0; i < COUNT; i++) {
        input.push_back(random_string(STR_LEN));
    }

    std::set<std::string, Sort> stringSet;

    double startTime = get_current_time();
    for (const auto& s: input) {
        stringSet.insert(s);
    }

    double endTime = get_current_time();
    printf("Completed in %fs, compare count %d\n", endTime - startTime, compareCounter);

    return 0;
}
	#include <algorithm>
	#include <iostream>
	#include <set>
	#include <string>
	#include <vector>

	#include <sys/time.h>


	double get_current_time() {
	struct timeval t;
	gettimeofday(&t, 0);
	return t.tv_sec + t.tv_usec*1e-6;
	}


	std::string random_string(size_t length) {
	auto randchar = []() -> char
	{
	const char charset[] =
	"0123456789"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz";

	const size_t max_index = (sizeof(charset) - 1);
	return charset[ rand() % max_index ];
	};
	std::string str(length,0);
	std::generate_n( str.begin(), length, randchar );
	return str;
	}

	int utf8ToWc(const char* str, wchar_t& c) {
	const auto c0 = static_cast<uint8_t>(str[0]);
	const auto bytes = 2 + !!(c0 & 0x20) + ((c0 & 0x30) == 0x30);

	if ((c0 & 0xc0) == 0xc0) { // 11xx xxxx
	// # bytes of leading 1's; check for 0 next
	const auto check_bit = 1 << (7 - bytes);
	if (c0 & check_bit)
	return -1;

	c = (check_bit - 1) & c0;

	// 2-4 total, or 1-3 additional, bytes
	// Can't run off end of str so long as has sub-0x80-terminator
	for (auto i = 1; i < bytes; ++i) {
	const auto ci = static_cast<uint8_t>(str[i]);
	if ((ci & 0xc0) != 0x80)
	return -i;
	c = (c << 6) \| (ci & 0x3f);
	}

	// Invalid UTF-8 code points
	if (c > 0x10ffff \|\| (c >= 0xd800 && c <= 0xdfff)) {
	// "REPLACEMENT CHARACTER": used to replace an incoming character
	// whose value is unknown or unrepresentable in Unicode
	c = 0xfffd;
	return -bytes;
	}

	return bytes;
	} else if ((c0 & 0x80) == 0) { // 0xxx xxxx
	c = static_cast<unsigned char>(str[0]);
	return 1;
	} else { // 10xx xxxx
	return -1;
	}
	}

	int compareCounter = 0;

	int noCaseSort(const char* a, const char* b) noexcept {
	while (*a) {
	wchar_t ca = 0, cb = 0;
	int na = utf8ToWc(a, ca);
	int nb = utf8ToWc(b, cb);
	ca = towlower(ca);
	cb = towlower(cb);

	compareCounter++;
	if (ca != cb) {
	return (int)ca - (int)cb;
	}
	a += abs(na);
	b += abs(nb);
	}

	wchar_t ca = 0, cb = 0;
	utf8ToWc(a, ca);
	utf8ToWc(b, cb);
	compareCounter++;

	return (int)towlower(ca) - (int)towlower(cb);
	}

	struct Sort {
	bool operator()(const std::string& a, const std::string& b) const noexcept {
	return noCaseSort(a.c_str(), b.c_str()) < 0;
	}
	};

	#define COUNT 1000000
	#define STR_LEN 90

	int main() {
	std::vector<std::string> input;
	for (int i = 0; i < COUNT; i++) {
	input.push_back(random_string(STR_LEN));
	}

	std::set<std::string, Sort> stringSet;

	double startTime = get_current_time();
	for (const auto& s: input) {
	stringSet.insert(s);
	}

	double endTime = get_current_time();
	printf("Completed in %fs, compare count %d\n", endTime - startTime, compareCounter);

	return 0;
	}