Created
October 19, 2016 01:48
-
-
Save chris-b1/59743744458a2e0a628f84a817b14a50 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <numeric> | |
#include <vector> | |
#include <iostream> | |
#include <string> | |
#include <random> | |
#include <chrono> | |
#include <algorithm> | |
#include <cstdint> | |
#include <cmath> | |
#include <memory> | |
#include <cassert> | |
#include <intrin.h> | |
static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; | |
static inline bool GetBit(const uint8_t* bits, int i) { | |
return static_cast<bool>(bits[i / 8] & kBitmask[i % 8]); | |
} | |
static inline int64_t CeilByte(int64_t size) { | |
return (size + 7) & ~7; | |
} | |
static inline int64_t BytesForBits(int64_t size) { | |
return CeilByte(size) / 8; | |
} | |
static inline void SetBit(uint8_t* bits, int i) { | |
bits[i / 8] |= kBitmask[i % 8]; | |
} | |
struct Values { | |
Values(size_t N, double percent_missing) : mask(new uint8_t[BytesForBits(N)]) | |
{ | |
memset(mask.get(), 0, BytesForBits(N)); | |
std::uniform_real_distribution<double> uniform(0., 1.); | |
std::uniform_real_distribution<double> values(0., 100000.); | |
std::default_random_engine eng(42); | |
for (int i = 0; i < N; ++i) { | |
if (uniform(eng) < percent_missing) { | |
// std::cout << "set null " << i << '\n'; | |
SetBit(mask.get(), i); | |
data.push_back(std::nan("")); | |
} | |
else { | |
data.push_back(uniform(eng)); | |
} | |
} | |
} | |
std::vector<double> data; | |
std::unique_ptr<uint8_t[]> mask; | |
}; | |
double sum_with_nan(double *data, size_t size) { | |
double ans = 0.0; | |
for (int i = 0; i < size; ++i) { | |
double val = data[i]; | |
if (val == val) { | |
ans += val; | |
} | |
} | |
return ans; | |
} | |
template <int NumBlocks> | |
uint64_t popcount(uint64_t* data) { | |
// don't make it here | |
} | |
template <> | |
uint64_t popcount<1>(uint64_t* data) { | |
return _mm_popcnt_u64(data[0]); | |
} | |
template <> | |
uint64_t popcount<2>(uint64_t* data) { | |
return _mm_popcnt_u64(data[0]) + _mm_popcnt_u64(data[1]); | |
} | |
template <> | |
uint64_t popcount<4>(uint64_t* data) { | |
return (_mm_popcnt_u64(data[0]) + _mm_popcnt_u64(data[1]) + | |
_mm_popcnt_u64(data[2]) + _mm_popcnt_u64(data[3])); | |
} | |
template <int NumBlocks> | |
double sum_with_bitmask(double *data, uint8_t* mask, size_t size) { | |
double ans = 0.0; | |
constexpr size_t block_size = (64 * NumBlocks); | |
size_t num_blocks = size / block_size; | |
for (int block = 0; block < num_blocks; ++block) { | |
uint64_t null_count = popcount<NumBlocks>(reinterpret_cast<uint64_t*>(mask + block * block_size / 8)); | |
// uint64_t null_count = __popcnt64(*reinterpret_cast<int64_t*>(mask + block * 8)); | |
// std::cout << "null_count = " << null_count << '\n'; | |
if (null_count) { | |
for (int i = 0; i < block_size; ++i) { | |
if (!GetBit(mask, i + block * block_size)) { | |
//std::cout << "checking and summing element i = " << i + block * block_size << '\n'; | |
ans += data[i + block * block_size]; | |
} | |
} | |
} | |
else { | |
for (int i = 0; i < block_size; ++i) { | |
// std::cout << "summing element i = " << i + block * block_size << '\n'; | |
ans += data[i + block * block_size]; | |
} | |
} | |
} | |
// remainder of elements after complete blocks | |
for (int j = 0; j < size % block_size; ++j) { | |
// std::cout << "summing element j = " << j + num_blocks * 64 << '\n'; | |
if (!GetBit(mask, j + num_blocks * block_size)) { | |
//std::cout << "summing element j = " << j + num_blocks * block_size << '\n'; | |
ans += data[j + num_blocks * block_size]; | |
} | |
} | |
return ans; | |
} | |
template <int NumBlocks> | |
void timeit(size_t N, double percent_missing) { | |
Values v(N, percent_missing); | |
auto start = std::chrono::system_clock::now(); | |
auto a = sum_with_bitmask<NumBlocks>(v.data.data(), v.mask.get(), v.data.size()); | |
auto end = std::chrono::system_clock::now(); | |
auto bitmask_duration = (float)(std::chrono::duration_cast<std::chrono::milliseconds> (end - start).count()); | |
start = std::chrono::system_clock::now(); | |
auto b = sum_with_nan(v.data.data(), v.data.size()); | |
end = std::chrono::system_clock::now(); | |
auto nan_duration = (float) (std::chrono::duration_cast<std::chrono::milliseconds> (end - start).count()); | |
if (a != b) { | |
std::cout << "fail!\n"; | |
} | |
std::cout << "N = " << N << " missing = " << percent_missing << " blocks = " << NumBlocks << '\n'; | |
std::cout << "Bitmask / NAN = " << bitmask_duration / nan_duration << '\n'; | |
} | |
int main() | |
{ | |
timeit<1>(100000000, 0.00); | |
timeit<2>(100000000, 0.00); | |
timeit<4>(100000000, 0.00); | |
timeit<1>(100000000, 0.01); | |
timeit<2>(100000000, 0.01); | |
timeit<4>(100000000, 0.01); | |
timeit<1>(100000000, 0.05); | |
timeit<2>(100000000, 0.05); | |
timeit<4>(100000000, 0.05); | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment