Last active
November 20, 2018 00:50
-
-
Save shivanshu3/ca29776ede10bd0541a3bca4c304281b to your computer and use it in GitHub Desktop.
Using SIMD instrinsics in MSVC. SIMD on a single thread gave us a 29x performance boost. Parallel SIMD gave us a 43x boost! I'm using Intel i5-4670K.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "stdafx.h" | |
#include <iostream> | |
#include <string> | |
#include <chrono> | |
#include <random> | |
#include <thread> | |
#include <Windows.h> | |
#include <immintrin.h> | |
// Allocates 32 byte aligned memory. 32 bytes == 256 bits. | |
__m256i* Alloc256(uint64_t numBytes) | |
{ | |
return reinterpret_cast<__m256i*>(_aligned_malloc(numBytes, 32)); | |
} | |
// Fills array with random numbers | |
void FillArrayWithNums(uint8_t* arr, uint64_t numBytes) | |
{ | |
std::mt19937 rng; | |
rng.seed(std::random_device()()); | |
std::uniform_int_distribution<std::mt19937::result_type> dist(0, 255); | |
for (uint64_t i = 0; i < numBytes; ++i) | |
{ | |
arr[i] = dist(rng); | |
} | |
} | |
using T_TimePoint = decltype(std::chrono::high_resolution_clock::now()); | |
T_TimePoint g_startTimePoint; | |
T_TimePoint g_stopTimePoint; | |
void StartTimer() | |
{ | |
g_startTimePoint = std::chrono::high_resolution_clock::now(); | |
} | |
void StopTimer() | |
{ | |
g_stopTimePoint = std::chrono::high_resolution_clock::now(); | |
} | |
// Us == microseconds | |
uint64_t CalculateTimeUs() | |
{ | |
return std::chrono::duration_cast<std::chrono::microseconds>(g_stopTimePoint - g_startTimePoint).count(); | |
} | |
uint8_t AddUsingSIMD(const __m256i* mem, uint64_t numBytes) | |
{ | |
__m256i sum{}; | |
uint8_t finalSum{}; | |
auto numBytes_Div32 = numBytes / 32; | |
for (uint64_t i = 0; i < numBytes_Div32; ++i) | |
{ | |
sum = _mm256_add_epi8(sum, mem[i]); | |
} | |
// Add the 32 bytes inside sum | |
{ | |
auto* sumPtr1 = ∑ | |
auto* sumPtr2 = reinterpret_cast<uint8_t*>(sumPtr1); | |
uint8_t smallSum{}; | |
for (int i = 0; i < 32; ++i) | |
{ | |
smallSum += sumPtr2[i]; | |
} | |
finalSum = smallSum; | |
} | |
return finalSum; | |
} | |
bool ExistsAvx2() | |
{ | |
auto* mem = reinterpret_cast<__m256i*>(_aligned_malloc(32, 32)); | |
auto illegalInstructionFilter = [](DWORD exceptionCode) -> bool | |
{ | |
return exceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; | |
}; | |
__try | |
{ | |
auto sum = _mm256_add_epi8(*mem, *mem); | |
// Make sure the line above doesn't get optimized away | |
{ | |
volatile uint8_t sink = *reinterpret_cast<uint8_t*>(&sum); | |
} | |
} | |
__except (illegalInstructionFilter(GetExceptionCode())) | |
{ | |
return false; | |
} | |
return true; | |
} | |
int main() | |
{ | |
if (!ExistsAvx2()) | |
{ | |
std::wcerr << L"Error: AVX-2 instruction set is missing from the CPU" << std::endl; | |
return 1; | |
} | |
uint64_t sizeMem = static_cast<uint64_t>(1) * static_cast<uint64_t>(1024) * static_cast<uint64_t>(1024) * static_cast<uint64_t>(1024); | |
auto* mem32Byte = Alloc256(sizeMem); | |
auto* mem1Byte = reinterpret_cast<uint8_t*>(mem32Byte); | |
FillArrayWithNums(mem1Byte, sizeMem); | |
// 1 byte at a time - no automatic SIMD optimizations (sum is volatile) | |
{ | |
StartTimer(); | |
volatile uint8_t sum{}; | |
for (uint64_t i = 0; i < sizeMem; ++i) | |
{ | |
sum += mem1Byte[i]; | |
} | |
StopTimer(); | |
std::wcout << L"No SIMD" << std::endl; | |
std::wcout << L"Sum: " << sum << L", time: " << CalculateTimeUs() << L" uS" << std::endl; | |
std::wcout << std::endl; | |
} | |
// Automatic SIMD optimizations (sum is not volatile) | |
{ | |
StartTimer(); | |
uint8_t sum{}; | |
for (uint64_t i = 0; i < sizeMem; ++i) | |
{ | |
sum += mem1Byte[i]; | |
} | |
StopTimer(); | |
std::wcout << L"Automatic SIMD optimization" << std::endl; | |
std::wcout << L"Sum: " << sum << L", time: " << CalculateTimeUs() << L" uS" << std::endl; | |
std::wcout << std::endl; | |
} | |
// Explicitly using SIMD on a single thread | |
{ | |
StartTimer(); | |
auto sum = AddUsingSIMD(mem32Byte, sizeMem); | |
StopTimer(); | |
std::wcout << L"Explicit SIMD - single threaded" << std::endl; | |
std::wcout << L"Sum: " << sum << L", time: " << CalculateTimeUs() << L" uS" << std::endl; | |
std::wcout << std::endl; | |
} | |
// Use 4 threads - explicit parallel SIMD | |
{ | |
StartTimer(); | |
auto incSize = (sizeMem / 32) / 4; | |
auto adderFunc = [incSize](const __m256i* mem, uint8_t* result) | |
{ | |
*result = AddUsingSIMD(mem, incSize * 32); | |
}; | |
std::vector<std::thread> threads; | |
std::vector<uint8_t> sums(4); | |
for (int i = 0; i < 4; ++i) | |
{ | |
auto* mem = &mem32Byte[i * incSize]; | |
std::thread t{ adderFunc, mem, &sums[i] }; | |
threads.push_back(std::move(t)); | |
} | |
for (int i = 0; i < 4; ++i) | |
{ | |
threads[i].join(); | |
} | |
uint8_t sum{}; | |
for (int i = 0; i < 4; ++i) | |
{ | |
sum += sums[i]; | |
} | |
StopTimer(); | |
std::wcout << L"Explicit SIMD - 4 threads" << std::endl; | |
std::wcout << L"Sum: " << sum << L", time: " << CalculateTimeUs() << L" uS" << std::endl; | |
std::wcout << std::endl; | |
} | |
return 0; | |
} | |
// Sample output: | |
// | |
// No SIMD | |
// Sum: 184, time: 2244852 uS | |
// | |
// Automatic SIMD optimization | |
// Sum: 184, time: 78150 uS | |
// | |
// Explicit SIMD - single threaded | |
// Sum: 184, time: 77684 uS | |
// | |
// Explicit SIMD - 4 threads | |
// Sum: 184, time: 52665 uS |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment