Skip to content

Instantly share code, notes, and snippets.

@shivanshu3
Last active November 20, 2018 00:50
Show Gist options
  • Save shivanshu3/ca29776ede10bd0541a3bca4c304281b to your computer and use it in GitHub Desktop.
Save shivanshu3/ca29776ede10bd0541a3bca4c304281b to your computer and use it in GitHub Desktop.
Using SIMD instrinsics in MSVC. SIMD on a single thread gave us a 29x performance boost. Parallel SIMD gave us a 43x boost! I'm using Intel i5-4670K.
#include "stdafx.h"
#include <iostream>
#include <string>
#include <chrono>
#include <random>
#include <thread>
#include <Windows.h>
#include <immintrin.h>
// Allocates 32 byte aligned memory. 32 bytes == 256 bits.
__m256i* Alloc256(uint64_t numBytes)
{
return reinterpret_cast<__m256i*>(_aligned_malloc(numBytes, 32));
}
// Fills array with random numbers
void FillArrayWithNums(uint8_t* arr, uint64_t numBytes)
{
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist(0, 255);
for (uint64_t i = 0; i < numBytes; ++i)
{
arr[i] = dist(rng);
}
}
using T_TimePoint = decltype(std::chrono::high_resolution_clock::now());
T_TimePoint g_startTimePoint;
T_TimePoint g_stopTimePoint;
void StartTimer()
{
g_startTimePoint = std::chrono::high_resolution_clock::now();
}
void StopTimer()
{
g_stopTimePoint = std::chrono::high_resolution_clock::now();
}
// Us == microseconds
uint64_t CalculateTimeUs()
{
return std::chrono::duration_cast<std::chrono::microseconds>(g_stopTimePoint - g_startTimePoint).count();
}
uint8_t AddUsingSIMD(const __m256i* mem, uint64_t numBytes)
{
__m256i sum{};
uint8_t finalSum{};
auto numBytes_Div32 = numBytes / 32;
for (uint64_t i = 0; i < numBytes_Div32; ++i)
{
sum = _mm256_add_epi8(sum, mem[i]);
}
// Add the 32 bytes inside sum
{
auto* sumPtr1 = &sum;
auto* sumPtr2 = reinterpret_cast<uint8_t*>(sumPtr1);
uint8_t smallSum{};
for (int i = 0; i < 32; ++i)
{
smallSum += sumPtr2[i];
}
finalSum = smallSum;
}
return finalSum;
}
bool ExistsAvx2()
{
auto* mem = reinterpret_cast<__m256i*>(_aligned_malloc(32, 32));
auto illegalInstructionFilter = [](DWORD exceptionCode) -> bool
{
return exceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
};
__try
{
auto sum = _mm256_add_epi8(*mem, *mem);
// Make sure the line above doesn't get optimized away
{
volatile uint8_t sink = *reinterpret_cast<uint8_t*>(&sum);
}
}
__except (illegalInstructionFilter(GetExceptionCode()))
{
return false;
}
return true;
}
int main()
{
if (!ExistsAvx2())
{
std::wcerr << L"Error: AVX-2 instruction set is missing from the CPU" << std::endl;
return 1;
}
uint64_t sizeMem = static_cast<uint64_t>(1) * static_cast<uint64_t>(1024) * static_cast<uint64_t>(1024) * static_cast<uint64_t>(1024);
auto* mem32Byte = Alloc256(sizeMem);
auto* mem1Byte = reinterpret_cast<uint8_t*>(mem32Byte);
FillArrayWithNums(mem1Byte, sizeMem);
// 1 byte at a time - no automatic SIMD optimizations (sum is volatile)
{
StartTimer();
volatile uint8_t sum{};
for (uint64_t i = 0; i < sizeMem; ++i)
{
sum += mem1Byte[i];
}
StopTimer();
std::wcout << L"No SIMD" << std::endl;
std::wcout << L"Sum: " << sum << L", time: " << CalculateTimeUs() << L" uS" << std::endl;
std::wcout << std::endl;
}
// Automatic SIMD optimizations (sum is not volatile)
{
StartTimer();
uint8_t sum{};
for (uint64_t i = 0; i < sizeMem; ++i)
{
sum += mem1Byte[i];
}
StopTimer();
std::wcout << L"Automatic SIMD optimization" << std::endl;
std::wcout << L"Sum: " << sum << L", time: " << CalculateTimeUs() << L" uS" << std::endl;
std::wcout << std::endl;
}
// Explicitly using SIMD on a single thread
{
StartTimer();
auto sum = AddUsingSIMD(mem32Byte, sizeMem);
StopTimer();
std::wcout << L"Explicit SIMD - single threaded" << std::endl;
std::wcout << L"Sum: " << sum << L", time: " << CalculateTimeUs() << L" uS" << std::endl;
std::wcout << std::endl;
}
// Use 4 threads - explicit parallel SIMD
{
StartTimer();
auto incSize = (sizeMem / 32) / 4;
auto adderFunc = [incSize](const __m256i* mem, uint8_t* result)
{
*result = AddUsingSIMD(mem, incSize * 32);
};
std::vector<std::thread> threads;
std::vector<uint8_t> sums(4);
for (int i = 0; i < 4; ++i)
{
auto* mem = &mem32Byte[i * incSize];
std::thread t{ adderFunc, mem, &sums[i] };
threads.push_back(std::move(t));
}
for (int i = 0; i < 4; ++i)
{
threads[i].join();
}
uint8_t sum{};
for (int i = 0; i < 4; ++i)
{
sum += sums[i];
}
StopTimer();
std::wcout << L"Explicit SIMD - 4 threads" << std::endl;
std::wcout << L"Sum: " << sum << L", time: " << CalculateTimeUs() << L" uS" << std::endl;
std::wcout << std::endl;
}
return 0;
}
// Sample output:
//
// No SIMD
// Sum: 184, time: 2244852 uS
//
// Automatic SIMD optimization
// Sum: 184, time: 78150 uS
//
// Explicit SIMD - single threaded
// Sum: 184, time: 77684 uS
//
// Explicit SIMD - 4 threads
// Sum: 184, time: 52665 uS
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment