// Last active: April 20, 2018 18:04
// Save vk2gpu/9ddc828081d4471706bd6cc93462b9ad to your computer and use it in GitHub Desktop.
// This file contains bidirectional Unicode text that may be interpreted or compiled differently
// than what appears below. To review, open the file in an editor that reveals hidden Unicode
// characters. Learn more about bidirectional Unicode characters.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <xmmintrin.h>
// Generic structure-of-arrays container: one std::vector per field ("column"), all kept at the
// same length, plus hooks for converting between a whole object and its columns.
namespace Soa
{
    /// Conversion hook: fill an object from one value per column.
    /// Only declared here; a definition has to be supplied per object type.
    template<typename OBJECT, typename... FIELDS>
    void Load(OBJECT&, FIELDS...);

    /// Conversion hook: write an object's fields out to one reference per column.
    /// Only declared here; a definition has to be supplied per object type.
    template<typename OBJECT, typename... FIELDS>
    void Store(const OBJECT& val, FIELDS&...);

    namespace Detail
    {
        /// Recursive column list; only the specializations below are defined.
        template<typename... COLUMN_TYPES>
        struct StorageImpl;

        /// Last link of the chain: owns the final column and is where load/store hand the
        /// accumulated per-column values to Soa::Load / Soa::Store.
        template<typename LAST>
        struct StorageImpl<LAST>
        {
            std::vector<LAST> values_;

            size_t size() const
            {
                return values_.size();
            }

            void resize(size_t size)
            {
                values_.resize(size);
            }

            /// Writes the column's base pointer to `*out`.
            void get(LAST** out)
            {
                *out = values_.data();
            }

            void get(const LAST** out) const
            {
                *out = values_.data();
            }

            /// `args` holds the values collected from the preceding columns; this column's
            /// element is appended last, so Load sees the columns in declaration order.
            template<typename OBJECT, typename... FIELDS>
            void load(size_t idx, OBJECT& value, FIELDS... args) const
            {
                Soa::Load(value, args..., values_[idx]);
            }

            /// Same as load(), but forwards writable references for Store to assign through.
            template<typename OBJECT, typename... FIELDS>
            void store(size_t idx, const OBJECT& value, FIELDS&... args)
            {
                Soa::Store(value, args..., values_[idx]);
            }
        };

        /// Intermediate link: owns the first column and delegates the rest to `next_`.
        template<typename HEAD, typename... TAIL>
        struct StorageImpl<HEAD, TAIL...>
        {
            std::vector<HEAD> values_;
            StorageImpl<TAIL...> next_;

            /// Every column is resized together, so this column's length stands for all.
            size_t size() const
            {
                return values_.size();
            }

            void resize(size_t size)
            {
                values_.resize(size);
                next_.resize(size);
            }

            /// Writes this column's base pointer to `*out`, then the remaining ones in order.
            void get(HEAD** out, TAIL**... outs)
            {
                *out = values_.data();
                next_.get(outs...);
            }

            void get(const HEAD** out, const TAIL**... outs) const
            {
                *out = values_.data();
                next_.get(outs...);
            }

            /// Appends this column's element to the collected values and recurses.
            template<typename OBJECT, typename... FIELDS>
            void load(size_t idx, OBJECT& value, FIELDS... args) const
            {
                next_.load(idx, value, args..., values_[idx]);
            }

            /// Appends a reference to this column's element and recurses.
            template<typename OBJECT, typename... FIELDS>
            void store(size_t idx, const OBJECT& value, FIELDS&... args)
            {
                next_.store(idx, value, args..., values_[idx]);
            }
        };
    } // namespace Detail

    /// Public SoA container for objects of type TYPE, stored as one column per STORAGE_TYPES entry.
    /// load()/store() need Soa::Load / Soa::Store to be defined for TYPE and these column types.
    /// Indices are not range-checked.
    template<typename TYPE, typename... STORAGE_TYPES>
    class Storage
    {
    public:
        Storage() {}

        /// Number of elements in each column.
        size_t size() const
        {
            return storage_.size();
        }

        /// Resizes every column to `size` elements.
        void resize(size_t size)
        {
            storage_.resize(size);
        }

        /// Rebuilds `value` from element `idx` of every column.
        void load(size_t idx, TYPE& value) const
        {
            storage_.load(idx, value);
        }

        /// Splits `value` into element `idx` of every column.
        void store(size_t idx, const TYPE& value)
        {
            storage_.store(idx, value);
        }

        /// Returns the raw base pointer of each column through the matching out-parameter.
        void get(STORAGE_TYPES**... outs)
        {
            storage_.get(outs...);
        }

        void get(const STORAGE_TYPES**... outs) const
        {
            storage_.get(outs...);
        }

    private:
        Detail::StorageImpl<STORAGE_TYPES...> storage_;
    };
} // namespace Soa
/// Sample per-object record used by the AoS/SoA comparison in this file.
/// Only x/y/z take part in the processing routines; the remaining members are
/// extra payload that makes each array-of-structures element wider.
struct MyGameObject
{
    float x = 0.0f;
    float y = 0.0f;
    float z = 0.0f;
    // Zero-initialized so that a default-constructed object never carries (and later copies)
    // indeterminate values.
    float someArray[16] = {};
    int someInt = 2;
    bool someBool = false;
};
namespace Soa | |
{ | |
template<> | |
inline void Load(MyGameObject& val, float x, float y, float z) | |
{ | |
val.x = x; | |
val.y = y; | |
val.z = z; | |
} | |
template<> | |
inline void Store(const MyGameObject& val, float& x, float& y, float& z) | |
{ | |
x = val.x; | |
y = val.y; | |
z = val.z; | |
} | |
} | |
using MyGameObjectSoa = Soa::Storage<MyGameObject, float, float, float>; | |
using MyGameObjectAos = std::vector<MyGameObject>; | |
void VertexProcessingAos(const MyGameObjectAos& inputA, const MyGameObjectAos& inputB, MyGameObjectAos& output) | |
{ | |
size_t num = inputA.size(); | |
size_t numSimd = num - (num % 4); | |
size_t idx = 0; | |
const MyGameObject* inA = inputA.data(); | |
const MyGameObject* inB = inputB.data(); | |
for(idx = 0; idx < numSimd; idx += 4) | |
{ | |
__m128 i1X = _mm_set_ps(inA[3].x, inA[2].x, inA[1].x, inA[0].x); | |
__m128 i2X = _mm_set_ps(inB[3].x, inB[2].x, inB[1].x, inB[0].x); | |
__m128 i1Y = _mm_set_ps(inA[3].y, inA[2].y, inA[1].y, inA[0].y); | |
__m128 i2Y = _mm_set_ps(inB[3].y, inB[2].y, inB[1].y, inB[0].y); | |
__m128 i1Z = _mm_set_ps(inA[3].z, inA[2].z, inA[1].z, inA[0].z); | |
__m128 i2Z = _mm_set_ps(inB[3].z, inB[2].z, inB[1].z, inB[0].z); | |
__m128 oX = _mm_add_ps(i1X, i2X); | |
__m128 oY = _mm_add_ps(i1Y, i2Y); | |
__m128 oZ = _mm_add_ps(i1Z, i2Z); | |
output[idx + 0].x = _mm_cvtss_f32(oX); | |
oX = _mm_shuffle_ps(oX, oX, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 1].x = _mm_cvtss_f32(oX); | |
oX = _mm_shuffle_ps(oX, oX, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 2].x = _mm_cvtss_f32(oX); | |
oX = _mm_shuffle_ps(oX, oX, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 3].x = _mm_cvtss_f32(oX); | |
oX = _mm_shuffle_ps(oX, oX, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 0].y = _mm_cvtss_f32(oY); | |
oY = _mm_shuffle_ps(oY, oY, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 1].y = _mm_cvtss_f32(oY); | |
oY = _mm_shuffle_ps(oY, oY, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 2].y = _mm_cvtss_f32(oY); | |
oY = _mm_shuffle_ps(oY, oY, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 3].y = _mm_cvtss_f32(oY); | |
oY = _mm_shuffle_ps(oY, oY, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 0].z = _mm_cvtss_f32(oZ); | |
oZ = _mm_shuffle_ps(oZ, oZ, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 1].z = _mm_cvtss_f32(oZ); | |
oZ = _mm_shuffle_ps(oZ, oZ, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 2].z = _mm_cvtss_f32(oZ); | |
oZ = _mm_shuffle_ps(oZ, oZ, _MM_SHUFFLE(0, 3, 2, 1)); | |
output[idx + 3].z = _mm_cvtss_f32(oZ); | |
oZ = _mm_shuffle_ps(oZ, oZ, _MM_SHUFFLE(0, 3, 2, 1)); | |
inA += 4; | |
inB += 4; | |
} | |
} | |
void VertexProcessingSoa(const MyGameObjectSoa& inputA, const MyGameObjectSoa& inputB, MyGameObjectSoa& output) | |
{ | |
size_t num = inputA.size(); | |
size_t numSimd = num - (num % 4); | |
size_t idx = 0; | |
const float* _i1X = nullptr; | |
const float* _i1Y = nullptr; | |
const float* _i1Z = nullptr; | |
const float* _i2X = nullptr; | |
const float* _i2Y = nullptr; | |
const float* _i2Z = nullptr; | |
float* _oX = nullptr; | |
float* _oY = nullptr; | |
float* _oZ = nullptr; | |
inputA.get(&_i1X, &_i1Y, &_i1Z); | |
inputB.get(&_i2X, &_i2Y, &_i2Z); | |
output.get(&_oX, &_oY, &_oZ); | |
for(idx = 0; idx < numSimd; idx += 4) | |
{ | |
__m128 i1X = _mm_load_ps(_i1X); | |
__m128 i2X = _mm_load_ps(_i2X); | |
__m128 i1Y = _mm_load_ps(_i1Y); | |
__m128 i2Y = _mm_load_ps(_i2Y); | |
__m128 i1Z = _mm_load_ps(_i1Z); | |
__m128 i2Z = _mm_load_ps(_i2Z); | |
__m128 oX = _mm_add_ps(i1X, i2X); | |
__m128 oY = _mm_add_ps(i1Y, i2Y); | |
__m128 oZ = _mm_add_ps(i1Z, i2Z); | |
_mm_store_ps(_oX, oX); | |
_mm_store_ps(_oY, oY); | |
_mm_store_ps(_oZ, oZ); | |
_i1X += 4; | |
_i1Y += 4; | |
_i1Z += 4; | |
_i2X += 4; | |
_i2Y += 4; | |
_i2Z += 4; | |
_oX += 4; | |
_oY += 4; | |
_oZ += 4; | |
} | |
} | |
/// Minimal interval timer based on the value returned by __rdtsc().
/// Call Start(), run the work, call Stop(), then query the elapsed count.
struct RDTSCTimer
{
    uint64_t begin_ = 0;
    uint64_t end_ = 0;

    /// Records the counter value at the start of the measured region.
    void Start()
    {
        begin_ = __rdtsc();
    }

    /// Records the counter value at the end of the measured region.
    void Stop()
    {
        end_ = __rdtsc();
    }

    /// Counter difference between the last Stop() and the last Start() (unsigned arithmetic).
    uint64_t GetCycles() const
    {
        return end_ - begin_;
    }

    /// Elapsed count divided by `rate`, e.g. the number of items processed.
    double GetAvgCycles(uint64_t rate) const
    {
        const double elapsed = static_cast<double>(GetCycles());
        const double divisor = static_cast<double>(rate);
        return elapsed / divisor;
    }
};
int main(int argc, char* const argv[]) | |
{ | |
MyGameObjectAos inputA_Aos; | |
MyGameObjectAos inputB_Aos; | |
MyGameObjectAos output_Aos; | |
MyGameObjectSoa inputA_Soa; | |
MyGameObjectSoa inputB_Soa; | |
MyGameObjectSoa output_Soa; | |
#ifdef DEBUG | |
size_t numVertices = 8; | |
#else | |
size_t numVertices = 1024 * 1024 * 32; | |
#endif | |
inputA_Aos.resize(numVertices); | |
inputB_Aos.resize(numVertices); | |
output_Aos.resize(numVertices); | |
inputA_Soa.resize(numVertices); | |
inputB_Soa.resize(numVertices); | |
output_Soa.resize(numVertices); | |
auto GetRandFloat = []() -> float { | |
int val = rand() % 4096; | |
return (float)val / 4096.0f; | |
}; | |
auto GetRandGameObject = [&GetRandFloat]() -> MyGameObject { | |
MyGameObject obj; | |
obj.x = GetRandFloat(); | |
obj.y = GetRandFloat(); | |
obj.z = GetRandFloat(); | |
return obj; | |
}; | |
for(size_t idx = 0; idx < numVertices; ++idx) | |
{ | |
inputA_Aos[idx] = GetRandGameObject(); | |
inputB_Aos[idx] = GetRandGameObject(); | |
inputA_Soa.store(idx, inputA_Aos[idx]); | |
inputB_Soa.store(idx, inputB_Aos[idx]); | |
} | |
RDTSCTimer timer; | |
// Time SOA. | |
{ | |
timer.Start(); | |
VertexProcessingAos(inputA_Aos, inputB_Aos, output_Aos); | |
timer.Stop(); | |
printf("AOS: %llu cycles (%f per vert)\n", timer.GetCycles(), timer.GetAvgCycles(numVertices)); | |
} | |
// Time AOS. | |
{ | |
timer.Start(); | |
VertexProcessingSoa(inputA_Soa, inputB_Soa, output_Soa); | |
timer.Stop(); | |
printf("SOA: %llu cycles (%f per vert)\n", timer.GetCycles(), timer.GetAvgCycles(numVertices)); | |
} | |
// Validate. | |
float* _oX = nullptr; | |
float* _oY = nullptr; | |
float* _oZ = nullptr; | |
output_Soa.get(&_oX, &_oY, &_oZ); | |
auto FloatCompare = [](float a, float b, float ep) -> bool | |
{ | |
return std::abs(a - b) < ep; | |
}; | |
for(size_t idx = 0; idx < numVertices; ++idx) | |
{ | |
if(!FloatCompare(_oX[idx], output_Aos[idx].x, 1e-24f) || | |
!FloatCompare(_oY[idx], output_Aos[idx].y, 1e-24f) || | |
!FloatCompare(_oZ[idx], output_Aos[idx].z, 1e-24f)) | |
{ | |
printf("Output differs :(\n"); | |
abort(); | |
} | |
} | |
} |
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.