// Source: GitHub Gist by @vk2gpu (gist vk2gpu/9ddc828081d4471706bd6cc93462b9ad),
// last active April 20, 2018 18:04.
// The gist web page's navigation text ("Skip to content", "Show Gist options",
// "Save ... to your computer and use it in GitHub Desktop") is not part of the program.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <xmmintrin.h>
// __rdtsc() lives in a compiler-specific intrinsics header.
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
namespace Soa
{
    /// Customisation point: rebuilds an object of type TYPE from one value per column.
    /// Only declared here; each object type supplies its own explicit specialization.
    template<typename TYPE, typename... ARGS>
    void Load(TYPE&, ARGS...);

    /// Customisation point: writes the fields of `val` out to one reference per column.
    /// Only declared here; each object type supplies its own explicit specialization.
    template<typename TYPE, typename... ARGS>
    void Store(const TYPE& val, ARGS&...);

    namespace Detail
    {
        /// Recursive column list: one std::vector per column type.
        /// Only the one-column and the head+tail cases are defined.
        template<typename... COLUMNS>
        struct StorageImpl;

        /// Terminal case: the last (or only) column.
        template<typename LAST>
        struct StorageImpl<LAST>
        {
            std::vector<LAST> values_;

            void resize(size_t count) { values_.resize(count); }

            size_t size() const { return values_.size(); }

            /// Hands out the raw column pointer (mutable / read-only overloads).
            void get(LAST** column) { *column = values_.data(); }
            void get(const LAST** column) const { *column = values_.data(); }

            /// `args` holds the values collected from the preceding columns; this
            /// column's element is appended and the complete set is given to Soa::Load.
            template<typename TYPE, typename... ARGS>
            void load(size_t idx, TYPE& value, ARGS... args) const
            {
                const LAST& element = values_[idx];
                Soa::Load(value, args..., element);
            }

            /// `args` holds references collected from the preceding columns; this
            /// column's element is appended and the complete set is given to Soa::Store.
            template<typename TYPE, typename... ARGS>
            void store(size_t idx, const TYPE& value, ARGS&... args)
            {
                LAST& element = values_[idx];
                Soa::Store(value, args..., element);
            }
        };

        /// Recursive case: the first column plus the storage for all remaining columns.
        template<typename HEAD, typename... TAIL>
        struct StorageImpl<HEAD, TAIL...>
        {
            std::vector<HEAD> values_;
            StorageImpl<TAIL...> next_;

            /// Resizes this column first, then every following column.
            void resize(size_t count)
            {
                values_.resize(count);
                next_.resize(count);
            }

            /// Reports the size of the first column.
            size_t size() const { return values_.size(); }

            /// Writes this column's pointer, then forwards the remaining out-pointers.
            void get(HEAD** column, TAIL**... rest)
            {
                *column = values_.data();
                next_.get(rest...);
            }

            void get(const HEAD** column, const TAIL**... rest) const
            {
                *column = values_.data();
                next_.get(rest...);
            }

            /// Appends this column's element to `args` and recurses into the next column.
            template<typename TYPE, typename... ARGS>
            void load(size_t idx, TYPE& value, ARGS... args) const
            {
                next_.load(idx, value, args..., values_[idx]);
            }

            /// Appends a reference to this column's element to `args` and recurses.
            template<typename TYPE, typename... ARGS>
            void store(size_t idx, const TYPE& value, ARGS&... args)
            {
                next_.store(idx, value, args..., values_[idx]);
            }
        };
    } // namespace Detail

    /// Structure-of-arrays container for objects of type TYPE.
    /// Each entry of STORAGE_TYPES becomes one contiguous column; converting between
    /// TYPE and its columns is delegated to the Soa::Load / Soa::Store specializations.
    template<typename TYPE, typename... STORAGE_TYPES>
    class Storage
    {
    public:
        Storage() {}

        /// Resizes every column to `count` elements.
        void resize(size_t count) { storage_.resize(count); }

        /// Number of elements per column.
        size_t size() const { return storage_.size(); }

        /// Reassembles element `idx` into `value` via Soa::Load.
        void load(size_t idx, TYPE& value) const { storage_.load(idx, value); }

        /// Splits `value` across the columns at element `idx` via Soa::Store.
        void store(size_t idx, const TYPE& value) { storage_.store(idx, value); }

        /// Retrieves the raw column pointers, one out-pointer per column, in order.
        void get(STORAGE_TYPES**... outs) { storage_.get(outs...); }
        void get(const STORAGE_TYPES**... outs) const { storage_.get(outs...); }

    private:
        Detail::StorageImpl<STORAGE_TYPES...> storage_;
    };
} // namespace Soa
/// Example game object used by the AoS/SoA comparison.
/// Only x, y and z are read or written by the processing functions in this file;
/// the remaining members simply make the object larger than the data being processed.
struct MyGameObject
{
    float x = 0.0f;
    float y = 0.0f;
    float z = 0.0f;
    // Value-initialized so that a default-initialized object (e.g. a plain local)
    // never carries indeterminate floats that are later copied around.
    float someArray[16] = {};
    int someInt = 2;
    bool someBool = false;
};
namespace Soa
{
template<>
inline void Load(MyGameObject& val, float x, float y, float z)
{
val.x = x;
val.y = y;
val.z = z;
}
template<>
inline void Store(const MyGameObject& val, float& x, float& y, float& z)
{
x = val.x;
y = val.y;
z = val.z;
}
}
// SoA container for MyGameObject: three parallel float columns which the
// Soa::Load / Soa::Store specializations for MyGameObject map to x, y and z.
using MyGameObjectSoa = Soa::Storage<MyGameObject, float, float, float>;
// AoS container: one contiguous array of complete MyGameObject values.
using MyGameObjectAos = std::vector<MyGameObject>;
void VertexProcessingAos(const MyGameObjectAos& inputA, const MyGameObjectAos& inputB, MyGameObjectAos& output)
{
size_t num = inputA.size();
size_t numSimd = num - (num % 4);
size_t idx = 0;
const MyGameObject* inA = inputA.data();
const MyGameObject* inB = inputB.data();
for(idx = 0; idx < numSimd; idx += 4)
{
__m128 i1X = _mm_set_ps(inA[3].x, inA[2].x, inA[1].x, inA[0].x);
__m128 i2X = _mm_set_ps(inB[3].x, inB[2].x, inB[1].x, inB[0].x);
__m128 i1Y = _mm_set_ps(inA[3].y, inA[2].y, inA[1].y, inA[0].y);
__m128 i2Y = _mm_set_ps(inB[3].y, inB[2].y, inB[1].y, inB[0].y);
__m128 i1Z = _mm_set_ps(inA[3].z, inA[2].z, inA[1].z, inA[0].z);
__m128 i2Z = _mm_set_ps(inB[3].z, inB[2].z, inB[1].z, inB[0].z);
__m128 oX = _mm_add_ps(i1X, i2X);
__m128 oY = _mm_add_ps(i1Y, i2Y);
__m128 oZ = _mm_add_ps(i1Z, i2Z);
output[idx + 0].x = _mm_cvtss_f32(oX);
oX = _mm_shuffle_ps(oX, oX, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 1].x = _mm_cvtss_f32(oX);
oX = _mm_shuffle_ps(oX, oX, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 2].x = _mm_cvtss_f32(oX);
oX = _mm_shuffle_ps(oX, oX, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 3].x = _mm_cvtss_f32(oX);
oX = _mm_shuffle_ps(oX, oX, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 0].y = _mm_cvtss_f32(oY);
oY = _mm_shuffle_ps(oY, oY, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 1].y = _mm_cvtss_f32(oY);
oY = _mm_shuffle_ps(oY, oY, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 2].y = _mm_cvtss_f32(oY);
oY = _mm_shuffle_ps(oY, oY, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 3].y = _mm_cvtss_f32(oY);
oY = _mm_shuffle_ps(oY, oY, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 0].z = _mm_cvtss_f32(oZ);
oZ = _mm_shuffle_ps(oZ, oZ, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 1].z = _mm_cvtss_f32(oZ);
oZ = _mm_shuffle_ps(oZ, oZ, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 2].z = _mm_cvtss_f32(oZ);
oZ = _mm_shuffle_ps(oZ, oZ, _MM_SHUFFLE(0, 3, 2, 1));
output[idx + 3].z = _mm_cvtss_f32(oZ);
oZ = _mm_shuffle_ps(oZ, oZ, _MM_SHUFFLE(0, 3, 2, 1));
inA += 4;
inB += 4;
}
}
void VertexProcessingSoa(const MyGameObjectSoa& inputA, const MyGameObjectSoa& inputB, MyGameObjectSoa& output)
{
size_t num = inputA.size();
size_t numSimd = num - (num % 4);
size_t idx = 0;
const float* _i1X = nullptr;
const float* _i1Y = nullptr;
const float* _i1Z = nullptr;
const float* _i2X = nullptr;
const float* _i2Y = nullptr;
const float* _i2Z = nullptr;
float* _oX = nullptr;
float* _oY = nullptr;
float* _oZ = nullptr;
inputA.get(&_i1X, &_i1Y, &_i1Z);
inputB.get(&_i2X, &_i2Y, &_i2Z);
output.get(&_oX, &_oY, &_oZ);
for(idx = 0; idx < numSimd; idx += 4)
{
__m128 i1X = _mm_load_ps(_i1X);
__m128 i2X = _mm_load_ps(_i2X);
__m128 i1Y = _mm_load_ps(_i1Y);
__m128 i2Y = _mm_load_ps(_i2Y);
__m128 i1Z = _mm_load_ps(_i1Z);
__m128 i2Z = _mm_load_ps(_i2Z);
__m128 oX = _mm_add_ps(i1X, i2X);
__m128 oY = _mm_add_ps(i1Y, i2Y);
__m128 oZ = _mm_add_ps(i1Z, i2Z);
_mm_store_ps(_oX, oX);
_mm_store_ps(_oY, oY);
_mm_store_ps(_oZ, oZ);
_i1X += 4;
_i1Y += 4;
_i1Z += 4;
_i2X += 4;
_i2Y += 4;
_i2Z += 4;
_oX += 4;
_oY += 4;
_oZ += 4;
}
}
/// Minimal stopwatch based on the __rdtsc() counter.
/// Call Start(), run the work, call Stop(), then query the elapsed count.
struct RDTSCTimer
{
    uint64_t begin_ = 0;
    uint64_t end_ = 0;

    /// Records the counter value at the start of the measured region.
    void Start()
    {
        begin_ = __rdtsc();
    }

    /// Records the counter value at the end of the measured region.
    void Stop()
    {
        end_ = __rdtsc();
    }

    /// Counter difference between the last Stop() and the last Start().
    uint64_t GetCycles() const
    {
        const uint64_t elapsed = end_ - begin_;
        return elapsed;
    }

    /// Elapsed count divided by `rate` (e.g. the number of items processed).
    double GetAvgCycles(uint64_t rate) const
    {
        const double total = static_cast<double>(GetCycles());
        const double divisor = static_cast<double>(rate);
        return total / divisor;
    }
};
/// Benchmark driver.
///
/// Fills two inputs with the same pseudo-random data in both AoS and SoA form,
/// times VertexProcessingAos and VertexProcessingSoa with RDTSCTimer, prints the
/// cycle counts, and aborts if the two outputs disagree.
int main(int argc, char* const argv[])
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    MyGameObjectAos inputA_Aos;
    MyGameObjectAos inputB_Aos;
    MyGameObjectAos output_Aos;
    MyGameObjectSoa inputA_Soa;
    MyGameObjectSoa inputB_Soa;
    MyGameObjectSoa output_Soa;

    // A tiny data set when DEBUG is defined, a large one otherwise.
#ifdef DEBUG
    size_t numVertices = 8;
#else
    size_t numVertices = 1024 * 1024 * 32;
#endif

    inputA_Aos.resize(numVertices);
    inputB_Aos.resize(numVertices);
    output_Aos.resize(numVertices);
    inputA_Soa.resize(numVertices);
    inputB_Soa.resize(numVertices);
    output_Soa.resize(numVertices);

    // Returns a value in [0, 1) in steps of 1/4096.
    // rand() is never seeded in this function.
    auto GetRandFloat = []() -> float {
        int val = rand() % 4096;
        return static_cast<float>(val) / 4096.0f;
    };

    auto GetRandGameObject = [&GetRandFloat]() -> MyGameObject {
        MyGameObject obj;
        obj.x = GetRandFloat();
        obj.y = GetRandFloat();
        obj.z = GetRandFloat();
        return obj;
    };

    // Generate the AoS inputs and mirror each element into the SoA inputs so both
    // code paths operate on identical data.
    for(size_t idx = 0; idx < numVertices; ++idx)
    {
        inputA_Aos[idx] = GetRandGameObject();
        inputB_Aos[idx] = GetRandGameObject();
        inputA_Soa.store(idx, inputA_Aos[idx]);
        inputB_Soa.store(idx, inputB_Aos[idx]);
    }

    RDTSCTimer timer;

    // Time AOS.
    {
        timer.Start();
        VertexProcessingAos(inputA_Aos, inputB_Aos, output_Aos);
        timer.Stop();
        // uint64_t is not necessarily unsigned long long, so cast to match %llu.
        printf("AOS: %llu cycles (%f per vert)\n", static_cast<unsigned long long>(timer.GetCycles()), timer.GetAvgCycles(numVertices));
    }

    // Time SOA.
    {
        timer.Start();
        VertexProcessingSoa(inputA_Soa, inputB_Soa, output_Soa);
        timer.Stop();
        printf("SOA: %llu cycles (%f per vert)\n", static_cast<unsigned long long>(timer.GetCycles()), timer.GetAvgCycles(numVertices));
    }

    // Validate: every component of the SoA result must match the AoS result.
    float* _oX = nullptr;
    float* _oY = nullptr;
    float* _oZ = nullptr;
    output_Soa.get(&_oX, &_oY, &_oZ);

    auto FloatCompare = [](float a, float b, float ep) -> bool
    {
        return std::abs(a - b) < ep;
    };

    for(size_t idx = 0; idx < numVertices; ++idx)
    {
        // With inputs that are multiples of 1/4096, a tolerance of 1e-24 is
        // effectively an exact-match check.
        if(!FloatCompare(_oX[idx], output_Aos[idx].x, 1e-24f) ||
           !FloatCompare(_oY[idx], output_Aos[idx].y, 1e-24f) ||
           !FloatCompare(_oZ[idx], output_Aos[idx].z, 1e-24f))
        {
            printf("Output differs :(\n");
            abort();
        }
    }

    return 0;
}
// (Gist page footer, not part of the program: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")