Skip to content

Instantly share code, notes, and snippets.

@daramkun
Created July 3, 2019 15:17
Show Gist options
  • Save daramkun/456d81800ef076a23caed52f3dd7f808 to your computer and use it in GitHub Desktop.
Save daramkun/456d81800ef076a23caed52f3dd7f808 to your computer and use it in GitHub Desktop.
Memory Copy Performance Measure (memcpy, ID3D11DeviceContext::CopyResource)
#include <Windows.h>
#include <d3d11.h>
#include <atlbase.h>
#pragma comment (lib, "d3d11.lib")
#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <exception>
#include <execution>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <thread>
#include <vector>
/// Wall-clock duration, in seconds, that each benchmark is allowed to run.
constexpr double MEASURE_SECONDS = 10.0;
/// Multiplier that converts a byte count into gibibytes (1 / 2^30).
constexpr double GIGABYTE_MAKER = 1.0 / (1024.0 * 1024.0 * 1024.0);
/// Base class for a copy-throughput benchmark.
///
/// run() starts a worker thread that calls do_measure() in a loop and adds
/// each returned byte count to a running total; stop() ends the loop and
/// waits for the worker to finish. copy_bytes() and is_running() may be
/// polled from another thread while the worker is active. run(), stop() and
/// proceed_time() are meant to be called from the single controlling thread.
///
/// Derived classes should call stop() in their own destructor: the base
/// destructor only runs after the derived members are already destroyed, so
/// it cannot protect resources that do_measure() touches.
class performance
{
public:
    performance () = default;
    // The worker lambda captures `this`, so the object must never be copied.
    performance (const performance&) = delete;
    performance& operator= (const performance&) = delete;
    virtual ~performance () noexcept
    {
        // Safe even when run() was never called: stop() only joins a
        // joinable thread.
        stop ();
    }
public:
    /// Total number of bytes reported by do_measure() since the last run().
    uint64_t copy_bytes () const noexcept { return _copy_bytes.load (); }
    /// Time elapsed since run(); frozen at the moment stop() returned.
    /// Returns a zero duration if run() has never been called.
    std::chrono::duration<double> proceed_time () const noexcept
    {
        const std::chrono::steady_clock::time_point end =
            _running.load () ? std::chrono::steady_clock::now () : _stopped;
        return end - _started;
    }
    /// True between run() and stop().
    bool is_running () const noexcept { return _running.load (); }
protected:
    /// Performs one unit of copy work on the worker thread.
    /// @return number of bytes copied by this call.
    virtual size_t do_measure () noexcept = 0;
public:
    /// Resets the byte counter and starts the worker thread. If a previous
    /// run is still active it is stopped first.
    void run () noexcept
    {
        if (_run.joinable ())
            stop ();
        // Reset before the thread exists so a reader never observes a stale
        // total from an earlier run.
        _copy_bytes = 0;
        _running = true;
        _started = std::chrono::steady_clock::now ();
        _run = std::thread ([this]()
            {
                // do/while: at least one measurement is taken even if stop()
                // is requested immediately.
                do
                {
                    _copy_bytes += this->do_measure ();
                    std::this_thread::yield ();
                }
                while (_running.load ());
            }
        );
    }
    /// Asks the worker to finish and blocks until it has exited, so that no
    /// do_measure() call is in flight once this returns. Calling it again,
    /// or without a prior run(), is a no-op.
    void stop ()
    {
        _running = false;
        if (_run.joinable ())
        {
            _run.join ();
            _stopped = std::chrono::steady_clock::now ();
        }
    }
private:
    std::atomic<uint64_t> _copy_bytes { 0 };
    // steady_clock on both sides: high_resolution_clock is not guaranteed to
    // be the same type as steady_clock on every standard library.
    std::chrono::steady_clock::time_point _started {};
    std::chrono::steady_clock::time_point _stopped {};
    std::thread _run;
    std::atomic<bool> _running { false };
};
class memcpy_performance : public performance
{
private:
const size_t BUFFER_SIZE = 1024 * 1024 * 16; //< 16MB
public:
memcpy_performance ()
{
_dest.resize (BUFFER_SIZE);
_src.resize (BUFFER_SIZE);
for (unsigned int i = 0; i < std::thread::hardware_concurrency (); ++i)
_temp.push_back (i);
}
protected:
virtual size_t do_measure () noexcept override
{
std::for_each (std::execution::par_unseq, _temp.begin (), _temp.end (), [this](unsigned int i)
{
memcpy (_dest.data (), _src.data (), BUFFER_SIZE);
});
return BUFFER_SIZE * _temp.size ();
}
private:
std::vector<uint8_t> _dest, _src;
std::vector<unsigned int> _temp;
};
class D3D11CopyResourceRAM2VRAM_performance : public performance
{
public:
D3D11CopyResourceRAM2VRAM_performance (size_t size = 4096, DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM)
{
HRESULT hr = D3D11CreateDevice (nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &_d3dDevice, nullptr, &_immediateContext);
assert (SUCCEEDED (hr));
memset (&_texDesc, 0, sizeof (D3D11_TEXTURE2D_DESC));
_texDesc.Width = _texDesc.Height = size;
_texDesc.ArraySize = 1;
_texDesc.MipLevels = 1;
_texDesc.Format = format;
_texDesc.SampleDesc.Count = 1;
_texDesc.Usage = D3D11_USAGE_DEFAULT;
_texDesc.CPUAccessFlags = 0;
_texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_dest);
assert (SUCCEEDED (hr));
_texDesc.Usage = D3D11_USAGE_STAGING;
_texDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
_texDesc.BindFlags = 0;
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_src);
assert (SUCCEEDED (hr));
_totalSize = size * size * (format == DXGI_FORMAT_R8G8B8A8_UNORM ? 4 : 16);
}
protected:
virtual size_t do_measure () noexcept override
{
_immediateContext->CopyResource (_dest, _src);
_immediateContext->Flush ();
return _totalSize;
}
private:
D3D11_TEXTURE2D_DESC _texDesc;
CComPtr<ID3D11Device> _d3dDevice;
CComPtr<ID3D11DeviceContext> _immediateContext;
CComPtr<ID3D11Texture2D> _dest, _src;
size_t _totalSize;
};
class D3D11CopyResourceVRAM2VRAM_performance : public performance
{
public:
D3D11CopyResourceVRAM2VRAM_performance (size_t size = 4096, DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM)
{
HRESULT hr = D3D11CreateDevice (nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &_d3dDevice, nullptr, &_immediateContext);
assert (SUCCEEDED (hr));
memset (&_texDesc, 0, sizeof (D3D11_TEXTURE2D_DESC));
_texDesc.Width = _texDesc.Height = size;
_texDesc.ArraySize = 1;
_texDesc.MipLevels = 1;
_texDesc.Format = format;
_texDesc.SampleDesc.Count = 1;
_texDesc.Usage = D3D11_USAGE_DEFAULT;
_texDesc.CPUAccessFlags = 0;
_texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_dest);
assert (SUCCEEDED (hr));
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_src);
assert (SUCCEEDED (hr));
_totalSize = size * size * (format == DXGI_FORMAT_R8G8B8A8_UNORM ? 4 : 16);
}
protected:
virtual size_t do_measure () noexcept override
{
_immediateContext->CopyResource (_dest, _src);
_immediateContext->Flush ();
return _totalSize;
}
private:
D3D11_TEXTURE2D_DESC _texDesc;
CComPtr<ID3D11Device> _d3dDevice;
CComPtr<ID3D11DeviceContext> _immediateContext;
CComPtr<ID3D11Texture2D> _dest, _src;
size_t _totalSize;
};
/// Runs one benchmark for MEASURE_SECONDS and prints live progress followed
/// by a final summary line.
///
/// @param testname label printed in the section header.
/// @param perf     benchmark to run; this function takes ownership and
///                 deletes it before returning. A null pointer is reported
///                 and skipped.
void measure (const char * testname, performance* perf)
{
    // Take ownership first so the benchmark is released on every path.
    const std::unique_ptr<performance> owned (perf);
    printf ("==== %s Performance Measure ====\n", testname);
    if (!owned)
    {
        printf ("(no benchmark supplied)\n");
        return;
    }

    // Prints elapsed time, throughput and total volume on the current line.
    const auto report = [&owned]()
    {
        const double seconds = owned->proceed_time ().count ();
        const double copied = static_cast<double> (owned->copy_bytes ());
        // Avoid dividing by a zero elapsed time right after start-up.
        const double rate = seconds > 0.0 ? copied / seconds : 0.0;
        printf ("\r%3.3lfs... %lfGB/s... Total Copied: %lfGB",
            seconds,
            rate * GIGABYTE_MAKER,
            copied * GIGABYTE_MAKER);
        fflush (stdout);
    };

    owned->run ();
    while (owned->proceed_time ().count () < MEASURE_SECONDS)
    {
        report ();
        // Sleep between updates instead of spinning on printf, so the
        // progress display does not compete with the benchmark for CPU.
        std::this_thread::sleep_for (std::chrono::milliseconds (100));
    }
    owned->stop ();
    report ();
    putchar ('\n');
}
/// Runs every benchmark in sequence: CPU memcpy, then staging-to-default and
/// default-to-default texture copies in two texel formats.
///
/// @return 0 on success, 1 if a benchmark threw an exception.
int main (int argc, char* argv[])
{
    (void) argc;
    (void) argv;
    try
    {
        // Plain narrow literals: the labels are pure ASCII, and u8"" literals
        // stop converting to const char* from C++20 onwards.
        measure ("CPU memcpy", new memcpy_performance ());
        measure ("Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8)", new D3D11CopyResourceRAM2VRAM_performance (4096, DXGI_FORMAT_R8G8B8A8_UNORM));
        measure ("Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF)", new D3D11CopyResourceRAM2VRAM_performance (4096, DXGI_FORMAT_R32G32B32A32_FLOAT));
        measure ("Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8)", new D3D11CopyResourceVRAM2VRAM_performance (4096, DXGI_FORMAT_R8G8B8A8_UNORM));
        measure ("Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF)", new D3D11CopyResourceVRAM2VRAM_performance (4096, DXGI_FORMAT_R32G32B32A32_FLOAT));
    }
    catch (const std::exception& error)
    {
        // Report the failure (for example an allocation failure while a
        // benchmark is being set up) instead of terminating without a message.
        fprintf (stderr, "\nbenchmark failed: %s\n", error.what ());
        return 1;
    }
    return 0;
}
@alexmercerind
Copy link

CPU: AMD Ryzen 3 2200U
RAM: Crucial DDR4 4GBx2
M/B: Lenovo ideapad 330S-15ARR (81FB)
GPU: AMD Radeon Vega 3 Mobile Graphics

Result:

==== CPU memcpy Performance Measure ====
10.007s... 5.233843GB/s... Total Copied: 52.375000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 6.624958GB/s... Total Copied: 66.250000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 7.474972GB/s... Total Copied: 74.750000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 10.406215GB/s... Total Copied: 104.062500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 8.499961GB/s... Total Copied: 85.000000GB

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment