Skip to content

Instantly share code, notes, and snippets.

@daramkun
Created July 3, 2019 15:17
Show Gist options
  • Save daramkun/456d81800ef076a23caed52f3dd7f808 to your computer and use it in GitHub Desktop.
Save daramkun/456d81800ef076a23caed52f3dd7f808 to your computer and use it in GitHub Desktop.
Memory Copy Performance Measure (memcpy, ID3D11DeviceContext::CopyResource)
#include <Windows.h>
#include <d3d11.h>
#include <atlbase.h>
#pragma comment (lib, "d3d11.lib")
#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <execution>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>
/// Length of one benchmark run, in seconds (compared against the elapsed time in measure()).
constexpr double MEASURE_SECONDS = 10.0;
/// Multiplier that turns a byte count into binary gigabytes (2^30 bytes each).
constexpr double GIGABYTE_MAKER = 1.0 / (1024.0 * 1024.0 * 1024.0);
/// Base class for a copy-throughput benchmark driven by a background worker thread.
///
/// run() starts a worker that calls do_measure() in a loop and accumulates the
/// byte counts it returns. stop() ends the loop and waits for the worker to exit.
/// copy_bytes(), proceed_time() and is_running() may be polled from another
/// thread while the worker is active.
///
/// Derived classes must be stopped (stop()) before they are destroyed: their
/// members are torn down before this base destructor runs, and the worker must
/// not be inside do_measure() at that point.
class performance
{
public:
    performance () : _copy_bytes (0), _running (false) { }

    /// Ends the worker loop and joins the worker if one was ever started.
    virtual ~performance () noexcept
    {
        _running = false;
        // Joining a thread that was never started throws std::system_error,
        // which would terminate the process inside this noexcept destructor.
        if (_run.joinable ())
            _run.join ();
    }

public:
    /// Total number of bytes reported by do_measure() since the last run().
    uint64_t copy_bytes () const noexcept { return _copy_bytes; }

    /// Time elapsed since the last run() call, in seconds.
    std::chrono::duration<double> proceed_time () const noexcept
    {
        return std::chrono::steady_clock::now () - _started;
    }

    /// True between run() and stop().
    bool is_running () const noexcept { return _running; }

protected:
    /// Performs one unit of copy work and returns the number of bytes copied.
    /// Called repeatedly on the worker thread.
    virtual size_t do_measure () noexcept = 0;

public:
    /// Resets the byte counter and the start time, then launches the worker.
    /// A worker left over from an earlier run() is stopped and joined first.
    void run () noexcept
    {
        // Assigning over a joinable std::thread terminates the process.
        stop ();

        _copy_bytes = 0;
        _running = true;
        // Same clock as the type of _started; high_resolution_clock is not
        // guaranteed to be steady_clock on every standard library.
        _started = std::chrono::steady_clock::now ();
        _run = std::thread ([this]()
            {
                do
                {
                    _copy_bytes += this->do_measure ();
                    std::this_thread::yield ();
                }
                while (_running);
            }
        );
    }

    /// Signals the worker to finish its current do_measure() call and waits for
    /// it to exit, so copy_bytes() is final once this returns.
    void stop ()
    {
        _running = false;
        // Never join from the worker itself; that would deadlock.
        if (_run.joinable () && _run.get_id () != std::this_thread::get_id ())
            _run.join ();
    }

private:
    // Written by the worker and read by the polling thread, hence atomic.
    std::atomic<uint64_t> _copy_bytes;
    std::chrono::steady_clock::time_point _started;
    std::thread _run;
    // Written by the controlling thread and read by the worker, hence atomic.
    std::atomic<bool> _running;
};
/// CPU benchmark: each do_measure() call runs one 16MB memcpy per worker slot
/// through a parallel std::for_each.
///
/// All workers read the same source buffer, but each one writes to its own
/// 16MB slice of the destination, so no two workers write the same bytes
/// concurrently. The destination therefore occupies BUFFER_SIZE bytes per slot.
class memcpy_performance : public performance
{
private:
    static constexpr size_t BUFFER_SIZE = 1024 * 1024 * 16; //< 16MB

public:
    /// Allocates the buffers and one work item per hardware thread.
    memcpy_performance ()
    {
        // hardware_concurrency() may return 0 when the value is not computable;
        // always keep at least one worker so the benchmark copies something.
        const unsigned int workers = std::max (1u, std::thread::hardware_concurrency ());

        _src.resize (BUFFER_SIZE);
        _dest.resize (BUFFER_SIZE * workers);

        _temp.reserve (workers);
        for (unsigned int i = 0; i < workers; ++i)
            _temp.push_back (i);
    }

protected:
    /// Copies the source buffer once per worker slot and returns the total
    /// number of bytes copied (BUFFER_SIZE times the number of slots).
    virtual size_t do_measure () noexcept override
    {
        std::for_each (std::execution::par_unseq, _temp.begin (), _temp.end (), [this](unsigned int slot)
            {
                // Slot index selects this worker's private destination slice.
                uint8_t* const target = _dest.data () + static_cast<size_t> (slot) * BUFFER_SIZE;
                memcpy (target, _src.data (), BUFFER_SIZE);
            });
        return BUFFER_SIZE * _temp.size ();
    }

private:
    std::vector<uint8_t> _dest, _src;
    // Worker slot indices 0..N-1; the parallel algorithm iterates over these.
    std::vector<unsigned int> _temp;
};
class D3D11CopyResourceRAM2VRAM_performance : public performance
{
public:
D3D11CopyResourceRAM2VRAM_performance (size_t size = 4096, DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM)
{
HRESULT hr = D3D11CreateDevice (nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &_d3dDevice, nullptr, &_immediateContext);
assert (SUCCEEDED (hr));
memset (&_texDesc, 0, sizeof (D3D11_TEXTURE2D_DESC));
_texDesc.Width = _texDesc.Height = size;
_texDesc.ArraySize = 1;
_texDesc.MipLevels = 1;
_texDesc.Format = format;
_texDesc.SampleDesc.Count = 1;
_texDesc.Usage = D3D11_USAGE_DEFAULT;
_texDesc.CPUAccessFlags = 0;
_texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_dest);
assert (SUCCEEDED (hr));
_texDesc.Usage = D3D11_USAGE_STAGING;
_texDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
_texDesc.BindFlags = 0;
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_src);
assert (SUCCEEDED (hr));
_totalSize = size * size * (format == DXGI_FORMAT_R8G8B8A8_UNORM ? 4 : 16);
}
protected:
virtual size_t do_measure () noexcept override
{
_immediateContext->CopyResource (_dest, _src);
_immediateContext->Flush ();
return _totalSize;
}
private:
D3D11_TEXTURE2D_DESC _texDesc;
CComPtr<ID3D11Device> _d3dDevice;
CComPtr<ID3D11DeviceContext> _immediateContext;
CComPtr<ID3D11Texture2D> _dest, _src;
size_t _totalSize;
};
class D3D11CopyResourceVRAM2VRAM_performance : public performance
{
public:
D3D11CopyResourceVRAM2VRAM_performance (size_t size = 4096, DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM)
{
HRESULT hr = D3D11CreateDevice (nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &_d3dDevice, nullptr, &_immediateContext);
assert (SUCCEEDED (hr));
memset (&_texDesc, 0, sizeof (D3D11_TEXTURE2D_DESC));
_texDesc.Width = _texDesc.Height = size;
_texDesc.ArraySize = 1;
_texDesc.MipLevels = 1;
_texDesc.Format = format;
_texDesc.SampleDesc.Count = 1;
_texDesc.Usage = D3D11_USAGE_DEFAULT;
_texDesc.CPUAccessFlags = 0;
_texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_dest);
assert (SUCCEEDED (hr));
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_src);
assert (SUCCEEDED (hr));
_totalSize = size * size * (format == DXGI_FORMAT_R8G8B8A8_UNORM ? 4 : 16);
}
protected:
virtual size_t do_measure () noexcept override
{
_immediateContext->CopyResource (_dest, _src);
_immediateContext->Flush ();
return _totalSize;
}
private:
D3D11_TEXTURE2D_DESC _texDesc;
CComPtr<ID3D11Device> _d3dDevice;
CComPtr<ID3D11DeviceContext> _immediateContext;
CComPtr<ID3D11Texture2D> _dest, _src;
size_t _totalSize;
};
/// Runs one benchmark for MEASURE_SECONDS and prints a live progress line
/// followed by the final result.
///
/// @param testname  Label printed in the header line.
/// @param perf      Benchmark to run. This function takes ownership and deletes
///                  it before returning. A null pointer is reported and skipped.
void measure (const char * testname, performance* perf)
{
    // Longest pause between two progress updates, in seconds. Polling in a
    // tight loop would compete with the benchmark worker for CPU time.
    constexpr double REFRESH_SECONDS = 0.05;

    // Sole owner of the benchmark; no shared ownership is needed here.
    const std::unique_ptr<performance> bench (perf);

    printf ("==== %s Performance Measure ====\n", testname);
    if (!bench)
    {
        printf ("(no benchmark object supplied)\n");
        return;
    }

    // Prints one status line from a single elapsed-time sample, so the time,
    // the rate and the total on that line agree. Returns the sampled time.
    const auto report = [&bench]() -> double
    {
        const double elapsed = bench->proceed_time ().count ();
        const double copied = static_cast<double> (bench->copy_bytes ());
        const double rate = (elapsed > 0.0) ? copied / elapsed : 0.0;
        printf ("\r%3.3lfs... %lfGB/s... Total Copied: %lfGB",
            elapsed, rate * GIGABYTE_MAKER, copied * GIGABYTE_MAKER);
        fflush (stdout);
        return elapsed;
    };

    bench->run ();
    while (bench->is_running ())
    {
        const double elapsed = report ();
        if (elapsed >= MEASURE_SECONDS)
        {
            bench->stop ();
            break;
        }

        // Never sleep past the deadline, so the run ends close to MEASURE_SECONDS.
        const double pause = std::min (MEASURE_SECONDS - elapsed, REFRESH_SECONDS);
        std::this_thread::sleep_for (std::chrono::duration<double> (pause));
    }

    report ();
    putchar ('\n');
}
/// Runs the CPU memcpy benchmark and then the four Direct3D 11 texture-copy
/// benchmarks, one after another. measure() takes ownership of each object.
int main (int, char* [])
{
    // Plain narrow literals: the labels are pure ASCII, and u8"" literals stop
    // converting to const char* from C++20 on (they become const char8_t[]).
    measure ("CPU memcpy", new memcpy_performance ());
    measure ("Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8)", new D3D11CopyResourceRAM2VRAM_performance (4096, DXGI_FORMAT_R8G8B8A8_UNORM));
    measure ("Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF)", new D3D11CopyResourceRAM2VRAM_performance (4096, DXGI_FORMAT_R32G32B32A32_FLOAT));
    measure ("Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8)", new D3D11CopyResourceVRAM2VRAM_performance (4096, DXGI_FORMAT_R8G8B8A8_UNORM));
    measure ("Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF)", new D3D11CopyResourceVRAM2VRAM_performance (4096, DXGI_FORMAT_R32G32B32A32_FLOAT));
    return 0;
}
@daramkun
Copy link
Author

daramkun commented Jul 3, 2019

CPU: AMD Ryzen 5 2600X (Not Overclocked)
RAM: ESSENCORE DDR4 2400MHz 16GBx2 (Overclocked up to 2666MHz)
M/B: MSI B350M Mortar
GPU: Sapphire AMD Radeon RX 480 8GB Nitro+

Result:

==== CPU memcpy Performance Measure ====
10.002s... 15.072728GB/s... Total Copied: 150.750000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 10.749963GB/s... Total Copied: 107.500000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 11.999961GB/s... Total Copied: 120.000000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 93.568324GB/s... Total Copied: 935.687500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 95.074754GB/s... Total Copied: 950.750000GB

Background Information
2666MHz DDR4 Maximum Bandwidth (per 64-bit channel): 21.33333 GB/s.
PCI-Express 3.0 x16 Maximum Bandwidth: 15.75 GB/s.
GDDR5 2000MHz + 256-bit Memory bus Maximum Bandwidth: 256 GB/s.

@daramkun
Copy link
Author

daramkun commented Feb 7, 2020

CPU: AMD Ryzen 5 2600X (Not Overclocked)
RAM: ESSENCORE DDR4 2400MHz 16GBx2 (Overclocked up to 2666MHz)
M/B: MSI B350M Mortar
GPU: AMD Radeon RX 5700XT Reference Model by Sapphire

Result:

==== CPU memcpy Performance Measure ====
10.000s... 15.356055GB/s... Total Copied: 153.562500GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 12.356203GB/s... Total Copied: 123.562500GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 13.624967GB/s... Total Copied: 136.250000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 182.030647GB/s... Total Copied: 1820.312500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 186.499263GB/s... Total Copied: 1865.000000GB

Background Information
GDDR6 1750MHz + 256-bit Memory bus Maximum Bandwidth: 448 GB/s.

@daramkun
Copy link
Author

daramkun commented Feb 7, 2020

CPU: AMD Ryzen 5 1600X (Not Overclocked)
RAM: DDR4 2133MHz 8GBx2 (Not Overclocked)
M/B: ASRock B350M Pro4
GPU: Zotac NVIDIA GeForce GTX 970

Result:

==== CPU memcpy Performance Measure ====
10.000s... 13.245923GB/s... Total Copied: 132.562500GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 12.181203GB/s... Total Copied: 121.812500GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 13.474920GB/s... Total Copied: 134.750000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 127.330644GB/s... Total Copied: 1273.312500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 74.749695GB/s... Total Copied: 747.500000GB

@daramkun
Copy link
Author

daramkun commented Oct 10, 2020

CPU: AMD Ryzen 7 3700X (Not Overclocked)
RAM: ESSENCORE DDR4 2400MHz 16GBx2 (Not Overclocked, XMP RAM Timings)
M/B: MSI B550M Mortar WiFi
GPU: AMD Radeon RX 5700XT Reference Model by Sapphire

Result:

==== CPU memcpy Performance Measure ====
10.005s... 14.043436GB/s... Total Copied: 140.500000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 22.462423GB/s... Total Copied: 224.625000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 23.749940GB/s... Total Copied: 237.500000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 183.493069GB/s... Total Copied: 1834.937500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 187.524475GB/s... Total Copied: 1875.250000GB

Background Information
PCI-Express 4.0 x16 Maximum Bandwidth: 31.5 GB/s.

@daramkun
Copy link
Author

daramkun commented Nov 6, 2020

CPU: AMD Ryzen 7 3700X (Not Overclocked)
RAM: ESSENCORE DDR4 2400MHz 16GBx2 (Not Overclocked, XMP RAM Timings)
M/B: MSI B550M Mortar WiFi
GPU: GALAX NVIDIA GeForce RTX 3070 EX OC

Result:

==== CPU memcpy Performance Measure ====
10.006s... 14.916766GB/s... Total Copied: 149.250000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 22.962413GB/s... Total Copied: 229.625000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 25.024848GB/s... Total Copied: 250.250000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 328.060400GB/s... Total Copied: 3280.625000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 174.724219GB/s... Total Copied: 1747.250000GB

@alexmercerind
Copy link

CPU: AMD Ryzen 3 2200U
RAM: Crucial DDR4 4GBx2
M/B: Lenovo ideapad 330S-15ARR (81FB)
GPU: AMD Radeon Vega 3 Mobile Graphics

Result:

==== CPU memcpy Performance Measure ====
10.007s... 5.233843GB/s... Total Copied: 52.375000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 6.624958GB/s... Total Copied: 66.250000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 7.474972GB/s... Total Copied: 74.750000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 10.406215GB/s... Total Copied: 104.062500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 8.499961GB/s... Total Copied: 85.000000GB

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment