Source code for "Optimal grid rendering is not optimal" article.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <d3d11.h> | |
#include <d3dcompiler.h> | |
#include <stdio.h> | |
#include <assert.h> | |
#include <cassert> | |
#include <cmath> | |
#include <vector> | |
#pragma comment (lib, "d3d11.lib") | |
#pragma comment (lib, "d3dcompiler.lib") | |
#pragma comment (lib, "dxgi.lib") | |
namespace TomF | |
{ | |
const size_t max_cache_size = 32; | |
const float valence_boost_scale = 2.0f; | |
const float last_triangle_score = 0.75f; | |
// The table is for max_cache_size = 32, cache_decay_power = 1.5, | |
// Equation is ((position - 3) / (max_cache_size - 3)) ^ cache_decay_power | |
static const float cache_scores[max_cache_size] = | |
{ | |
last_triangle_score, last_triangle_score, last_triangle_score, | |
// perl -e "for ($i = 32; $i > 3; --$i) {print sqrt(($i - 3) / 29)*(($i-3)/29);print 'f, ';}" | |
1, 0.948724356160999f, 0.898356365398566f, 0.84891268884993f, 0.800410940418327f, | |
0.752869781209394f, 0.706309027617917f, 0.660749775712085f, 0.616214545218346f, | |
0.572727447267172f, 0.530314381193352f, 0.489003267201725f, 0.448824323769283f, | |
0.409810401494183f, 0.371997389083488f, 0.335424712859142f, 0.300135959460546f, | |
0.266179663821797f, 0.233610323536753f, 0.202489730867139f, 0.172888763130359f, | |
0.144889856948659f, 0.118590544520125f, 0.0941087226511742f, 0.0715909309083965f, | |
0.0512263001867729f, 0.0332724579777247f, 0.0181112321185824f, 0.00640328752334662f, | |
}; | |
struct Vertex | |
{ | |
Vertex(): cache_position(-1), score(0), triangles_total(0), triangles_not_added(0), triangles(0) | |
{ | |
} | |
int cache_position; | |
float score; | |
unsigned int triangles_total; | |
unsigned int triangles_not_added; | |
unsigned int* triangles; | |
}; | |
static float vertexScore(const Vertex& vertex) | |
{ | |
// check if there are any triangles needed | |
if (vertex.triangles_not_added == 0) return -1; | |
float score = vertex.cache_position == -1 ? 0 : cache_scores[vertex.cache_position]; | |
// Bonus points for having a low number of tris still to use the vert, so we get rid of lone verts quickly. | |
// return score + valence_boost_scale * powf(vertex.triangles_not_added, -valence_boost_power); | |
// for valence_boost_power = 0.5f | |
return score + valence_boost_scale / sqrtf((float)vertex.triangles_not_added); | |
} | |
void optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size) | |
{ | |
// support in-place optimization | |
std::vector<unsigned int> indices_copy; | |
if (destination == indices) | |
{ | |
indices_copy.assign(indices, indices + index_count); | |
indices = &indices_copy[0]; | |
} | |
// clamp cache size to the size of LUT | |
if (cache_size > max_cache_size) | |
cache_size = max_cache_size; | |
const float m_inf = -1e32f; | |
assert(index_count % 3 == 0); | |
size_t face_count = index_count / 3; | |
std::vector<Vertex> vertices(vertex_count); | |
// initializing vertex data | |
for (size_t i = 0; i < index_count; ++i) | |
{ | |
vertices[indices[i]].triangles_total++; | |
} | |
// calculate total triangle number | |
std::vector<unsigned int> triangle_indices(index_count); // total triangle indices count == index count | |
unsigned int* triangle_indices_ptr = &triangle_indices[0]; | |
// allocate storage for triangle indices | |
for (size_t i = 0; i < vertex_count; ++i) | |
{ | |
Vertex& v = vertices[i]; | |
v.triangles = triangle_indices_ptr; | |
triangle_indices_ptr += v.triangles_total; | |
} | |
// fill triangle indices | |
for (size_t i = 0; i < face_count; ++i) | |
{ | |
Vertex& a = vertices[indices[i * 3 + 0]]; | |
a.triangles[a.triangles_not_added++] = static_cast<unsigned int>(i); | |
Vertex& b = vertices[indices[i * 3 + 1]]; | |
b.triangles[b.triangles_not_added++] = static_cast<unsigned int>(i); | |
Vertex& c = vertices[indices[i * 3 + 2]]; | |
c.triangles[c.triangles_not_added++] = static_cast<unsigned int>(i); | |
} | |
// compute initial vertex scores | |
for (size_t i = 0; i < vertex_count; ++i) | |
{ | |
Vertex& v = vertices[i]; | |
v.score = vertexScore(v); | |
} | |
// compute triangle scores | |
std::vector<float> triangle_scores(face_count); | |
for (size_t i = 0; i < face_count; ++i) | |
{ | |
unsigned int tri_a = indices[i * 3 + 0]; | |
unsigned int tri_b = indices[i * 3 + 1]; | |
unsigned int tri_c = indices[i * 3 + 2]; | |
triangle_scores[i] = vertices[tri_a].score + vertices[tri_b].score + vertices[tri_c].score; | |
} | |
unsigned int* destination_end = destination + index_count; | |
unsigned int cache_holder[2 * (max_cache_size + 3)]; | |
unsigned int* cache = cache_holder; | |
unsigned int* cache_end = cache; | |
unsigned int* cache_new = cache + max_cache_size + 3; | |
unsigned int* cache_new_end = cache_new; | |
int min_tri = -1; | |
// main body | |
while (destination != destination_end) | |
{ | |
// find triangle with the best score if it was not found on previous step | |
if (min_tri < 0) | |
{ | |
float min_score = m_inf; | |
for (size_t i = 0; i < face_count; ++i) | |
{ | |
float tri_score = triangle_scores[i]; | |
if (min_score < tri_score) | |
{ | |
min_tri = int(i); | |
min_score = tri_score; | |
} | |
} | |
} | |
unsigned int tri_a = indices[min_tri * 3 + 0]; | |
unsigned int tri_b = indices[min_tri * 3 + 1]; | |
unsigned int tri_c = indices[min_tri * 3 + 2]; | |
// add it to drawing sequence | |
*destination++ = tri_a; | |
*destination++ = tri_b; | |
*destination++ = tri_c; | |
triangle_scores[min_tri] = m_inf; | |
// construct new cache | |
cache_new_end = cache_new; | |
// new triangle | |
*cache_new_end++ = tri_a; | |
*cache_new_end++ = tri_b; | |
*cache_new_end++ = tri_c; | |
// old parts | |
int cp_a = vertices[tri_a].cache_position; | |
int cp_b = vertices[tri_b].cache_position; | |
int cp_c = vertices[tri_c].cache_position; | |
// order indices | |
if (cp_a > cp_b) std::swap(cp_a, cp_b); | |
if (cp_a > cp_c) std::swap(cp_a, cp_c); | |
if (cp_b > cp_c) std::swap(cp_b, cp_c); | |
if (cp_c == -1) // largest is -1 => all 3 are -1 | |
{ | |
cache_new_end = std::copy(cache, cache_end, cache_new_end); | |
} | |
else if (cp_b == -1) // cp_a == -1, cp_b == -1 | |
{ | |
cache_new_end = std::copy(cache, cache + cp_c, cache_new_end); | |
cache_new_end = std::copy(cache + cp_c + 1, cache_end, cache_new_end); | |
} | |
else if (cp_a == -1) // only cp_a == -1 | |
{ | |
cache_new_end = std::copy(cache, cache + cp_b, cache_new_end); | |
cache_new_end = std::copy(cache + cp_b + 1, cache + cp_c, cache_new_end); | |
cache_new_end = std::copy(cache + cp_c + 1, cache_end, cache_new_end); | |
} | |
else | |
{ | |
cache_new_end = std::copy(cache, cache + cp_a, cache_new_end); | |
cache_new_end = std::copy(cache + cp_a + 1, cache + cp_b, cache_new_end); | |
cache_new_end = std::copy(cache + cp_b + 1, cache + cp_c, cache_new_end); | |
cache_new_end = std::copy(cache + cp_c + 1, cache_end, cache_new_end); | |
} | |
size_t cache_new_size = cache_new_end - cache_new; | |
if (cache_new_size > cache_size) cache_new_end = cache_new + cache_size; | |
std::swap(cache, cache_new); | |
std::swap(cache_end, cache_new_end); | |
// modify vertices | |
vertices[tri_a].triangles_not_added--; | |
vertices[tri_b].triangles_not_added--; | |
vertices[tri_c].triangles_not_added--; | |
min_tri = -1; | |
float min_score = m_inf; | |
// update cache positions, vertices scores and triangle scores, and find new min_face | |
for (size_t i = 0; i < cache_new_size; ++i) | |
{ | |
Vertex& v = vertices[cache[i]]; | |
v.cache_position = i >= cache_size ? -1 : (int)i; | |
float score = vertexScore(v); | |
float score_diff = score - v.score; | |
// update scores of vertex triangles | |
for (size_t j = 0; j < v.triangles_total; ++j) | |
{ | |
unsigned int tri = v.triangles[j]; | |
triangle_scores[tri] += score_diff; | |
float tri_score = triangle_scores[tri]; | |
if (min_score < tri_score) | |
{ | |
min_tri = tri; | |
min_score = tri_score; | |
} | |
} | |
v.score = score; | |
} | |
} | |
} | |
} | |
namespace Tipsify | |
{ | |
struct Adjacency | |
{ | |
std::vector<unsigned int> triangle_counts; | |
std::vector<unsigned int> offsets; | |
std::vector<unsigned int> data; | |
}; | |
static void buildAdjacency(Adjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count) | |
{ | |
// fill triangle counts | |
adjacency.triangle_counts.resize(vertex_count, 0); | |
for (size_t i = 0; i < index_count; ++i) | |
{ | |
assert(indices[i] < vertex_count); | |
adjacency.triangle_counts[indices[i]]++; | |
} | |
// fill offset table | |
adjacency.offsets.resize(vertex_count); | |
unsigned int offset = 0; | |
for (size_t i = 0; i < vertex_count; ++i) | |
{ | |
adjacency.offsets[i] = offset; | |
offset += adjacency.triangle_counts[i]; | |
} | |
// fill triangle data | |
adjacency.data.resize(offset); | |
std::vector<unsigned int> offsets = adjacency.offsets; | |
size_t face_count = index_count / 3; | |
for (size_t i = 0; i < face_count; ++i) | |
{ | |
unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; | |
adjacency.data[offsets[a]++] = unsigned(i); | |
adjacency.data[offsets[b]++] = unsigned(i); | |
adjacency.data[offsets[c]++] = unsigned(i); | |
} | |
} | |
static unsigned int getNextVertexDeadEnd(const std::vector<unsigned int>& dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const std::vector<unsigned int>& live_triangles) | |
{ | |
// check dead-end stack | |
while (dead_end_top) | |
{ | |
unsigned int vertex = dead_end[--dead_end_top]; | |
if (live_triangles[vertex] > 0) | |
return vertex; | |
} | |
// input order | |
while (input_cursor < live_triangles.size()) | |
{ | |
if (live_triangles[input_cursor] > 0) | |
return input_cursor; | |
++input_cursor; | |
} | |
return static_cast<unsigned int>(-1); | |
} | |
static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const std::vector<unsigned int>& live_triangles, const std::vector<unsigned int>& cache_timestamps, unsigned int timestamp, unsigned int cache_size) | |
{ | |
unsigned int best_candidate = static_cast<unsigned int>(-1); | |
int best_priority = -1; | |
for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate) | |
{ | |
unsigned int vertex = *next_candidate; | |
// otherwise we don't need to process it | |
if (live_triangles[vertex] > 0) | |
{ | |
int priority = 0; | |
// will it be in cache after fanning? | |
if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size) | |
{ | |
priority = timestamp - cache_timestamps[vertex]; // position in cache | |
} | |
if (priority > best_priority) | |
{ | |
best_candidate = vertex; | |
best_priority = priority; | |
} | |
} | |
} | |
return best_candidate; | |
} | |
void optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size) | |
{ | |
assert(index_count % 3 == 0); | |
assert(cache_size >= 3); | |
// guard for empty meshes | |
if (index_count == 0 || vertex_count == 0) | |
{ | |
return; | |
} | |
// support in-place optimization | |
std::vector<unsigned int> indices_copy; | |
if (destination == indices) | |
{ | |
indices_copy.assign(indices, indices + index_count); | |
indices = &indices_copy[0]; | |
} | |
// build adjacency information | |
Adjacency adjacency; | |
buildAdjacency(adjacency, indices, index_count, vertex_count); | |
// live triangle counts | |
std::vector<unsigned int> live_triangles = adjacency.triangle_counts; | |
// cache time stamps | |
std::vector<unsigned int> cache_timestamps(vertex_count, 0); | |
// dead-end stack | |
std::vector<unsigned int> dead_end(index_count); | |
unsigned int dead_end_top = 0; | |
// emitted flags | |
std::vector<char> emitted_flags(index_count / 3, false); | |
unsigned int current_vertex = 0; | |
unsigned int timestamp = cache_size + 1; | |
unsigned int input_cursor = 1; // vertex to restart from in case of dead-end | |
unsigned int output_triangle = 0; | |
while (current_vertex != static_cast<unsigned int>(-1)) | |
{ | |
const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top; | |
// emit all vertex neighbours | |
const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex]; | |
const unsigned int* neighbours_end = neighbours_begin + adjacency.triangle_counts[current_vertex]; | |
for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it) | |
{ | |
unsigned int triangle = *it; | |
if (!emitted_flags[triangle]) | |
{ | |
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; | |
// output indices | |
destination[output_triangle * 3 + 0] = a; | |
destination[output_triangle * 3 + 1] = b; | |
destination[output_triangle * 3 + 2] = c; | |
output_triangle++; | |
// update dead-end stack | |
dead_end[dead_end_top + 0] = a; | |
dead_end[dead_end_top + 1] = b; | |
dead_end[dead_end_top + 2] = c; | |
dead_end_top += 3; | |
// update live triangle counts | |
live_triangles[a]--; | |
live_triangles[b]--; | |
live_triangles[c]--; | |
// update cache info | |
// if vertex is not in cache, put it in cache | |
if (timestamp - cache_timestamps[a] > cache_size) | |
cache_timestamps[a] = timestamp++; | |
if (timestamp - cache_timestamps[b] > cache_size) | |
cache_timestamps[b] = timestamp++; | |
if (timestamp - cache_timestamps[c] > cache_size) | |
cache_timestamps[c] = timestamp++; | |
// update emitted flags | |
emitted_flags[triangle] = true; | |
} | |
} | |
// next candidates are the ones we pushed to dead-end stack just now | |
const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top; | |
// get next vertex | |
current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, live_triangles, cache_timestamps, timestamp, cache_size); | |
if (current_vertex == static_cast<unsigned int>(-1)) | |
{ | |
current_vertex = getNextVertexDeadEnd(dead_end, dead_end_top, input_cursor, live_triangles); | |
} | |
} | |
assert(output_triangle == index_count / 3); | |
} | |
} | |
void gridGen(std::vector<unsigned int>& indices, int x0, int x1, int y0, int y1, int width, int cacheSize, bool prefetch) | |
{ | |
if (x1 - x0 + 1 < cacheSize) | |
{ | |
if (2 * (x1 - x0) + 1 > cacheSize && prefetch) | |
{ | |
for (int x = x0; x < x1; x++) | |
{ | |
indices.push_back(x + 0); | |
indices.push_back(x + 0); | |
indices.push_back(x + 1); | |
} | |
} | |
for (int y = y0; y < y1; y++) | |
{ | |
for (int x = x0; x < x1; x++) | |
{ | |
indices.push_back((width + 1) * (y + 0) + (x + 0)); | |
indices.push_back((width + 1) * (y + 1) + (x + 0)); | |
indices.push_back((width + 1) * (y + 0) + (x + 1)); | |
indices.push_back((width + 1) * (y + 0) + (x + 1)); | |
indices.push_back((width + 1) * (y + 1) + (x + 0)); | |
indices.push_back((width + 1) * (y + 1) + (x + 1)); | |
} | |
} | |
} | |
else | |
{ | |
int xm = x0 + cacheSize - 2; | |
gridGen(indices, x0, xm, y0, y1, width, cacheSize, prefetch); | |
gridGen(indices, xm, x1, y0, y1, width, cacheSize, prefetch); | |
} | |
} | |
unsigned int queryVSInvocations(ID3D11Device* device, ID3D11DeviceContext* context, const unsigned int* indices, size_t index_count) | |
{ | |
ID3D11Buffer* ib = 0; | |
{ | |
D3D11_BUFFER_DESC bd = {}; | |
bd.Usage = D3D11_USAGE_DYNAMIC; | |
bd.ByteWidth = index_count * 4; | |
bd.BindFlags = D3D11_BIND_INDEX_BUFFER; | |
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; | |
device->CreateBuffer(&bd, 0, &ib); | |
D3D11_MAPPED_SUBRESOURCE ms; | |
context->Map(ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &ms); | |
memcpy(ms.pData, indices, index_count * 4); | |
context->Unmap(ib, 0); | |
} | |
context->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST); | |
context->IASetIndexBuffer(ib, DXGI_FORMAT_R32_UINT, 0); | |
D3D11_QUERY_DESC qdesc = { D3D11_QUERY_PIPELINE_STATISTICS }; | |
ID3D11Query* query = 0; | |
device->CreateQuery(&qdesc, &query); | |
context->Begin(query); | |
context->DrawIndexed(index_count, 0, 0); | |
context->End(query); | |
D3D11_QUERY_DATA_PIPELINE_STATISTICS stats = {}; | |
while (S_FALSE == context->GetData(query, &stats, sizeof(stats), 0)) | |
; | |
query->Release(); | |
ib->Release(); | |
assert(stats.IAVertices == index_count); | |
return stats.VSInvocations; | |
} | |
unsigned int estimateVSInvocationsFIFO(const unsigned int* indices, size_t index_count, size_t cache_size) | |
{ | |
std::vector<unsigned int> cache; | |
unsigned int invocations = 0; | |
for (size_t i = 0; i < index_count; ++i) | |
{ | |
unsigned int index = indices[i]; | |
bool found = false; | |
for (size_t j = 0; j < cache.size(); ++j) | |
{ | |
if (cache[j] == index) | |
{ | |
found = true; | |
break; | |
} | |
} | |
if (!found) | |
invocations++; | |
if (!found) | |
cache.push_back(index); | |
if (cache.size() > cache_size) | |
cache.erase(cache.begin(), cache.end() - cache_size); | |
} | |
return invocations; | |
} | |
unsigned int estimateVSInvocationsLRU(const unsigned int* indices, size_t index_count, size_t cache_size) | |
{ | |
std::vector<unsigned int> cache; | |
unsigned int invocations = 0; | |
for (size_t i = 0; i < index_count; ++i) | |
{ | |
unsigned int index = indices[i]; | |
bool found = false; | |
for (size_t j = 0; j < cache.size(); ++j) | |
{ | |
if (cache[j] == index) | |
{ | |
found = true; | |
cache.erase(cache.begin() + j); | |
break; | |
} | |
} | |
if (!found) | |
invocations++; | |
cache.push_back(index); | |
if (cache.size() > cache_size) | |
cache.erase(cache.begin(), cache.end() - cache_size); | |
} | |
return invocations; | |
} | |
unsigned int estimateVSInvocationsReuse(const unsigned int* indices, size_t index_count, size_t warp_size, size_t fifo_size, size_t triangle_limit) | |
{ | |
std::vector<unsigned int> warp; | |
unsigned int invocations = 0; | |
unsigned int triangles = 0; | |
for (size_t i = 0; i < index_count; i += 3) | |
{ | |
bool found[3] = {}; | |
for (size_t k = 0; k < 3; ++k) | |
{ | |
for (size_t j = 0; j < warp.size(); ++j) | |
{ | |
if (warp.size() - j > fifo_size) | |
continue; | |
if (warp[j] == indices[i + k]) | |
{ | |
found[k] = true; | |
break; | |
} | |
} | |
} | |
// triangle doesn't fit into warp we're currently building - flush | |
if (warp.size() + !found[0] + !found[1] + !found[2] > warp_size || triangles >= triangle_limit) | |
{ | |
invocations += warp.size(); | |
warp.clear(); | |
// restart warp from scratch | |
warp.push_back(indices[i + 0]); | |
warp.push_back(indices[i + 1]); | |
warp.push_back(indices[i + 2]); | |
triangles = 1; | |
} | |
else | |
{ | |
if (!found[0]) | |
warp.push_back(indices[i + 0]); | |
if (!found[1]) | |
warp.push_back(indices[i + 1]); | |
if (!found[2]) | |
warp.push_back(indices[i + 2]); | |
triangles++; | |
} | |
} | |
// last warp | |
invocations += warp.size(); | |
return invocations; | |
} | |
void setupShaders(ID3D11Device* device, ID3D11DeviceContext* context) | |
{ | |
// load and compile the two shaders | |
const char* shaders = | |
"#define ATTRIBUTES 5\n" | |
"struct Foo { float4 v[ATTRIBUTES]; };" | |
"float4 VS(uint index: SV_VertexId, out Foo foo: FOO): SV_Position { uint i = index % 3; [unroll] for (int j = 0; j < ATTRIBUTES; j++) foo.v[j] = j; return float4(i != 0, i != 2, 0, 1); }" | |
"float4 PS(Foo foo: FOO): SV_Target { float4 result = 0; [unroll] for (int j = 0; j < ATTRIBUTES; j++) result += foo.v[j]; return result; }"; | |
ID3DBlob* vsblob = 0; | |
ID3DBlob* psblob = 0; | |
D3DCompile(shaders, strlen(shaders), 0, 0, 0, "VS", "vs_5_0", 0, 0, &vsblob, 0); | |
D3DCompile(shaders, strlen(shaders), 0, 0, 0, "PS", "ps_5_0", 0, 0, &psblob, 0); | |
ID3D11VertexShader* vs = 0; | |
ID3D11PixelShader* ps = 0; | |
device->CreateVertexShader(vsblob->GetBufferPointer(), vsblob->GetBufferSize(), 0, &vs); | |
device->CreatePixelShader(psblob->GetBufferPointer(), psblob->GetBufferSize(), 0, &ps); | |
context->VSSetShader(vs, 0, 0); | |
context->PSSetShader(ps, 0, 0); | |
} | |
template <typename Cache> | |
void inspectCache(Cache cache) | |
{ | |
unsigned int max_cache_size = 200; | |
unsigned int grid_size = 100; | |
for (unsigned int cache_size = 3; cache_size <= max_cache_size; cache_size += 1) | |
{ | |
std::vector<unsigned int> grid1; | |
gridGen(grid1, 0, grid_size, 0, grid_size, grid_size, cache_size, true); | |
std::vector<unsigned int> grid2; | |
gridGen(grid2, 0, grid_size, 0, grid_size, grid_size, cache_size, false); | |
std::vector<unsigned int> grid3; | |
gridGen(grid3, 0, grid_size, 0, grid_size, grid_size, grid_size * 4, false); // this generates a simple indexed grid without striping/degenerate triangles | |
Tipsify::optimizeVertexCache(&grid3[0], &grid3[0], grid3.size(), (grid_size + 1) * (grid_size + 1), cache_size); | |
std::vector<unsigned int> grid4; | |
gridGen(grid4, 0, grid_size, 0, grid_size, grid_size, grid_size * 4, false); // this generates a simple indexed grid without striping/degenerate triangles | |
TomF::optimizeVertexCache(&grid4[0], &grid4[0], grid4.size(), (grid_size + 1) * (grid_size + 1), cache_size); | |
unsigned int invocations1 = cache(&grid1[0], grid1.size()); | |
unsigned int invocations2 = cache(&grid2[0], grid2.size()); | |
unsigned int invocations3 = cache(&grid3[0], grid3.size()); | |
unsigned int invocations4 = cache(&grid4[0], grid4.size()); | |
unsigned int ideal_invocations = (grid_size + 1) * (grid_size + 1); | |
printf("%d, %f, %f, %f, %f\n", cache_size, | |
double(invocations1) / double(ideal_invocations), | |
double(invocations2) / double(ideal_invocations), | |
double(invocations3) / double(ideal_invocations), | |
double(invocations4) / double(ideal_invocations)); | |
} | |
} | |
void testCache(IDXGIAdapter* adapter) | |
{ | |
ID3D11Device* device = 0; | |
ID3D11DeviceContext* context = 0; | |
D3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, 0, 0, 0, 0, D3D11_SDK_VERSION, &device, 0, &context); | |
setupShaders(device, context); | |
inspectCache([&](const unsigned int* indices, size_t index_count) { return queryVSInvocations(device, context, indices, index_count); }); | |
unsigned int dib[] = { 0, 1, 1, 2, 3, 4, 5, 5, 5 }; | |
printf("// Degenerate IB invocations: %d\n", queryVSInvocations(device, context, dib, sizeof(dib) / sizeof(dib[0]))); | |
printf("// Reuse boundaries: "); | |
unsigned int last = 0; | |
for (unsigned int count = 3; count < 1000; count += 3) | |
{ | |
std::vector<unsigned int> ib; | |
for (unsigned int i = 0; i < count; i += 3) | |
{ | |
ib.push_back(0); | |
ib.push_back(1); | |
ib.push_back(2); | |
} | |
unsigned int invocations = queryVSInvocations(device, context, ib.data(), ib.size()); | |
if (invocations != last) | |
{ | |
printf("%d, ", count); | |
last = invocations; | |
} | |
} | |
printf("\n"); | |
} | |
void modelCacheFIFO(size_t model_cache_size) | |
{ | |
inspectCache([&](const unsigned int* indices, size_t index_count) { return estimateVSInvocationsFIFO(indices, index_count, model_cache_size); }); | |
} | |
void modelCacheLRU(size_t model_cache_size) | |
{ | |
inspectCache([&](const unsigned int* indices, size_t index_count) { return estimateVSInvocationsLRU(indices, index_count, model_cache_size); }); | |
} | |
void modelCacheReuse(size_t model_warp_size, size_t model_fifo_size, size_t model_triangle_limit) | |
{ | |
inspectCache([&](const unsigned int* indices, size_t index_count) { return estimateVSInvocationsReuse(indices, index_count, model_warp_size, model_fifo_size, model_triangle_limit); }); | |
} | |
int main(int argc, char** argv) | |
{ | |
IDXGIFactory1* factory = 0; | |
CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&factory); | |
IDXGIAdapter* adapter = NULL; | |
for (unsigned int index = 0; SUCCEEDED(factory->EnumAdapters(index, &adapter)); ++index) | |
{ | |
DXGI_ADAPTER_DESC ad = {}; | |
adapter->GetDesc(&ad); | |
printf("// GPU %d: %S\n", index, ad.Description); | |
testCache(adapter); | |
} | |
printf("// Model FIFO 128\n"); | |
modelCacheFIFO(128); | |
printf("// Model LRU 16\n"); | |
modelCacheLRU(16); | |
printf("// Model Reuse 32\n"); | |
modelCacheReuse(32, 32, 32); | |
printf("// Model Reuse 32 w/16-entry FIFO\n"); | |
modelCacheReuse(32, 16, 32); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment