Created January 29, 2020
Unity lockless (no GPU readback) marching cubes via Graphics.DrawProceduralIndirect - some slight faffing because compute shader must append full triangle (3 verts) at a time to render correctly, but this means the appendbuffer count is 3 times smaller than it needs to be, so we have to invoke a very short compute shader (FixupIndirectArgs) just…
// DrawProceduralIndirect
// GPU buffer holding the indirect draw arguments: allocated in Fixup() with
// ComputeBufferType.IndirectArguments and handed to
// Graphics.DrawProceduralIndirect in Update().
ComputeBuffer argsBuffer;
/// <summary>
/// CPU-side description of the indirect draw arguments stored in
/// <c>argsBuffer</c>. The field order matches the <c>DrawCallArgsBuffer</c>
/// struct declared in the compute shaders.
/// </summary>
struct DrawCallArgBuffer
{
    /// <summary>Size of the structure in bytes: four 32-bit integers.</summary>
    public const int size =
        sizeof(int) +   // vertexCountPerInstance
        sizeof(int) +   // instanceCount
        sizeof(int) +   // startVertexLocation
        sizeof(int);    // startInstanceLocation

    public int vertexCountPerInstance;
    public int instanceCount;
    public int startVertexLocation;
    public int startInstanceLocation;
}
/// <summary>
/// Allocates the GPU buffers used by the noise, normals and marching cubes
/// passes, creates the perlin noise helper, and initialises the indirect
/// draw arguments.
/// </summary>
/// <exception cref="System.ArgumentException">
/// Thrown when N is not a multiple of 8.
/// </exception>
void Fixup()
{
    // Worst case is 5 triangles * 3 verts for every voxel.
    // NOTE(review): m_meshBuffer elements are whole triangles (see the stride
    // below), so N * N * N * 5 elements would appear to be enough - confirm
    // before shrinking, SIZE may be read elsewhere.
    SIZE = N * N * N * 3 * 5;

    //There are 8 threads run per group so N must be divisible by 8.
    if (N % 8 != 0)
    {
        throw new System.ArgumentException("N must be divisible by 8");
    }

    //Holds the voxel values, generated from perlin noise.
    m_noiseBuffer = new ComputeBuffer(N * N * N, sizeof(float));

    //Holds the normals of the voxels.
    // TODO: convert this to render textures, with bilinear sample in compute shader
    m_normalsBuffer = new ComputeBuffer(N * N * N, sizeof(float) * 3);

    //Holds the verts generated by the marching cubes.
    // One element is a full triangle: 3 verts of 8 floats each
    // (float4 position + float3 normal + 1 float of padding).
    m_meshBuffer = new ComputeBuffer(SIZE, sizeof(float) * 8 * 3, ComputeBufferType.Append);

    //These two buffers are just some settings needed by the marching cubes.
    // NOTE(review): no SetData call for these lookup tables is visible in
    // this excerpt - confirm they are filled before the first dispatch.
    m_cubeEdgeFlags = new ComputeBuffer(256, sizeof(int));
    m_triangleConnectionTable = new ComputeBuffer(256 * 16, sizeof(int));

    //Make the perlin noise, make sure to load resources to match shader used.
    perlin = new ImprovedPerlinNoise(m_seed);

    // Indirect args just stores the number of verts for the draw call
    argsBuffer = new ComputeBuffer(1, DrawCallArgBuffer.size, ComputeBufferType.IndirectArguments);

    // { vertexCountPerInstance, instanceCount, startVertexLocation, startInstanceLocation }
    // The vertex count is overwritten on the GPU every frame, but the
    // instance count of 1 must be uploaded once or the draw has no instances.
    int[] args = new int[] { 0, 1, 0, 0 };
    argsBuffer.SetData(args);
}
/// <summary>
/// Runs the per-frame GPU pipeline: noise -> normals -> marching cubes ->
/// indirect-args fixup -> indirect draw. Nothing is read back to the CPU.
/// </summary>
void Update()
{
    //Make the voxels.
    m_perlinNoise.SetInt("_Width", N);
    m_perlinNoise.SetInt("_Height", N);
    m_perlinNoise.SetFloat("_Frequency", m_freq);
    m_perlinNoise.SetFloat("_Lacunarity", m_lacunarity);
    m_perlinNoise.SetFloat("_Gain", m_gain);
    m_perlinNoise.SetFloat("_Time", Time.time * m_speed);
    m_perlinNoise.SetTexture(0, "_PermTable1D", perlin.GetPermutationTable1D());
    m_perlinNoise.SetTexture(0, "_PermTable2D", perlin.GetPermutationTable2D());
    m_perlinNoise.SetTexture(0, "_Gradient4D", perlin.GetGradient4D());
    m_perlinNoise.SetBuffer(0, "_Result", m_noiseBuffer);
    // 8 threads per group on each axis, hence N / 8 groups.
    m_perlinNoise.Dispatch(0, N / 8, N / 8, N / 8);

    //Make the voxel normals.
    m_normals.SetInt("_Width", N);
    m_normals.SetInt("_Height", N);
    m_normals.SetBuffer(0, "_Noise", m_noiseBuffer);
    m_normals.SetBuffer(0, "_Result", m_normalsBuffer);
    m_normals.Dispatch(0, N / 8, N / 8, N / 8);

    // Reset the append counter so this frame's triangles start at element 0.
    // Without this the counter keeps growing across frames, which leaves
    // stale triangles in the draw and eventually overflows the buffer.
    m_meshBuffer.SetCounterValue(0);

    //Make the mesh verts
    m_marchingCubes.SetInt("_Width", N);
    m_marchingCubes.SetInt("_Height", N);
    m_marchingCubes.SetInt("_Depth", N);
    m_marchingCubes.SetInt("_Border", 1);
    m_marchingCubes.SetFloat("_Target", 0.0f);
    m_marchingCubes.SetBuffer(0, "_Voxels", m_noiseBuffer);
    m_marchingCubes.SetBuffer(0, "_Normals", m_normalsBuffer);
    m_marchingCubes.SetBuffer(0, "_Buffer", m_meshBuffer);
    m_marchingCubes.SetBuffer(0, "_CubeEdgeFlags", m_cubeEdgeFlags);
    m_marchingCubes.SetBuffer(0, "_TriangleConnectionTable", m_triangleConnectionTable);
    m_marchingCubes.SetBuffer(0, "DrawCallArgs", argsBuffer);
    m_marchingCubes.Dispatch(0, N / 8, N / 8, N / 8);

    // Copy generated count
    // Destination offset 0 is the vertexCountPerInstance slot of the args.
    ComputeBuffer.CopyCount(m_meshBuffer, argsBuffer, 0);

    // Invoke very simple args fixup as generated count was triangles, not verts
    m_fixupArgsCount.SetBuffer(0, "DrawCallArgs", argsBuffer);
    m_fixupArgsCount.Dispatch(0, 1, 1, 1);

    // Draw mesh using indirect args buffer
    m_drawBuffer.SetBuffer("_Buffer", m_meshBuffer);
    m_drawBuffer.SetMatrix("objMat", transform.localToWorldMatrix);
    Graphics.DrawProceduralIndirect(m_drawBuffer, new Bounds(transform.position, transform.lossyScale),
        MeshTopology.Triangles, argsBuffer, 0, null, null,
        UnityEngine.Rendering.ShadowCastingMode.On, true);
}
// One output vertex: 8 floats in total.
struct Vert
{
    float4 position;
    float3 normal;
    float dummy; // TODO: faster with or without this padding?
};

// Triangles are appended whole (3 verts at a time), so the append counter
// of _Buffer counts triangles rather than vertices.
struct Triangle
{
    Vert verts[3];
};

AppendStructuredBuffer<Triangle> _Buffer;

// Indirect draw arguments, four 32-bit values.
struct DrawCallArgsBuffer
{
    uint vertexCountPerInstance;
    uint instanceCount;
    uint startVertexLocation;
    uint startInstanceLocation;
};

RWStructuredBuffer <DrawCallArgsBuffer> DrawCallArgs;
// Marching cubes kernel: one thread per voxel. Classifies the 8 cube corners
// against _Target, looks up the intersected edges, and appends up to five
// whole triangles to _Buffer.
[numthreads(8, 8, 8)]
void CSMain(int3 id : SV_DispatchThreadID)
{
    //Dont generate verts at the edge as they dont have
    //neighbours to make a cube from and the normal will
    //not be correct around border.
    if (id.x >= _Width - 1 - _Border) return;
    if (id.y >= _Height - 1 - _Border) return;
    if (id.z >= _Depth - 1 - _Border) return;

    float3 pos = float3(id);
    float3 centre = float3(_Width, _Height, _Depth) / 2.0;

    float cube[8];
    FillCube(id.x, id.y, id.z, cube);

    int i = 0, j = 0;
    int flagIndex = 0;
    float3 edgeVertex[12];

    //Find which vertices are inside of the surface and which are outside
    for (i = 0; i < 8; i++)
    {
        if (cube[i] <= _Target) flagIndex |= 1 << i;
    }

    //Find which edges are intersected by the surface
    int edgeFlags = _CubeEdgeFlags[flagIndex];

    // no connections, return
    if (edgeFlags == 0) return;

    //Find the point of intersection of the surface with each edge
    for (i = 0; i < 12; i++)
    {
        //if there is an intersection on this edge
        if ((edgeFlags & (1 << i)) != 0)
        {
            float offset = GetOffset(cube[edgeConnection[i].x], cube[edgeConnection[i].y]);
            edgeVertex[i] = pos + (vertexOffset[edgeConnection[i].x] + offset * edgeDirection[i]);
        }
    }

    // Flat voxel index; only needed by the commented-out per-voxel normal
    // lookup further down.
    int idx = id.x + id.y * _Width + id.z * _Width * _Height;

    //Save the triangles that were found. There can be up to five per cube
    for (i = 0; i < 5; i++)
    {
        //If the connection table is not -1 then this a triangle.
        if (_TriangleConnectionTable[flagIndex * 16 + 3 * i] >= 0)
        {
            Vert verts[3];

            for (j = 0; j < 3; j++)
            {
                int v = _TriangleConnectionTable[flagIndex * 16 + (3 * i + j)];
                float3 position = edgeVertex[v];

                // The whole float4 is divided, so w ends up as 1 / _Width
                // rather than 1; consumers should rebuild w themselves.
                verts[j].position = float4(position - centre, 1.0) / (float)_Width;
                verts[j].normal = SampleBilinear(_Normals, position);
                //verts[j].normal = _Normals[idx];
            }

            Triangle tri = (Triangle)0;
            tri.verts[0] = verts[0];
            tri.verts[1] = verts[1];
            tri.verts[2] = verts[2];

            // Append all three verts in one operation so a triangle is never
            // split up in the output buffer.
            _Buffer.Append(tri);
        }
    }
}
// Each #kernel tells which function to compile; you can have many kernels
#pragma kernel CSMain

// Indirect draw arguments, four 32-bit values.
struct DrawCallArgsBuffer
{
    uint vertexCountPerInstance;
    uint instanceCount;
    uint startVertexLocation;
    uint startInstanceLocation;
};

RWStructuredBuffer <DrawCallArgsBuffer> DrawCallArgs;

// Converts the copied append-buffer count (triangles) into a vertex count.
// It is dispatched as a single group, so one thread keeps the multiply from
// being applied more than once.
[numthreads(1, 1, 1)]
void CSMain (uint3 id : SV_DispatchThreadID)
{
    DrawCallArgs[0].vertexCountPerInstance *= 3;
}
Copy link

DuncanF commented Jan 29, 2020

RE: the render shader - this is the relevant bit of code I use with a version of the Unity standard shader (where I've added some code hook points):

	// Matches the per-vertex layout written by the marching cubes kernel.
	struct Vertex
	{
		float4 position;
		float3 normal;
		float dummy;
	};

	// NOTE(review): the #endif lines were missing from this snippet; their
	// placement here and inside GetVertexData is a best guess - confirm.
#ifdef SHADER_API_D3D11
	StructuredBuffer<Vertex> _Buffer;
#endif
	float4x4 objMat;

	// Fetches vertex `id` from _Buffer and transforms it by objMat.
	// uv0 and uv1 are passed through untouched.
	void GetVertexData(in uint id,
		inout float4 position,
		inout half3 normal,
		inout float2 uv0,
		inout float2 uv1)
	{
#ifdef SHADER_API_D3D11
		Vertex vert = _Buffer[id];
		position = vert.position;
		normal = vert.normal;
#endif

		// Rebuild w = 1 instead of trusting the stored w component.
		position = mul(objMat, float4(position.xyz, 1));
		normal = normalize(mul((float3x3)objMat, normal));
	}

Then in the vert shader I'm calling something like: GetVertexData(v.id, o.vertex, o.normal, o.uv0, o.uv1)

Copy link

@DuncanF in the vert shader, where does the "id" come from that you pass into GetVertexData()? I noticed you're not using the "_IdOffset" int anymore, like Keijiro's does. and I don't think "appdata_full" has the "id" info in it.

thanks for answering all my questions! I feel like I'm very close to getting this working!!

Copy link

smokelore commented Jan 29, 2020

same goes for "o.uv0" and "o.uv1", not sure where those are coming from.

I'm attempting to make a custom appdata struct to pass into the vert shader, I hope that's going in the right direction.

Copy link

smokelore commented Jan 29, 2020

I believe I have it working pretty well (took me a minute to realize that it was rendering much smaller than before!) but I'm seeing stray triangles frequently popping in and out of existence on the mesh (image attached). @DuncanF let me know if you've had experience with this bug!

Edit: clearer image

Copy link

smokelore commented Jan 29, 2020

I will say that the project is SIGNIFICANTLY faster now. Here's my profiler for each method (with VSync disabled).


This is very exciting!!

Copy link

DuncanF commented Jan 29, 2020

Nice one!
Re: stray triangles/misplaced verts - I’m not sure, but maybe check that you’re appending full triangles (3 verts each) in the marching cubes step, and that the indirect args fixup shader is definitely being run. It’s probably obvious when it doesn’t run, as there’d only be 1/3rd of the verts being rendered - so there wouldn’t be a full cube volume but only a 1/3rd slice.

