Skip to content

Instantly share code, notes, and snippets.

@rossy
Created August 27, 2019 10:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rossy/78a6edec8ead195d9a473d40bca9055d to your computer and use it in GitHub Desktop.
Save rossy/78a6edec8ead195d9a473d40bca9055d to your computer and use it in GitHub Desktop.
#version 450
// Preamble and resource declarations for a compute-shader dither pass.
// NOTE(review): the macros and the UBO members below are not referenced by
// main() in this file; they look like boilerplate emitted by a shader
// generator — confirm before relying on them.
#extension GL_ARB_compute_shader : enable
#extension GL_ARB_shader_image_load_store : enable
// Legacy-style sampling names mapped onto the overloaded texture() built-in.
#define tex1D texture
#define tex3D texture
// Maps x from [0, 1] onto [0.5 / lut_size, 1 - 0.5 / lut_size].
#define LUT_POS(x, lut_size) mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))
// Uniform block with explicit std140 byte offsets for every member.
layout(std140, binding=0) uniform UBO {
layout(offset=0) vec2 texture_size0;
layout(offset=16) mat2 texture_rot0;
layout(offset=48) vec2 texture_off0;
layout(offset=56) vec2 pixel_size0;
layout(offset=64) vec2 out_scale;
layout(offset=72) vec2 tex_scale0;
};
// Destination image (write-only, rgba16f) and the source texture read by main().
layout(binding=0, rgba16f) uniform writeonly image2D out_image;
layout(binding=0) uniform sampler2D texture0;
// 1024 invocations per workgroup, laid out along x only.
layout (local_size_x = 1024, local_size_y = 1) in;
// Coordinate helper macros built from the UBO members above.
#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))
#define texmap0_raw(id) (tex_scale0 * outcoord(id))
#define texmap0(id) (texture_rot0 * texmap0_raw(id) + pixel_size0 * texture_off0)
#define texcoord0 texmap0(gl_GlobalInvocationID)
// Replaces every later use of gl_NumWorkGroups with the constant (1, 1, 1).
#define gl_NumWorkGroups uvec3(1, 1, 1)
// Workgroup-shared accumulator for packed per-pixel error terms. main() indexes
// it modulo 6492 (= 6 * 1082), i.e. as a wrap-around buffer.
shared uint err_rgb8[6492];
// Packs three signed per-channel error terms into one uint so that a single
// atomicAdd can accumulate all channels at once. Each channel is reduced to its
// low 8 bits (two's complement) and placed at bit 24 (r), bit 12 (g), bit 0 (b).
// NOTE(review): the 12-bit spacing leaves 4 spare bits above the g and b
// fields, presumably to absorb carries from summing several 8-bit terms —
// confirm against the code that generates this shader.
uint pack_err(ivec3 e) {
    return (uint(e.r & 255) << 24) | (uint(e.g & 255) << 12) | uint(e.b & 255);
}

// Error-diffusion dither of the texels with 0 <= x < 1920 and 0 <= y < 1080,
// executed by the 1024 invocations of one workgroup.
//
// Pixels are enumerated by id = x_shifted * 1080 + y with x_shifted = x + 3 * y,
// and every pass of the outer loop handles 1024 consecutive ids, with a barrier
// between passes. Pending error for a pixel lives in err_rgb8 at
// (x_shifted * 1082 + y) % 6492, so the buffer holds 6 shifted columns of 1082
// slots each and wraps around.
//
// Each pixel is rounded to an integer level in 1/255 steps, written to
// out_image, and its rounding error is spread over seven neighbours with the
// weights (out of 32)
//
//            X   8   4
//    2   4   8   4   2
//
// which are the weights of the Burkes kernel. Errors are carried as integers in
// units of 1/254 of an output step.
void main() {
    // Clear the shared buffer; the invocations stride over it 1024 slots apart.
    for (int i = int(gl_LocalInvocationIndex); i < 6492; i += 1024) err_rgb8[i] = 0u;

    for (int block_id = 0; block_id < 5440; ++block_id) {
        // Order this pass after all shared-memory traffic of the previous one
        // (and, on the first pass, after the clear loop above).
        groupMemoryBarrier();
        barrier();

        int id = int(gl_LocalInvocationIndex) + block_id * 1024;
        int y = id % 1080, x_shifted = id / 1080;
        int x = x_shifted - y * 3;
        // Ids whose un-shifted x falls outside the image are idle this pass.
        if (0 <= x && x < 1920) {
            int idx = (x_shifted * 1082 + y) % 6492;
            vec3 pix = texelFetch(texture0, ivec2(x, y), 0).rgb;

            // 2148008064u == 0x80080080 adds 128 to each packed field, so that
            // "(field & 255) - 128" yields the signed sum of the 8-bit terms.
            uint err_u32 = err_rgb8[idx] + 2148008064u;
            pix = pix * 255.0 + vec3(int((err_u32 >> 24) & 255u) - 128,
                                     int((err_u32 >> 12) & 255u) - 128,
                                     int( err_u32        & 255u) - 128) / 254.0;
            // The slot is consumed; reset it for the next pixel that maps here.
            err_rgb8[idx] = 0u;

            vec3 dithered = round(pix);
            imageStore(out_image, ivec2(x, y), vec4(dithered / 255.0, 0.0));

            // Rounding error in 1/254 units, pre-divided by the kernel sum (32).
            vec3 err_divided = (pix - dithered) * 254.0 / 32.0;

            // Weight 2: (x-2, y+1) and (x+2, y+1).
            uint share2 = pack_err(ivec3(round(err_divided * 2.0)));
            if (x >= 2) atomicAdd(err_rgb8[(idx + 1083) % 6492], share2);
            atomicAdd(err_rgb8[(idx + 5411) % 6492], share2);

            // Weight 4: (x+2, y), (x-1, y+1) and (x+1, y+1).
            uint share4 = pack_err(ivec3(round(err_divided * 4.0)));
            atomicAdd(err_rgb8[(idx + 2164) % 6492], share4);
            if (x >= 1) atomicAdd(err_rgb8[(idx + 2165) % 6492], share4);
            atomicAdd(err_rgb8[(idx + 4329) % 6492], share4);

            // Weight 8: (x+1, y) and (x, y+1).
            uint share8 = pack_err(ivec3(round(err_divided * 8.0)));
            atomicAdd(err_rgb8[(idx + 1082) % 6492], share8);
            atomicAdd(err_rgb8[(idx + 3247) % 6492], share8);
        }
    }
}
; SPIR-V
; Version: 1.0
; Generator: Google Shaderc over Glslang; 7
; Bound: 323
; Schema: 0
; Module preamble: capability, extended instruction set, memory model and the
; single compute entry point.
OpCapability Shader
; %2 is the GLSL.std.450 extended instruction set (used below for Round).
%2 = OpExtInstImport "GLSL.std.450"
OpMemoryModel Logical GLSL450
; Compute entry point "main"; its only interface variable is the local
; invocation index.
OpEntryPoint GLCompute %main "main" %gl_LocalInvocationIndex
; Workgroup size 1024 x 1 x 1.
OpExecutionMode %main LocalSize 1024 1 1
; %1 is the source file name referenced by OpSource and the OpLine instructions.
%1 = OpString "dither.comp"
OpSource GLSL 450 %1 "// OpModuleProcessed entry-point main
// OpModuleProcessed client vulkan100
// OpModuleProcessed target-env vulkan1.0
// OpModuleProcessed entry-point main
#line 1
#version 450
#extension GL_ARB_compute_shader : enable
#extension GL_ARB_shader_image_load_store : enable
#define tex1D texture
#define tex3D texture
#define LUT_POS(x, lut_size) mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))
layout(std140, binding=0) uniform UBO {
layout(offset=0) vec2 texture_size0;
layout(offset=16) mat2 texture_rot0;
layout(offset=48) vec2 texture_off0;
layout(offset=56) vec2 pixel_size0;
layout(offset=64) vec2 out_scale;
layout(offset=72) vec2 tex_scale0;
};
layout(binding=0, rgba16f) uniform writeonly image2D out_image;
layout(binding=0) uniform sampler2D texture0;
layout (local_size_x = 1024, local_size_y = 1) in;
#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))
#define texmap0_raw(id) (tex_scale0 * outcoord(id))
#define texmap0(id) (texture_rot0 * texmap0_raw(id) + pixel_size0 * texture_off0)
#define texcoord0 texmap0(gl_GlobalInvocationID)
#define gl_NumWorkGroups uvec3(1, 1, 1)
shared uint err_rgb8[6492];
void main() {
vec4 color = vec4(0.0, 0.0, 0.0, 1.0);
for (int i = int(gl_LocalInvocationIndex); i < 6492; i += 1024) err_rgb8[i] = 0;
for (int block_id = 0; block_id < 5440; ++block_id) {
groupMemoryBarrier();
barrier();
int id = int(gl_LocalInvocationIndex) + block_id * 1024;
int y = id % 1080, x_shifted = id / 1080;
int x = x_shifted - y * 3;
if (0 <= x && x < 1920) {
int idx = (x_shifted * 1082 + y) % 6492;
vec3 pix = texelFetch(texture0, ivec2(x, y), 0).rgb;
uint err_u32 = err_rgb8[idx] + 2148008064u;
pix = pix * 255.0 + vec3(int((err_u32 >> 24) & 255u) - 128,int((err_u32 >> 12) & 255u) - 128,int( err_u32 & 255u) - 128) / 254.0;
err_rgb8[idx] = 0;
vec3 dithered = round(pix);
imageStore(out_image, ivec2(x, y), vec4(dithered / 255.0, 0.0));
vec3 err_divided = (pix - dithered) * 254.0 / 32.0;
ivec3 tmp;
tmp = ivec3(round(err_divided * 2.0));
err_u32 = (uint(tmp.r & 255) << 24)|(uint(tmp.g & 255) << 12)| uint(tmp.b & 255);
if (x >= 2) atomicAdd(err_rgb8[(idx + 1083) % 6492], err_u32);
atomicAdd(err_rgb8[(idx + 5411) % 6492], err_u32);
tmp = ivec3(round(err_divided * 4.0));
err_u32 = (uint(tmp.r & 255) << 24)|(uint(tmp.g & 255) << 12)| uint(tmp.b & 255);
atomicAdd(err_rgb8[(idx + 2164) % 6492], err_u32);
if (x >= 1) atomicAdd(err_rgb8[(idx + 2165) % 6492], err_u32);
atomicAdd(err_rgb8[(idx + 4329) % 6492], err_u32);
tmp = ivec3(round(err_divided * 8.0));
err_u32 = (uint(tmp.r & 255) << 24)|(uint(tmp.g & 255) << 12)| uint(tmp.b & 255);
atomicAdd(err_rgb8[(idx + 1082) % 6492], err_u32);
atomicAdd(err_rgb8[(idx + 3247) % 6492], err_u32);
}
}
}
"
; Source extensions, debug names and decorations.
OpSourceExtension "GL_ARB_compute_shader"
OpSourceExtension "GL_ARB_shader_image_load_store"
OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
OpSourceExtension "GL_GOOGLE_include_directive"
OpName %main "main"
OpName %gl_LocalInvocationIndex "gl_LocalInvocationIndex"
OpName %err_rgb8 "err_rgb8"
OpName %texture0 "texture0"
OpName %out_image "out_image"
OpDecorate %gl_LocalInvocationIndex BuiltIn LocalInvocationIndex
; Both resources are decorated with descriptor set 0, binding 0.
OpDecorate %texture0 DescriptorSet 0
OpDecorate %texture0 Binding 0
OpDecorate %out_image DescriptorSet 0
OpDecorate %out_image Binding 0
; NonReadable: out_image is only ever written.
OpDecorate %out_image NonReadable
OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
; Types, constants and global variables. No variable is declared for the GLSL
; uniform block "UBO" in this module.
%void = OpTypeVoid
%4 = OpTypeFunction %void
%float = OpTypeFloat 32
%v4float = OpTypeVector %float 4
%float_0 = OpConstant %float 0
%int = OpTypeInt 32 1
%uint = OpTypeInt 32 0
%_ptr_Input_uint = OpTypePointer Input %uint
%gl_LocalInvocationIndex = OpVariable %_ptr_Input_uint Input
%int_6492 = OpConstant %int 6492
%bool = OpTypeBool
%uint_6492 = OpConstant %uint 6492
; err_rgb8: array of 6492 uints in Workgroup storage.
%_arr_uint_uint_6492 = OpTypeArray %uint %uint_6492
%_ptr_Workgroup__arr_uint_uint_6492 = OpTypePointer Workgroup %_arr_uint_uint_6492
%err_rgb8 = OpVariable %_ptr_Workgroup__arr_uint_uint_6492 Workgroup
%uint_0 = OpConstant %uint 0
%_ptr_Workgroup_uint = OpTypePointer Workgroup %uint
%int_1024 = OpConstant %int 1024
%int_0 = OpConstant %int 0
%int_5440 = OpConstant %int 5440
; Barrier operands. Per the SPIR-V enumerations, scope 2 is Workgroup;
; 3400 = 0xD48 = AcquireRelease | UniformMemory | WorkgroupMemory |
; AtomicCounterMemory | ImageMemory; 264 = 0x108 = AcquireRelease |
; WorkgroupMemory.
%uint_2 = OpConstant %uint 2
%uint_3400 = OpConstant %uint 3400
%uint_264 = OpConstant %uint 264
%int_1080 = OpConstant %int 1080
%int_3 = OpConstant %int 3
%int_1920 = OpConstant %int 1920
%int_1082 = OpConstant %int 1082
%v3float = OpTypeVector %float 3
; texture0: combined image/sampler over a 2D float image.
%92 = OpTypeImage %float 2D 0 0 0 1 Unknown
%93 = OpTypeSampledImage %92
%_ptr_UniformConstant_93 = OpTypePointer UniformConstant %93
%texture0 = OpVariable %_ptr_UniformConstant_93 UniformConstant
%v2int = OpTypeVector %int 2
; 2148008064 == 0x80080080: 128 in each of the packed fields at bits 24, 12, 0.
%uint_2148008064 = OpConstant %uint 2148008064
%float_255 = OpConstant %float 255
%int_24 = OpConstant %int 24
%uint_255 = OpConstant %uint 255
%int_128 = OpConstant %int 128
%int_12 = OpConstant %int 12
%float_254 = OpConstant %float 254
; out_image: 2D storage image with format Rgba16f.
%145 = OpTypeImage %float 2D 0 0 0 2 Rgba16f
%_ptr_UniformConstant_145 = OpTypePointer UniformConstant %145
%out_image = OpVariable %_ptr_UniformConstant_145 UniformConstant
%v3int = OpTypeVector %int 3
%float_2 = OpConstant %float 2
%int_255 = OpConstant %int 255
; %uint_1 serves both as the y/z workgroup size and as the scope operand of the
; atomics in %main (scope 1 is Device in the SPIR-V Scope enumeration).
%uint_1 = OpConstant %uint 1
%int_2 = OpConstant %int 2
; Offsets added to the error-buffer index when distributing error.
%int_1083 = OpConstant %int 1083
%int_5411 = OpConstant %int 5411
%float_4 = OpConstant %float 4
%int_2164 = OpConstant %int 2164
%int_1 = OpConstant %int 1
%int_2165 = OpConstant %int 2165
%int_4329 = OpConstant %int 4329
%float_8 = OpConstant %float 8
%int_3247 = OpConstant %int 3247
%v3uint = OpTypeVector %uint 3
%uint_1024 = OpConstant %uint 1024
%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_1024 %uint_1 %uint_1
; Reciprocal splats used as multipliers in %main where the GLSL source divides:
; 0.00393700786 ~ 1/254, 0.00392156886 ~ 1/255, 0.03125 = 1/32.
%float_0_00393700786 = OpConstant %float 0.00393700786
%314 = OpConstantComposite %v3float %float_0_00393700786 %float_0_00393700786 %float_0_00393700786
%float_0_00392156886 = OpConstant %float 0.00392156886
%317 = OpConstantComposite %v3float %float_0_00392156886 %float_0_00392156886 %float_0_00392156886
%float_0_03125 = OpConstant %float 0.03125
%320 = OpConstantComposite %v3float %float_0_03125 %float_0_03125 %float_0_03125
; Function %main: compiled body of the GLSL main(). Comments name the GLSL
; variable each id corresponds to in the embedded OpSource text.
%main = OpFunction %void None %4
%6 = OpLabel
OpLine %1 26 0
; %21 = int(gl_LocalInvocationIndex)
%20 = OpLoad %uint %gl_LocalInvocationIndex
%21 = OpBitcast %int %20
OpBranch %22
; ---- Loop 1: zero err_rgb8; %321 is i, starting at %21 and stepping by 1024.
%22 = OpLabel
%321 = OpPhi %int %21 %6 %41 %23
%30 = OpSLessThan %bool %321 %int_6492
OpLoopMerge %24 %23 None
OpBranchConditional %30 %23 %24
%23 = OpLabel
%38 = OpAccessChain %_ptr_Workgroup_uint %err_rgb8 %321
OpStore %38 %uint_0
%41 = OpIAdd %int %321 %int_1024
OpBranch %22
%24 = OpLabel
OpBranch %44
; ---- Loop 2: %322 is block_id, running from 0 while < 5440.
%44 = OpLabel
%322 = OpPhi %int %int_0 %24 %294 %81
%51 = OpSLessThan %bool %322 %int_5440
OpLoopMerge %46 %81 None
OpBranchConditional %51 %45 %46
%45 = OpLabel
OpLine %1 28 0
; groupMemoryBarrier()
OpMemoryBarrier %uint_2 %uint_3400
OpLine %1 29 0
; barrier()
OpControlBarrier %uint_2 %uint_2 %uint_264
; %60 = id, %64 = y, %67 = x_shifted, %73 = x
%59 = OpIMul %int %322 %int_1024
%60 = OpIAdd %int %21 %59
%64 = OpSMod %int %60 %int_1080
%67 = OpSDiv %int %60 %int_1080
%72 = OpIMul %int %64 %int_3
%73 = OpISub %int %67 %72
; %79 = (0 <= x && x < 1920)
%75 = OpSLessThanEqual %bool %int_0 %73
%78 = OpSLessThan %bool %73 %int_1920
%79 = OpLogicalAnd %bool %75 %78
OpSelectionMerge %81 None
OpBranchConditional %79 %80 %81
%80 = OpLabel
; %88 = idx = (x_shifted * 1082 + y) % 6492
%85 = OpIMul %int %67 %int_1082
%87 = OpIAdd %int %85 %64
%88 = OpSMod %int %87 %int_6492
OpLine %1 35 0
; %103 = texelFetch(texture0, ivec2(x, y), 0).rgb
%96 = OpLoad %93 %texture0
%100 = OpCompositeConstruct %v2int %73 %64
%101 = OpImage %92 %96
%102 = OpImageFetch %v4float %101 %100 Lod %int_0
%103 = OpVectorShuffle %v3float %102 %102 0 1 2
; %110 = err_u32 = err_rgb8[idx] + 2148008064u
%107 = OpAccessChain %_ptr_Workgroup_uint %err_rgb8 %88
%108 = OpLoad %uint %107
%110 = OpIAdd %uint %108 %uint_2148008064
%113 = OpVectorTimesScalar %v3float %103 %float_255
; Unpack the three fields (bits 24, 12, 0), each as (field & 255) - 128.
%116 = OpShiftRightLogical %uint %110 %int_24
%118 = OpBitwiseAnd %uint %116 %uint_255
%119 = OpBitcast %int %118
%121 = OpISub %int %119 %int_128
%122 = OpConvertSToF %float %121
%125 = OpShiftRightLogical %uint %110 %int_12
%126 = OpBitwiseAnd %uint %125 %uint_255
%127 = OpBitcast %int %126
%128 = OpISub %int %127 %int_128
%129 = OpConvertSToF %float %128
%131 = OpBitwiseAnd %uint %110 %uint_255
%132 = OpBitcast %int %131
%133 = OpISub %int %132 %int_128
%134 = OpConvertSToF %float %133
%135 = OpCompositeConstruct %v3float %122 %129 %134
; %139 = pix = texel * 255 + unpacked * (1/254)
%138 = OpFMul %v3float %135 %314
%139 = OpFAdd %v3float %113 %138
; err_rgb8[idx] = 0
OpStore %107 %uint_0
; %144 = dithered = round(pix)
%144 = OpExtInst %v3float %2 Round %139
OpLine %1 40 0
; imageStore(out_image, ivec2(x, y), vec4(dithered * (1/255), 0.0))
%148 = OpLoad %145 %out_image
%154 = OpFMul %v3float %144 %317
%155 = OpCompositeExtract %float %154 0
%156 = OpCompositeExtract %float %154 1
%157 = OpCompositeExtract %float %154 2
%158 = OpCompositeConstruct %v4float %155 %156 %157 %float_0
OpImageWrite %148 %100 %158
; %166 = err_divided = (pix - dithered) * 254 * (1/32)
%162 = OpFSub %v3float %139 %144
%163 = OpVectorTimesScalar %v3float %162 %float_254
%166 = OpFMul %v3float %163 %320
; ---- Weight 2: %192 = packed round(err_divided * 2)
%172 = OpVectorTimesScalar %v3float %166 %float_2
%173 = OpExtInst %v3float %2 Round %172
%174 = OpConvertFToS %v3int %173
%176 = OpCompositeExtract %int %174 0
%178 = OpBitwiseAnd %int %176 %int_255
%179 = OpBitcast %uint %178
%180 = OpShiftLeftLogical %uint %179 %int_24
%183 = OpCompositeExtract %int %174 1
%184 = OpBitwiseAnd %int %183 %int_255
%185 = OpBitcast %uint %184
%186 = OpShiftLeftLogical %uint %185 %int_12
%187 = OpBitwiseOr %uint %180 %186
%189 = OpCompositeExtract %int %174 2
%190 = OpBitwiseAnd %int %189 %int_255
%191 = OpBitcast %uint %190
%192 = OpBitwiseOr %uint %187 %191
; if (x >= 2) atomicAdd(err_rgb8[(idx + 1083) % 6492], ...)
%195 = OpSGreaterThanEqual %bool %73 %int_2
OpSelectionMerge %197 None
OpBranchConditional %195 %196 %197
%196 = OpLabel
%200 = OpIAdd %int %88 %int_1083
%201 = OpSMod %int %200 %int_6492
%202 = OpAccessChain %_ptr_Workgroup_uint %err_rgb8 %201
%204 = OpAtomicIAdd %uint %202 %uint_1 %uint_0 %192
OpBranch %197
%197 = OpLabel
; atomicAdd(err_rgb8[(idx + 5411) % 6492], ...)
%207 = OpIAdd %int %88 %int_5411
%208 = OpSMod %int %207 %int_6492
%209 = OpAccessChain %_ptr_Workgroup_uint %err_rgb8 %208
%211 = OpAtomicIAdd %uint %209 %uint_1 %uint_0 %192
; ---- Weight 4: %232 = packed round(err_divided * 4)
%214 = OpVectorTimesScalar %v3float %166 %float_4
%215 = OpExtInst %v3float %2 Round %214
%216 = OpConvertFToS %v3int %215
%218 = OpCompositeExtract %int %216 0
%219 = OpBitwiseAnd %int %218 %int_255
%220 = OpBitcast %uint %219
%221 = OpShiftLeftLogical %uint %220 %int_24
%223 = OpCompositeExtract %int %216 1
%224 = OpBitwiseAnd %int %223 %int_255
%225 = OpBitcast %uint %224
%226 = OpShiftLeftLogical %uint %225 %int_12
%227 = OpBitwiseOr %uint %221 %226
%229 = OpCompositeExtract %int %216 2
%230 = OpBitwiseAnd %int %229 %int_255
%231 = OpBitcast %uint %230
%232 = OpBitwiseOr %uint %227 %231
; atomicAdd(err_rgb8[(idx + 2164) % 6492], ...)
%235 = OpIAdd %int %88 %int_2164
%236 = OpSMod %int %235 %int_6492
%237 = OpAccessChain %_ptr_Workgroup_uint %err_rgb8 %236
%239 = OpAtomicIAdd %uint %237 %uint_1 %uint_0 %232
; if (x >= 1) atomicAdd(err_rgb8[(idx + 2165) % 6492], ...)
%242 = OpSGreaterThanEqual %bool %73 %int_1
OpSelectionMerge %244 None
OpBranchConditional %242 %243 %244
%243 = OpLabel
%247 = OpIAdd %int %88 %int_2165
%248 = OpSMod %int %247 %int_6492
%249 = OpAccessChain %_ptr_Workgroup_uint %err_rgb8 %248
%251 = OpAtomicIAdd %uint %249 %uint_1 %uint_0 %232
OpBranch %244
%244 = OpLabel
; atomicAdd(err_rgb8[(idx + 4329) % 6492], ...)
%254 = OpIAdd %int %88 %int_4329
%255 = OpSMod %int %254 %int_6492
%256 = OpAccessChain %_ptr_Workgroup_uint %err_rgb8 %255
%258 = OpAtomicIAdd %uint %256 %uint_1 %uint_0 %232
; ---- Weight 8: %279 = packed round(err_divided * 8)
%261 = OpVectorTimesScalar %v3float %166 %float_8
%262 = OpExtInst %v3float %2 Round %261
%263 = OpConvertFToS %v3int %262
%265 = OpCompositeExtract %int %263 0
%266 = OpBitwiseAnd %int %265 %int_255
%267 = OpBitcast %uint %266
%268 = OpShiftLeftLogical %uint %267 %int_24
%270 = OpCompositeExtract %int %263 1
%271 = OpBitwiseAnd %int %270 %int_255
%272 = OpBitcast %uint %271
%273 = OpShiftLeftLogical %uint %272 %int_12
%274 = OpBitwiseOr %uint %268 %273
%276 = OpCompositeExtract %int %263 2
%277 = OpBitwiseAnd %int %276 %int_255
%278 = OpBitcast %uint %277
%279 = OpBitwiseOr %uint %274 %278
; atomicAdd(err_rgb8[(idx + 1082) % 6492], ...)
%281 = OpIAdd %int %88 %int_1082
%282 = OpSMod %int %281 %int_6492
%283 = OpAccessChain %_ptr_Workgroup_uint %err_rgb8 %282
%285 = OpAtomicIAdd %uint %283 %uint_1 %uint_0 %279
; atomicAdd(err_rgb8[(idx + 3247) % 6492], ...)
%288 = OpIAdd %int %88 %int_3247
%289 = OpSMod %int %288 %int_6492
%290 = OpAccessChain %_ptr_Workgroup_uint %err_rgb8 %289
%292 = OpAtomicIAdd %uint %290 %uint_1 %uint_0 %279
OpBranch %81
; Loop 2 continue block: ++block_id.
%81 = OpLabel
%294 = OpIAdd %int %322 %int_1
OpBranch %44
%46 = OpLabel
OpReturn
OpFunctionEnd
// Globals for the HLSL form of the dither kernel.
// Thread-group size constant; matches the [numthreads(1024, 1, 1)] on main().
static const uint3 gl_WorkGroupSize = uint3(1024u, 1u, 1u);
// Source texture (t0). comp_main() reads it with Load(), so the sampler
// declared at s0 is not used by the code in this file.
Texture2D<float4> texture0 : register(t0);
SamplerState _texture0_sampler : register(s0);
// Destination image (u0), written per pixel by comp_main().
RWTexture2D<float4> out_image : register(u0);
// Per-thread copy of SV_GroupIndex, filled in by main() before comp_main() runs.
static uint gl_LocalInvocationIndex;
// Entry-point input: the flattened index of the thread within its group.
struct SPIRV_Cross_Input
{
uint gl_LocalInvocationIndex : SV_GroupIndex;
};
// Group-shared accumulator for packed per-pixel error terms; comp_main()
// indexes it modulo 6492.
groupshared uint err_rgb8[6492];
// Folds three signed error terms into one uint: the low 8 bits of x, y and z
// land at bit 24, bit 12 and bit 0 respectively.
uint PackError(int3 e)
{
    const uint hi = uint(e.x & 255) << uint(24);
    const uint mid = uint(e.y & 255) << uint(12);
    const uint lo = uint(e.z & 255);
    return (hi | mid) | lo;
}

// Error-diffusion dither over the texels with 0 <= x < 1920, 0 <= y < 1080,
// run by the 1024 threads of one group. Pixels are numbered
// id = x_shifted * 1080 + y with x_shifted = x + 3 * y; each outer iteration
// processes 1024 consecutive ids after a group-wide barrier. Pending error for
// a pixel is kept in err_rgb8[(x_shifted * 1082 + y) % 6492].
void comp_main()
{
    const int lane = int(gl_LocalInvocationIndex);

    // Zero the shared accumulator, 1024 slots apart per thread.
    for (int slot = lane; slot < 6492; slot += 1024)
    {
        err_rgb8[slot] = 0u;
    }

    for (int block_id = 0; block_id < 5440; block_id++)
    {
        AllMemoryBarrier();
        GroupMemoryBarrierWithGroupSync();

        const int id = lane + (block_id * 1024);
        const int y = id % 1080;
        const int x_shifted = id / 1080;
        const int x = x_shifted - (y * 3);
        if ((0 <= x) && (x < 1920))
        {
            const int idx = ((x_shifted * 1082) + y) % 6492;
            const int2 coord = int2(x, y);

            // 2148008064u == 0x80080080 biases every packed field by 128 so
            // "(field & 255) - 128" recovers a signed value.
            const uint biased = err_rgb8[idx] + 2148008064u;
            const float3 carried = float3(
                float(int((biased >> uint(24)) & 255u) - 128),
                float(int((biased >> uint(12)) & 255u) - 128),
                float(int(biased & 255u) - 128));
            const float3 texel = texture0.Load(int3(coord, 0)).xyz;
            const float3 value = (texel * 255.0f) + (carried * 0.00393700785934925079345703125f.xxx);
            err_rgb8[idx] = 0u;

            const float3 quantized = round(value);
            out_image[coord] = float4(quantized * 0.0039215688593685626983642578125f.xxx, 0.0f);

            // Rounding error scaled by 254 and by 1/32 (the sum of the weights).
            const float3 residual = ((value - quantized) * 254.0f) * 0.03125f.xxx;

            // Weight 2: (x-2, y+1) and (x+2, y+1).
            const uint share2 = PackError(int3(round(residual * 2.0f)));
            if (x >= 2)
            {
                InterlockedAdd(err_rgb8[(idx + 1083) % 6492], share2);
            }
            InterlockedAdd(err_rgb8[(idx + 5411) % 6492], share2);

            // Weight 4: (x+2, y), (x-1, y+1) and (x+1, y+1).
            const uint share4 = PackError(int3(round(residual * 4.0f)));
            InterlockedAdd(err_rgb8[(idx + 2164) % 6492], share4);
            if (x >= 1)
            {
                InterlockedAdd(err_rgb8[(idx + 2165) % 6492], share4);
            }
            InterlockedAdd(err_rgb8[(idx + 4329) % 6492], share4);

            // Weight 8: (x+1, y) and (x, y+1).
            const uint share8 = PackError(int3(round(residual * 8.0f)));
            InterlockedAdd(err_rgb8[(idx + 1082) % 6492], share8);
            InterlockedAdd(err_rgb8[(idx + 3247) % 6492], share8);
        }
    }
}
// Compute entry point, 1024 x 1 x 1 threads per group. Copies the thread's
// SV_GroupIndex into the file-level gl_LocalInvocationIndex and then runs
// comp_main(), which reads it from there.
[numthreads(1024, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
comp_main();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment