Skip to content

Instantly share code, notes, and snippets.

Last active September 28, 2024 14:11
Show Gist options
  • Save agyild/7e8951915b2bf24526a9343d951db214 to your computer and use it in GitHub Desktop.
Save agyild/7e8951915b2bf24526a9343d951db214 to your computer and use it in GitHub Desktop.
NVIDIA Image Scaling v1.0.2 for mpv
// The MIT License(MIT)
// Copyright(c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files(the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies of
// the Software, and to permit persons to whom the Software is furnished to do so,
// subject to the following conditions :
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// NVIDIA Image Scaling v1.0.2 by NVIDIA
// ported to mpv by agyild
// Changelog
// Made it directly operate on LUMA plane, since the original shader was operating
// on LUMA by deriving it from RGB. This should cause a major increase in performance,
// especially on OpenGL 4.0+ renderers
//!HOOK LUMA
//!BIND HOOKED
//!BIND coef_scaler
//!BIND coef_usm
//!DESC NVIDIA Image Scaling and Sharpening v1.0.2
//!COMPUTE 32 24 256 1
//!WHEN OUTPUT.w OUTPUT.h * LUMA.w LUMA.h * / 1.0 >
//!WIDTH OUTPUT.w
//!HEIGHT OUTPUT.h
// Pass header notes (kept below the directives on purpose: a plain comment
// between two //! lines would end the header):
//  - the pass hooks the LUMA plane and binds it as HOOKED, because the shader
//    body reads HOOKED_tex / HOOKED_pt / HOOKED_size;
//  - coef_scaler and coef_usm are the two coefficient textures read by
//    LoadFilterBanksSh();
//  - COMPUTE 32 24 256 1: 32x24 output pixels per work group, 256x1 threads;
//  - WHEN: run only when the output area is larger than the luma area;
//  - WIDTH/HEIGHT: the pass renders at output resolution, since hook() writes
//    destination pixels derived from target_size.
// User variables
#define SHARPNESS 0.25 // Amount of sharpening. 0.0 to 1.0.
#define NIS_THREAD_GROUP_SIZE 256 // May be set to 128 for better performance on NVIDIA hardware, otherwise set to 256. Don't forget to modify the COMPUTE directive accordingly as well (e.g., COMPUTE 32 24 128 1).
#define NIS_HDR_MODE 0 // Must be set to 1 for content with PQ colorspace. 0 or 1.

// Constant variables
#define NIS_BLOCK_WIDTH 32
// Output rows produced per work group. Must match the block height given in
// the COMPUTE directive (32 24 ...); it sizes kTileSize / kEdgeMapSize below
// and is used by hook().
#define NIS_BLOCK_HEIGHT 24
#define kPhaseCount 64
#define kFilterSize 6
#define kSupportSize 6
#define kPadSize kSupportSize
#define NIS_SCALE_INT 1
#define NIS_SCALE_FLOAT 1.0f
// Shared luma tile: one output block plus kPadSize extra columns / rows.
#define kTilePitch (NIS_BLOCK_WIDTH + kPadSize)
#define kTileSize (kTilePitch * (NIS_BLOCK_HEIGHT + kPadSize))
// Shared edge map: one output block plus 2 extra columns / rows.
#define kEdgeMapPitch (NIS_BLOCK_WIDTH + 2)
#define kEdgeMapSize (kEdgeMapPitch * (NIS_BLOCK_HEIGHT + 2))

// SHARPNESS remapped from [0, 1] to [-0.5, +0.5]; the three scale factors
// below differ depending on which half of the range the slider is in.
const float sharpen_slider = clamp(SHARPNESS, 0.0f, 1.0f) - 0.5f;
const float MaxScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.75f;
const float MinScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.0f;
const float LimitScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.0f;

// Edge detection thresholds (see GetEdgeMap).
const float kDetectRatio = 2 * 1127.f / 1024.f;
const float kDetectThres = (bool(NIS_HDR_MODE) ? 32.0f : 64.0f) / 1024.0f;

// Contrast-ratio window used by CalcLTI.
const float kMinContrastRatio = bool(NIS_HDR_MODE) ? 1.5f : 2.0f;
const float kMaxContrastRatio = bool(NIS_HDR_MODE) ? 5.0f : 10.0f;

// Luma ramp over which sharpening strength / limit are interpolated (see EvalPoly6).
const float kSharpStartY = bool(NIS_HDR_MODE) ? 0.35f : 0.45f;
const float kSharpEndY = bool(NIS_HDR_MODE) ? 0.55f : 0.9f;
const float kSharpStrengthMin = max(0.0f, 0.4f + sharpen_slider * MinScale * (bool(NIS_HDR_MODE) ? 1.1f : 1.2f));
const float kSharpStrengthMax = ((bool(NIS_HDR_MODE) ? 2.2f : 1.6f) + sharpen_slider * MaxScale * 1.8f);
const float kSharpLimitMin = max((bool(NIS_HDR_MODE) ? 0.06f : 0.1f), (bool(NIS_HDR_MODE) ? 0.1f : 0.14f) + sharpen_slider * LimitScale * (bool(NIS_HDR_MODE) ? 0.28f : 0.32f));
const float kSharpLimitMax = ((bool(NIS_HDR_MODE) ? 0.6f : 0.5f) + sharpen_slider * LimitScale * 0.6f);

// Derived values, precomputed so the per-pixel code only multiplies / adds.
const float kRatioNorm = 1.0f / (kMaxContrastRatio - kMinContrastRatio);
const float kSharpScaleY = 1.0f / (kSharpEndY - kSharpStartY);
const float kSharpStrengthScale = kSharpStrengthMax - kSharpStrengthMin;
const float kSharpLimitScale = kSharpLimitMax - kSharpLimitMin;
const float kContrastBoost = 1.0f;
// NOTE(review): the sharpening-only pass further down this page uses
// 1.0f / 255.0f for kEps - confirm that 1.0f is intended here.
const float kEps = 1.0f;

// Source-to-destination scale factors and the size of one source texel.
#define kScaleX (HOOKED_size.x / target_size.x)
#define kScaleY (HOOKED_size.y / target_size.y)
#define kSrcNormX HOOKED_pt.x
#define kSrcNormY HOOKED_pt.y

// HLSL to GLSL macros
#define saturate(x) clamp(x, 0, 1)
#define lerp(a, b, x) mix(a, b, x)

// CS Shared variables
shared float shPixelsY[kTileSize];                    // source luma tile, row pitch kTilePitch
shared float shCoefScaler[kPhaseCount][kFilterSize];  // scaler taps, one row per phase
shared float shCoefUSM[kPhaseCount][kFilterSize];     // USM taps, one row per phase
shared vec4 shEdgeMap[kEdgeMapSize];                  // GetEdgeMap() results, row pitch kEdgeMapPitch
// Shader code
// Classifies the edge direction inside one 3x3 window of a 4x4 luma patch.
//   p    - 4x4 luma neighbourhood, indexed [row][column]
//   i, j - row / column offset of the 3x3 window inside p (0 or 1)
// Returns the blend weights for the (0, 90, 45, 135) degree directional
// filters in (x, y, z, w). All four are zero when the window has no gradient
// or when no direction passes the detection tests.
vec4 GetEdgeMap(float p[4][4], int i, int j) {
    // Gradient magnitudes: each is |sum of three samples - sum of the three
    // samples on the opposite side of the window|.
    const float g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]);
    const float g_45 = abs(p[1 + i][0 + j] + p[0 + i][0 + j] + p[0 + i][1 + j] - p[2 + i][1 + j] - p[2 + i][2 + j] - p[1 + i][2 + j]);
    const float g_90 = abs(p[0 + i][0 + j] + p[1 + i][0 + j] + p[2 + i][0 + j] - p[0 + i][2 + j] - p[1 + i][2 + j] - p[2 + i][2 + j]);
    const float g_135 = abs(p[1 + i][0 + j] + p[2 + i][0 + j] + p[2 + i][1 + j] - p[0 + i][1 + j] - p[0 + i][2 + j] - p[1 + i][2 + j]);

    const float g_0_90_max = max(g_0, g_90);
    const float g_0_90_min = min(g_0, g_90);
    const float g_45_135_max = max(g_45, g_135);
    const float g_45_135_min = min(g_45, g_135);

    float e_0_90 = 0;
    float e_45_135 = 0;

    // Flat window: bail out before dividing by the (zero) gradient sum.
    if (g_0_90_max + g_45_135_max == 0)
    {
        return vec4(0, 0, 0, 0);
    }

    // Share of the total gradient carried by the axis-aligned pair; the
    // diagonal pair gets the remainder.
    e_0_90 = min(g_0_90_max / (g_0_90_max + g_45_135_max), 1.0f);
    e_45_135 = 1.0f - e_0_90;

    // A pair is "detected" when its stronger gradient dominates its weaker
    // one by kDetectRatio, exceeds kDetectThres, and exceeds the weaker
    // gradient of the other pair.
    bool c_0_90 = (g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min);
    bool c_45_135 = (g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min);

    // Which member of each pair is the stronger one.
    bool c_g_0_90 = g_0_90_max == g_0;
    bool c_g_45_135 = g_45_135_max == g_45;

    // When both pairs are detected they split the weight; a single detected
    // pair takes the full weight of 1.
    float f_e_0_90 = (c_0_90 && c_45_135) ? e_0_90 : 1.0f;
    float f_e_45_135 = (c_0_90 && c_45_135) ? e_45_135 : 1.0f;

    float weight_0 = (c_0_90 && c_g_0_90) ? f_e_0_90 : 0.0f;
    float weight_90 = (c_0_90 && !c_g_0_90) ? f_e_0_90 : 0.0f;
    float weight_45 = (c_45_135 && c_g_45_135) ? f_e_45_135 : 0.0f;
    float weight_135 = (c_45_135 && !c_g_45_135) ? f_e_45_135 : 0.0f;

    return vec4(weight_0, weight_90, weight_45, weight_135);
}
// Copies both coefficient textures (coef_scaler, coef_usm) into the shared
// arrays shCoefScaler / shCoefUSM.
//   i0 - first work item handled by the calling thread
//   di - stride between the work items of one thread
// Work item i covers phase (i >> 1) and texel column (i & 1): texel 0 carries
// taps 0..3 of that phase, texel 1 carries taps 4..5 (its z/w are unused).
void LoadFilterBanksSh(int i0, int di) {
    // Load up filter banks to shared memory
    // The work is spread over (kPhaseCount * 2) threads
    for (int i = i0; i < kPhaseCount * 2; i += di)
    {
        int phase = i >> 1;
        int vIdx = i & 1;

        vec4 v = vec4(texelFetch(coef_scaler, ivec2(vIdx, phase), 0));
        int filterOffset = vIdx * 4;
        shCoefScaler[phase][filterOffset + 0] = v.x;
        shCoefScaler[phase][filterOffset + 1] = v.y;
        // Only texel 0 owns taps 2 and 3; texel 1 must not overwrite them.
        if (vIdx == 0)
        {
            shCoefScaler[phase][2] = v.z;
            shCoefScaler[phase][3] = v.w;
        }

        v = vec4(texelFetch(coef_usm, ivec2(vIdx, phase), 0));
        shCoefUSM[phase][filterOffset + 0] = v.x;
        shCoefUSM[phase][filterOffset + 1] = v.y;
        if (vIdx == 0)
        {
            shCoefUSM[phase][2] = v.z;
            shCoefUSM[phase][3] = v.w;
        }
    }
}
// Computes the attenuation factor that EvalPoly6 multiplies into the USM term
// to reduce ringing.
//   p0..p5      - the six samples along the filter direction
//   phase_index - discretized sub-pixel phase; selects which three samples
//                 form each of the two contrast windows
// Returns kContrastBoost when the contrast ratio between the two windows is
// at or below kMinContrastRatio, falling linearly to 0 at kMaxContrastRatio.
float CalcLTI(float p0, float p1, float p2, float p3, float p4, float p5, int phase_index)
{
    const bool selector = (phase_index <= kPhaseCount / 2);

    // Window A: p1, p2 plus either p0 or p3.
    float sel = selector ? p0 : p3;
    const float a_min = min(min(p1, p2), sel);
    const float a_max = max(max(p1, p2), sel);

    // Window B: p3, p4 plus either p2 or p5.
    sel = selector ? p2 : p5;
    const float b_min = min(min(p3, p4), sel);
    const float b_max = max(max(p3, p4), sel);

    const float a_cont = a_max - a_min;
    const float b_cont = b_max - b_min;

    // kEps keeps the division defined when the smaller contrast is zero.
    const float cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps);
    return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
}
// Bilinearly interpolates four edge-map entries.
//   edge         - 2x2 edge-map entries, indexed [row][column]
//   phase_frac_x - horizontal blend factor between the two columns
//   phase_frac_y - vertical blend factor between the two rows
// Returns the interpolated directional weights.
vec4 GetInterpEdgeMap(const vec4 edge[2][2], float phase_frac_x, float phase_frac_y)
{
    // Blend along x within each row, then along y between the rows.
    vec4 h0 = lerp(edge[0][0], edge[0][1], phase_frac_x);
    vec4 h1 = lerp(edge[1][0], edge[1][1], phase_frac_x);
    return lerp(h0, h1, phase_frac_y);
}
// Evaluates the 6-tap scaler and USM filters on one line of samples and
// returns the scaled value plus its limited sharpening term.
//   pxl       - six samples along the filter direction
//   phase_int - row of shCoefScaler / shCoefUSM to use
float EvalPoly6(const float pxl[6], int phase_int)
{
    // Scaler filter output.
    float y = 0.f;
    for (int i = 0; i < 6; ++i)
    {
        y += shCoefScaler[phase_int][i] * pxl[i];
    }

    // Unsharp-mask filter output.
    float y_usm = 0.f;
    for (int i = 0; i < 6; ++i)
    {
        y_usm += shCoefUSM[phase_int][i] * pxl[i];
    }

    // let's compute a piece-wise ramp based on luma
    const float y_scale = 1.0f - saturate((y * (1.0f / NIS_SCALE_FLOAT) - kSharpStartY) * kSharpScaleY);

    // scale the ramp to sharpen as a function of luma
    const float y_sharpness = y_scale * kSharpStrengthScale + kSharpStrengthMin;
    y_usm *= y_sharpness;

    // scale the ramp to limit USM as a function of luma
    const float y_sharpness_limit = (y_scale * kSharpLimitScale + kSharpLimitMin) * y;
    y_usm = min(y_sharpness_limit, max(-y_sharpness_limit, y_usm));

    // reduce ringing
    y_usm *= CalcLTI(pxl[0], pxl[1], pxl[2], pxl[3], pxl[4], pxl[5], phase_int);

    return y + y_usm;
}
// Non-directional separable 6x6 scaler filter.
//   p                - 6x6 luma support, indexed [row][column]
//   phase_x_frac_int - coefficient row used for the horizontal taps
//   phase_y_frac_int - coefficient row used for the vertical taps
// Returns the filtered luma (no sharpening applied).
float FilterNormal(const float p[6][6], int phase_x_frac_int, int phase_y_frac_int)
{
    float h_acc = 0.0f;
    for (int j = 0; j < 6; ++j)
    {
        // Filter column j vertically...
        float v_acc = 0.0f;
        for (int i = 0; i < 6; ++i)
        {
            v_acc += p[i][j] * shCoefScaler[phase_y_frac_int][i];
        }
        // ...then weight it with the horizontal tap for that column.
        h_acc += v_acc * shCoefScaler[phase_x_frac_int][j];
    }

    // let's return the sum unpacked -> we can accumulate it later
    return h_acc;
}
// Sums the outputs of the four directional filters, each scaled by its
// edge-map weight. A direction whose weight is not positive is skipped.
//   p                - 6x6 luma support, indexed [row][column]
//   phase_x_frac     - fractional source x position
//   phase_y_frac     - fractional source y position
//   phase_x_frac_int - discretized phase_x_frac (coefficient row)
//   phase_y_frac_int - discretized phase_y_frac (coefficient row)
//   w                - weights for (0, 90, 45, 135) degrees in (x, y, z, w)
float AddDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int phase_x_frac_int, int phase_y_frac_int, vec4 w)
{
    float f = 0.f;

    if (w.x > 0.0f)
    {
        // 0 deg filter
        // interpolate horizontally between the two centre columns, filter vertically
        float interp0Deg[6];
        for (int i = 0; i < 6; ++i)
        {
            interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac);
        }
        f += EvalPoly6(interp0Deg, phase_y_frac_int) * w.x;
    }

    if (w.y > 0.0f)
    {
        // 90 deg filter
        // interpolate vertically between the two centre rows, filter horizontally
        float interp90Deg[6];
        for (int i = 0; i < 6; ++i)
        {
            interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac);
        }
        f += EvalPoly6(interp90Deg, phase_x_frac_int) * w.y;
    }

    if (w.z > 0.0f)
    {
        //45 deg filter
        float pphase_b45;
        pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac);

        // seven candidate samples along the diagonal; six of them are used
        float temp_interp45Deg[7];
        temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45);
        temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45);
        temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45);

        pphase_b45 = pphase_b45 - 0.5f;
        float a = (pphase_b45 >= 0.f) ? p[0][2] : p[2][0];
        float b = (pphase_b45 >= 0.f) ? p[1][3] : p[3][1];
        float c = (pphase_b45 >= 0.f) ? p[2][4] : p[4][2];
        float d = (pphase_b45 >= 0.f) ? p[3][5] : p[5][3];
        temp_interp45Deg[0] = lerp(p[1][1], a, abs(pphase_b45));
        temp_interp45Deg[2] = lerp(p[2][2], b, abs(pphase_b45));
        temp_interp45Deg[4] = lerp(p[3][3], c, abs(pphase_b45));
        temp_interp45Deg[6] = lerp(p[4][4], d, abs(pphase_b45));

        float interp45Deg[6];
        float pphase_p45 = phase_x_frac + phase_y_frac;
        if (pphase_p45 >= 1)
        {
            // phase wrapped past one sample: use samples 1..6 and drop the
            // integer part of the phase
            for (int i = 0; i < 6; i++)
            {
                interp45Deg[i] = temp_interp45Deg[i + 1];
            }
            pphase_p45 = pphase_p45 - 1;
        }
        else
        {
            for (int i = 0; i < 6; i++)
            {
                interp45Deg[i] = temp_interp45Deg[i];
            }
        }

        f += EvalPoly6(interp45Deg, int(pphase_p45 * 64)) * w.z;
    }

    if (w.w > 0.0f)
    {
        //135 deg filter
        float pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac);

        // seven candidate samples along the anti-diagonal; six of them are used
        float temp_interp135Deg[7];
        temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135);
        temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135);
        temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135);

        pphase_b135 = pphase_b135 - 0.5f;
        float a = (pphase_b135 >= 0.f) ? p[5][2] : p[3][0];
        float b = (pphase_b135 >= 0.f) ? p[4][3] : p[2][1];
        float c = (pphase_b135 >= 0.f) ? p[3][4] : p[1][2];
        float d = (pphase_b135 >= 0.f) ? p[2][5] : p[0][3];
        temp_interp135Deg[0] = lerp(p[4][1], a, abs(pphase_b135));
        temp_interp135Deg[2] = lerp(p[3][2], b, abs(pphase_b135));
        temp_interp135Deg[4] = lerp(p[2][3], c, abs(pphase_b135));
        temp_interp135Deg[6] = lerp(p[1][4], d, abs(pphase_b135));

        float interp135Deg[6];
        float pphase_p135 = 1 + (phase_x_frac - phase_y_frac);
        if (pphase_p135 >= 1)
        {
            // same wrap handling as the 45 degree case
            for (int i = 0; i < 6; ++i)
            {
                interp135Deg[i] = temp_interp135Deg[i + 1];
            }
            pphase_p135 = pphase_p135 - 1;
        }
        else
        {
            for (int i = 0; i < 6; ++i)
            {
                interp135Deg[i] = temp_interp135Deg[i];
            }
        }

        f += EvalPoly6(interp135Deg, int(pphase_p135 * 64)) * w.w;
    }

    return f;
}
// Compute entry point of the scaling pass. Each work group produces one
// NIS_BLOCK_WIDTH x NIS_BLOCK_HEIGHT block of output pixels in three stages:
//   1. cooperatively load the source luma tile needed by the block into
//      shPixelsY,
//   2. build the edge map (shEdgeMap) from that tile and load the filter
//      banks into shared memory,
//   3. per output pixel, blend the non-directional and directional filter
//      results and store the new luma to out_image.
// Stages are separated by barriers because every stage reads shared memory
// that other invocations of the work group wrote in the previous stage.
void hook()
{
    uvec2 blockIdx = gl_WorkGroupID.xy;
    uint threadIdx = gl_LocalInvocationID.x;

    // Figure out the range of pixels from input image that would be needed to be loaded for this thread-block
    int dstBlockX = int(NIS_BLOCK_WIDTH * blockIdx.x);
    int dstBlockY = int(NIS_BLOCK_HEIGHT * blockIdx.y);

    const int srcBlockStartX = int(floor((dstBlockX + 0.5f) * kScaleX - 0.5f));
    const int srcBlockStartY = int(floor((dstBlockY + 0.5f) * kScaleY - 0.5f));
    const int srcBlockEndX = int(ceil((dstBlockX + NIS_BLOCK_WIDTH + 0.5f) * kScaleX - 0.5f));
    const int srcBlockEndY = int(ceil((dstBlockY + NIS_BLOCK_HEIGHT + 0.5f) * kScaleY - 0.5f));

    int numTilePixelsX = srcBlockEndX - srcBlockStartX + kSupportSize - 1;
    int numTilePixelsY = srcBlockEndY - srcBlockStartY + kSupportSize - 1;

    // round-up load region to even size since we're loading in 2x2 batches
    numTilePixelsX += numTilePixelsX & 0x1;
    numTilePixelsY += numTilePixelsY & 0x1;
    const int numTilePixels = numTilePixelsX * numTilePixelsY;

    // calculate the equivalent values for the edge map
    const int numEdgeMapPixelsX = numTilePixelsX - kSupportSize + 2;
    const int numEdgeMapPixelsY = numTilePixelsY - kSupportSize + 2;
    const int numEdgeMapPixels = numEdgeMapPixelsX * numEdgeMapPixelsY;

    // fill in input luma tile (shPixelsY) in batches of 2x2 pixels
    // we use texture gather to get extra support necessary
    // to compute 2x2 edge map outputs too
    for (uint i = threadIdx * 2; i < uint(numTilePixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2)
    {
        uint py = (i / numTilePixelsX) * 2;
        uint px = i % numTilePixelsX;

        // 0.5 to be in the center of texel
        // - (kSupportSize - 1) / 2 to shift by the kernel support size
        float kShift = 0.5f - (kSupportSize - 1) / 2;
        const float tx = (srcBlockStartX + px + kShift) * kSrcNormX;
        const float ty = (srcBlockStartY + py + kShift) * kSrcNormY;

        float p[2][2];
#ifdef HOOKED_gather
        // one gather returns the whole 2x2 quad
        const vec4 sY = HOOKED_gather(vec2(tx, ty), 0);
        p[0][0] = sY.w;
        p[0][1] = sY.z;
        p[1][0] = sY.x;
        p[1][1] = sY.y;
#else
        // fallback: four individual taps, one texel apart
        for (int j = 0; j < 2; j++)
        {
            for (int k = 0; k < 2; k++)
            {
                const float luma = HOOKED_tex(vec2(tx + k * kSrcNormX, ty + j * kSrcNormY)).r;
                p[j][k] = luma;
            }
        }
#endif

        const uint idx = py * kTilePitch + px;
        shPixelsY[idx] = float(p[0][0]);
        shPixelsY[idx + 1] = float(p[0][1]);
        shPixelsY[idx + kTilePitch] = float(p[1][0]);
        shPixelsY[idx + kTilePitch + 1] = float(p[1][1]);
    }

    // the edge map below reads tile pixels written by other invocations
    groupMemoryBarrier();
    barrier();

    // fill in the edge map of 2x2 pixels
    for (uint i = threadIdx * 2; i < uint(numEdgeMapPixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2)
    {
        uint py = (i / numEdgeMapPixelsX) * 2;
        uint px = i % numEdgeMapPixelsX;

        const uint edgeMapIdx = py * kEdgeMapPitch + px;

        uint tileCornerIdx = (py+1) * kTilePitch + px + 1;
        float p[4][4];
        for (int j = 0; j < 4; j++)
        {
            for (int k = 0; k < 4; k++)
            {
                p[j][k] = shPixelsY[tileCornerIdx + j * kTilePitch + k];
            }
        }

        // one 4x4 patch yields the four 3x3 windows of a 2x2 edge-map quad
        shEdgeMap[edgeMapIdx] = vec4(GetEdgeMap(p, 0, 0));
        shEdgeMap[edgeMapIdx + 1] = vec4(GetEdgeMap(p, 0, 1));
        shEdgeMap[edgeMapIdx + kEdgeMapPitch] = vec4(GetEdgeMap(p, 1, 0));
        shEdgeMap[edgeMapIdx + kEdgeMapPitch + 1] = vec4(GetEdgeMap(p, 1, 1));
    }

    LoadFilterBanksSh(int(threadIdx), NIS_THREAD_GROUP_SIZE);

    // the filtering below reads the edge map and the filter banks written by
    // other invocations
    groupMemoryBarrier();
    barrier();

    // output coord within a tile
    const ivec2 pos = ivec2(uint(threadIdx) % uint(NIS_BLOCK_WIDTH), uint(threadIdx) / uint(NIS_BLOCK_WIDTH));
    // x coord inside the output image
    const int dstX = dstBlockX + pos.x;
    // x coord inside the input image
    const float srcX = (0.5f + dstX) * kScaleX - 0.5f;
    // nearest integer part
    const int px = int(floor(srcX) - srcBlockStartX);
    // fractional part
    const float fx = srcX - floor(srcX);
    // discretized phase
    const int fx_int = int(fx * kPhaseCount);

    // The work group covers (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH) rows per
    // iteration, so this many iterations cover all NIS_BLOCK_HEIGHT rows.
    for (int k = 0; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE; ++k)
    {
        // y coord inside the output image
        const int dstY = dstBlockY + pos.y + k * (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH);
        // y coord inside the input image
        const float srcY = (0.5f + dstY) * kScaleY - 0.5f;
        // nearest integer part
        const int py = int(floor(srcY) - srcBlockStartY);
        // fractional part
        const float fy = srcY - floor(srcY);
        // discretized phase
        const int fy_int = int(fy * kPhaseCount);

        // generate weights for directional filters
        const int startEdgeMapIdx = py * kEdgeMapPitch + px;
        vec4 edge[2][2];
        for (int i = 0; i < 2; i++)
        {
            for (int j = 0; j < 2; j++)
            {
                // need to shift edge map sampling since it's a 2x2 centered inside 6x6 grid
                edge[i][j] = shEdgeMap[startEdgeMapIdx + (i * kEdgeMapPitch) + j];
            }
        }
        const vec4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT;

        // load 6x6 support to regs
        const int startTileIdx = py * kTilePitch + px;
        float p[6][6];
        for (int i = 0; i < 6; ++i)
        {
            for (int j = 0; j < 6; ++j)
            {
                p[i][j] = shPixelsY[startTileIdx + i * kTilePitch + j];
            }
        }

        // weight for luma
        const float baseWeight = NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w;

        // final luma is a weighted product of directional & normal filters
        float opY = 0;

        // get traditional scaler filter output
        opY += FilterNormal(p, fx_int, fy_int) * baseWeight;

        // get directional filter bank output
        opY += AddDirFilters(p, fx, fy, fx_int, fy_int, w);

        // do bilinear tap for luma upscaling
        vec4 op = HOOKED_tex(vec2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY));

        // replace the bilinear luma with the filtered one, keep other channels
        const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - op.r;
        op.x += corr;

        imageStore(out_image, ivec2(dstX, dstY), op);
    }
}
//!TEXTURE coef_scaler
//!SIZE 2 64
//!FORMAT rgba32f
//!TEXTURE coef_usm
//!SIZE 2 64
//!FORMAT rgba32f
// The MIT License(MIT)
// Copyright(c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files(the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies of
// the Software, and to permit persons to whom the Software is furnished to do so,
// subject to the following conditions :
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// NVIDIA Image Scaling v1.0.2 by NVIDIA
// ported to mpv by agyild
// Changelog
// Made it directly operate on LUMA plane, since the original shader was operating
// on LUMA by deriving it from RGB.
//!HOOK LUMA
//!BIND HOOKED
//!DESC NVIDIA Image Sharpening v1.0.2
//!COMPUTE 32 32 256 1
//!WHEN OUTPUT.w OUTPUT.h * LUMA.w LUMA.h * / 1.0 > ! OUTPUT.w OUTPUT.h * LUMA.w LUMA.h * / 1.0 < ! *
// Pass header notes (kept below the directives on purpose: a plain comment
// between two //! lines would end the header):
//  - the pass hooks the LUMA plane and binds it as HOOKED, because the shader
//    body reads HOOKED_tex / HOOKED_pt;
//  - COMPUTE 32 32 256 1: 32x32 output pixels per work group, 256x1 threads;
//  - WHEN: (not larger) AND (not smaller), i.e. run only when the output
//    area equals the luma area.
// User variables
#define SHARPNESS 0.25 // Amount of sharpening. 0.0 to 1.0.
#define NIS_THREAD_GROUP_SIZE 256 // May be set to 128 for better performance on NVIDIA hardware, otherwise set to 256. Don't forget to modify the COMPUTE directive accordingly as well (e.g., COMPUTE 32 32 128 1).
#define NIS_HDR_MODE 0 // Must be set to 1 for content with PQ colorspace. 0 or 1.

// Constant variables
#define NIS_BLOCK_WIDTH 32
// Output rows produced per work group. Must match the block height given in
// the COMPUTE directive (32 32 ...); it sizes kNumPixelsY below and is used
// by hook().
#define NIS_BLOCK_HEIGHT 32
#define kSupportSize 5
// Shared luma tile: one output block plus kSupportSize + 1 extra columns / rows.
#define kNumPixelsX (NIS_BLOCK_WIDTH + kSupportSize + 1)
#define kNumPixelsY (NIS_BLOCK_HEIGHT + kSupportSize + 1)

// SHARPNESS remapped from [0, 1] to [-0.5, +0.5]; the three scale factors
// below differ depending on which half of the range the slider is in.
const float sharpen_slider = clamp(SHARPNESS, 0.0f, 1.0f) - 0.5f;
const float MaxScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.75f;
const float MinScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.0f;
const float LimitScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.0f;

// Edge detection thresholds (see GetEdgeMap).
const float kDetectRatio = 2 * 1127.f / 1024.f;
const float kDetectThres = (bool(NIS_HDR_MODE) ? 32.0f : 64.0f) / 1024.0f;

// Contrast-ratio window used by CalcLTIFast.
const float kMinContrastRatio = bool(NIS_HDR_MODE) ? 1.5f : 2.0f;
const float kMaxContrastRatio = bool(NIS_HDR_MODE) ? 5.0f : 10.0f;

// Luma ramp over which sharpening strength / limit are interpolated (see GetDirUSM).
const float kSharpStartY = bool(NIS_HDR_MODE) ? 0.35f : 0.45f;
const float kSharpEndY = bool(NIS_HDR_MODE) ? 0.55f : 0.9f;
const float kSharpStrengthMin = max(0.0f, 0.4f + sharpen_slider * MinScale * (bool(NIS_HDR_MODE) ? 1.1f : 1.2f));
const float kSharpStrengthMax = ((bool(NIS_HDR_MODE) ? 2.2f : 1.6f) + sharpen_slider * MaxScale * 1.8f);
const float kSharpLimitMin = max((bool(NIS_HDR_MODE) ? 0.06f : 0.1f), (bool(NIS_HDR_MODE) ? 0.1f : 0.14f) + sharpen_slider * LimitScale * (bool(NIS_HDR_MODE) ? 0.28f : 0.32f));
const float kSharpLimitMax = ((bool(NIS_HDR_MODE) ? 0.6f : 0.5f) + sharpen_slider * LimitScale * 0.6f);

// Derived values, precomputed so the per-pixel code only multiplies / adds.
const float kRatioNorm = 1.0f / (kMaxContrastRatio - kMinContrastRatio);
const float kSharpScaleY = 1.0f / (kSharpEndY - kSharpStartY);
const float kSharpStrengthScale = kSharpStrengthMax - kSharpStrengthMin;
const float kSharpLimitScale = kSharpLimitMax - kSharpLimitMin;
const float kContrastBoost = 1.0f;
const float kEps = 1.0f / 255.0f;

// Size of one texel; source and destination share it because this pass does
// not change the resolution.
#define kSrcNormX HOOKED_pt.x
#define kSrcNormY HOOKED_pt.y
#define kDstNormX kSrcNormX
#define kDstNormY kSrcNormY

// HLSL to GLSL macros
#define saturate(x) clamp(x, 0, 1)
#define lerp(a, b, x) mix(a, b, x)

// CS Shared variables
shared float shPixelsY[kNumPixelsY][kNumPixelsX];  // source luma tile, indexed [row][column]
// Shader code
// Classifies the edge direction inside one 3x3 window of a 5x5 luma patch.
//   p    - 5x5 luma neighbourhood, indexed [row][column]
//   i, j - row / column offset of the 3x3 window inside p (0..2)
// Returns the blend weights for the (0, 90, 45, 135) degree directional
// filters in (x, y, z, w). All four are zero when the window has no gradient
// or when no direction passes the detection tests.
vec4 GetEdgeMap(float p[5][5], int i, int j) {
    // Gradient magnitudes: each is |sum of three samples - sum of the three
    // samples on the opposite side of the window|.
    const float g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]);
    const float g_45 = abs(p[1 + i][0 + j] + p[0 + i][0 + j] + p[0 + i][1 + j] - p[2 + i][1 + j] - p[2 + i][2 + j] - p[1 + i][2 + j]);
    const float g_90 = abs(p[0 + i][0 + j] + p[1 + i][0 + j] + p[2 + i][0 + j] - p[0 + i][2 + j] - p[1 + i][2 + j] - p[2 + i][2 + j]);
    const float g_135 = abs(p[1 + i][0 + j] + p[2 + i][0 + j] + p[2 + i][1 + j] - p[0 + i][1 + j] - p[0 + i][2 + j] - p[1 + i][2 + j]);

    const float g_0_90_max = max(g_0, g_90);
    const float g_0_90_min = min(g_0, g_90);
    const float g_45_135_max = max(g_45, g_135);
    const float g_45_135_min = min(g_45, g_135);

    float e_0_90 = 0;
    float e_45_135 = 0;

    // Flat window: bail out before dividing by the (zero) gradient sum.
    if (g_0_90_max + g_45_135_max == 0)
    {
        return vec4(0, 0, 0, 0);
    }

    // Share of the total gradient carried by the axis-aligned pair; the
    // diagonal pair gets the remainder.
    e_0_90 = min(g_0_90_max / (g_0_90_max + g_45_135_max), 1.0f);
    e_45_135 = 1.0f - e_0_90;

    // A pair is "detected" when its stronger gradient dominates its weaker
    // one by kDetectRatio, exceeds kDetectThres, and exceeds the weaker
    // gradient of the other pair.
    bool c_0_90 = (g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min);
    bool c_45_135 = (g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min);

    // Which member of each pair is the stronger one.
    bool c_g_0_90 = g_0_90_max == g_0;
    bool c_g_45_135 = g_45_135_max == g_45;

    // When both pairs are detected they split the weight; a single detected
    // pair takes the full weight of 1.
    float f_e_0_90 = (c_0_90 && c_45_135) ? e_0_90 : 1.0f;
    float f_e_45_135 = (c_0_90 && c_45_135) ? e_45_135 : 1.0f;

    float weight_0 = (c_0_90 && c_g_0_90) ? f_e_0_90 : 0.0f;
    float weight_90 = (c_0_90 && !c_g_0_90) ? f_e_0_90 : 0.0f;
    float weight_45 = (c_45_135 && c_g_45_135) ? f_e_45_135 : 0.0f;
    float weight_135 = (c_45_135 && !c_g_45_135) ? f_e_45_135 : 0.0f;

    return vec4(weight_0, weight_90, weight_45, weight_135);
}
// Computes the attenuation factor that EvalUSM multiplies into the USM term
// to reduce ringing.
//   y - five samples along the filter direction, y[2] being the centre
// Returns kContrastBoost when the contrast ratio between the two halves is
// at or below kMinContrastRatio, falling linearly to 0 at kMaxContrastRatio.
float CalcLTIFast(const float y[5]) {
    // First half: samples 0..2; second half: samples 2..4 (centre shared).
    const float a_min = min(min(y[0], y[1]), y[2]);
    const float a_max = max(max(y[0], y[1]), y[2]);

    const float b_min = min(min(y[2], y[3]), y[4]);
    const float b_max = max(max(y[2], y[3]), y[4]);

    const float a_cont = a_max - a_min;
    const float b_cont = b_max - b_min;

    // kEps keeps the division defined when the smaller contrast is zero.
    const float cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps);
    return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
}
// Evaluates the unsharp-mask term along one direction.
//   pxl               - five samples along the direction, pxl[2] being the centre
//   sharpnessStrength - gain applied to the raw USM response
//   sharpnessLimit    - symmetric clamp applied after the gain
// Returns the value to add to the centre luma for this direction.
float EvalUSM(const float pxl[5], const float sharpnessStrength, const float sharpnessLimit) {
    // USM profile
    float y_usm = -0.6001f * pxl[1] + 1.2002f * pxl[2] - 0.6001f * pxl[3];
    // boost USM profile
    y_usm *= sharpnessStrength;
    // clamp to the limit
    y_usm = min(sharpnessLimit, max(-sharpnessLimit, y_usm));
    // reduce ringing
    y_usm *= CalcLTIFast(pxl);

    return y_usm;
}
// Evaluates the USM term along all four directions through the centre pixel.
//   p - 5x5 luma neighbourhood, indexed [row][column], centre at p[2][2]
// Returns the USM terms for (0, 90, 45, 135) degrees in (x, y, z, w).
vec4 GetDirUSM(const float p[5][5]) {
    // sharpness boost & limit are the same for all directions
    const float scaleY = 1.0f - saturate((p[2][2] - kSharpStartY) * kSharpScaleY);
    // scale the ramp to sharpen as a function of luma
    const float sharpnessStrength = scaleY * kSharpStrengthScale + kSharpStrengthMin;
    // scale the ramp to limit USM as a function of luma
    const float sharpnessLimit = (scaleY * kSharpLimitScale + kSharpLimitMin) * p[2][2];

    vec4 rval;

    // 0 deg filter
    // samples taken down the centre column
    float interp0Deg[5];
    for (int i = 0; i < 5; ++i)
    {
        interp0Deg[i] = p[i][2];
    }
    rval.x = EvalUSM(interp0Deg, sharpnessStrength, sharpnessLimit);

    // 90 deg filter
    // samples taken along the centre row
    float interp90Deg[5];
    for (int i = 0; i < 5; ++i)
    {
        interp90Deg[i] = p[2][i];
    }
    rval.y = EvalUSM(interp90Deg, sharpnessStrength, sharpnessLimit);

    //45 deg filter
    // odd entries are the average of the two pixels adjacent to the diagonal
    float interp45Deg[5];
    interp45Deg[0] = p[1][1];
    interp45Deg[1] = lerp(p[2][1], p[1][2], 0.5f);
    interp45Deg[2] = p[2][2];
    interp45Deg[3] = lerp(p[3][2], p[2][3], 0.5f);
    interp45Deg[4] = p[3][3];
    rval.z = EvalUSM(interp45Deg, sharpnessStrength, sharpnessLimit);

    //135 deg filter
    float interp135Deg[5];
    interp135Deg[0] = p[3][1];
    interp135Deg[1] = lerp(p[3][2], p[2][1], 0.5f);
    interp135Deg[2] = p[2][2];
    interp135Deg[3] = lerp(p[2][3], p[1][2], 0.5f);
    interp135Deg[4] = p[1][3];
    rval.w = EvalUSM(interp135Deg, sharpnessStrength, sharpnessLimit);

    return rval;
}
// Compute entry point of the sharpening pass. Each work group handles one
// NIS_BLOCK_WIDTH x NIS_BLOCK_HEIGHT block of pixels in two stages:
//   1. cooperatively load the luma tile (block plus support border) into
//      shPixelsY,
//   2. per pixel, compute the directional USM terms, weight them with the
//      edge map and add the result to the luma written to out_image.
// The stages are separated by a barrier because stage 2 reads tile pixels
// that other invocations of the work group wrote in stage 1.
void hook() {
    uvec2 blockIdx = gl_WorkGroupID.xy;
    uint threadIdx = gl_LocalInvocationID.x;

    const int dstBlockX = int(NIS_BLOCK_WIDTH * blockIdx.x);
    const int dstBlockY = int(NIS_BLOCK_HEIGHT * blockIdx.y);

    // fill in input luma tile in batches of 2x2 pixels
    // we use texture gather to get extra support necessary
    // to compute 2x2 edge map outputs too
    // 0.5 centres the tap on a texel; kSupportSize / 2 (integer division)
    // shifts the tile origin by the support border
    const float kShift = 0.5f - kSupportSize / 2;

    for (int i = int(threadIdx) * 2; i < kNumPixelsX * kNumPixelsY / 2; i += NIS_THREAD_GROUP_SIZE * 2) {
        uvec2 pos = uvec2(uint(i) % uint(kNumPixelsX), uint(i) / uint(kNumPixelsX) * 2);
        for (int dy = 0; dy < 2; dy++) {
            for (int dx = 0; dx < 2; dx++) {
                const float tx = (dstBlockX + pos.x + dx + kShift) * kSrcNormX;
                const float ty = (dstBlockY + pos.y + dy + kShift) * kSrcNormY;
                const float px = HOOKED_tex(vec2(tx, ty)).r;
                shPixelsY[pos.y + dy][pos.x + dx] = px;
            }
        }
    }

    // the sharpening below reads tile pixels written by other invocations
    groupMemoryBarrier();
    barrier();

    for (int k = int(threadIdx); k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += NIS_THREAD_GROUP_SIZE)
    {
        const ivec2 pos = ivec2(uint(k) % uint(NIS_BLOCK_WIDTH), uint(k) / uint(NIS_BLOCK_WIDTH));

        // load 5x5 support to regs
        float p[5][5];
        for (int i = 0; i < 5; ++i)
        {
            for (int j = 0; j < 5; ++j)
            {
                p[i][j] = shPixelsY[pos.y + i][pos.x + j];
            }
        }

        // get directional filter bank output
        vec4 dirUSM = GetDirUSM(p);

        // generate weights for directional filters
        // (offsets select the 3x3 window centred on p[2][2])
        vec4 w = GetEdgeMap(p, kSupportSize / 2 - 1, kSupportSize / 2 - 1);

        // final USM is a weighted sum filter outputs
        const float usmY = (dirUSM.x * w.x + dirUSM.y * w.y + dirUSM.z * w.z + dirUSM.w * w.w);

        // do bilinear tap and correct luma texel so it produces new sharpened luma
        const int dstX = dstBlockX + pos.x;
        const int dstY = dstBlockY + pos.y;

        vec4 op = HOOKED_tex(vec2((dstX + 0.5f) * kDstNormX, (dstY + 0.5f) * kDstNormY));
        op.x += usmY;

        imageStore(out_image, ivec2(dstX, dstY), op);
    }
}
Copy link

askasys-code commented Feb 14, 2022

How is this supposed to be used?

I have this config


is this necessary?
I noticed it is ineffective with and without this.
This shader uses bilinear by default, according to the info mpv shows when pressing i.

One last question, NVSharpen.glsl is supposed to be used with NVScaler.glsl or they are standalone?
I'm using also your FSR and CAS shaders.
Finally modern upscalers can be used on mpv, thank you very much for the work!

Copy link

agyild commented Feb 14, 2022

How is this supposed to be used?

I have this config


is this necessary?
I noticed it is ineffective with and without this.
This shader uses bilinear by default, according to the info mpv shows when pressing i.

One last question, NVSharpen.glsl is supposed to be used with NVScaler.glsl or they are standalone?
I'm using also your FSR and CAS shaders.
Finally modern upscalers can be used on mpv, thank you very much for the work!

The configuration is okay, if you are only intending to run NVScaler, hwdec and gpu-api is not required. And yes, NVScaler and NVSharpen are standalone shaders, as the former includes its own sharpening algorithm. And you are not supposed to use multiple scaling algorithms, because only one of them will be able to process the non-upscaled frame, and the rest will not work. Just use a single shader, depending on your needs. The shader does not use bilinear, it's just when the content is upscaled to the output resolution, mpv automatically switches to bilinear if needed for more performance. Also don't forget that you can modify sharpening amount in all of the shaders. You can confirm if the shaders are working by pressing 2 after pressing i to display statistics overlay.

Copy link

CrHasher commented Feb 17, 2022

How does one display more info in statistics overlay? Can't fit the info that appears on my screen after pressing i+2

Edit: Answer use arrow keys Up or Down

Copy link

Obegg commented Mar 20, 2022

So the better one from those two is NVScaler.glsl?

I wonder which one is better? FSRCNNX_x2_16-0-4-1.glsl or NVScaler.glsl?

Copy link

agyild commented Mar 21, 2022

So the better one from those two is NVScaler.glsl?

They serve different purposes. NVScaler is for upscaling, NVSharpen is just for sharpening and it only works when the luma channel is at the same size as the output (i.e., non-scaled and downscaled luma).

I wonder which one is better? FSRCNNX_x2_16-0-4-1.glsl or NVScaler.glsl?

Depends. FSRCNN requires high quality sharp input and nearly 2x area scaling to give accurate results because that is what it is trained on. It is challenging to make AI-based algorithms generic because neural networks are usually good at doing one specific thing. NVScaler/FSR/CAS are generic algorithms which means they will work the same way on every input which might result in generally satisfying results but in cases where the source content is exactly like FSRCNN is trained on, then FSRCNN will give more accurate results. Note that accurate and satisfying are not the same, that's why you see DLSS and FSR 2.0 making claims such as "making it even better than native rendering".

tl;dr: FSRCNN is better only if you give it something good to begin with. Don't expect it to turn trash into a jewel, so to speak. NVScaler/FSR/CAS will do their best to turn even trash into something more pleasing to watch.

Copy link

Are these hardware-independent? Is there a way to use these on pre OpenGL 4.0 hardware like FSR?


Copy link

0042 commented Jan 5, 2023

Hi, the shader is bugged or disabled as the window width is reduced. For example, (with panscan=1) a 16:9 video at 1/2 window width has artifacts, while a 16:9 video at 1/4 window width will disable the shader.
Could you share what must be changed for it to function regardless of mpv window size? I'm using a single shader at a time because I prefer to toggle them. FSR.glsl doesn't seem to have this issue btw. Thank you

Copy link

is this the same solution that nvidia uses for RTX Video Super Resolution?

Copy link

@DavidEscalante No, NIS has no AI involved while VSR does. However I've still found NIS to be better in things like mpv than VSR is on YouTube, maybe it's due to YouTube's compression.

Copy link

askasys-code commented May 14, 2023

I've an issue with NIS, but I don't know if it's only my case.
With vo=gpu-next plus video-rotate, nvidia driver crashes with a black screen, I have to alt-tab to restore the desktop, that's pretty funny, since nvidia studio drivers should be stable like rocks. No issue so far with FSR and other upscalers.
With no vo option set, it will rotate only the chroma, not the entire video.

Copy link

@askasys-code there's nothing particularly stable about nvidia studio drivers, nor is there anything more stable compared to game ready drivers, lmao. Also, you're using gpu-next which is pretty much in the experimental stage right now, so crashes, bugs and glitches of various kinds are to be expected.

Copy link

askasys-code commented Jun 4, 2023

@askasys-code there's nothing particularly stable about nvidia studio drivers, nor is there anything more stable compared to game ready drivers, lmao. Also, you're using gpu-next which is pretty much in the experimental stage right now, so crashes, bugs and glitches of various kinds are to be expected.

I specified this

With no vo option set, it will rotate only the chroma, not the entire video.

However, there's still an issue, but it's also a non-issue, since it's a particular use, I mean, very few people rotate the video during playback with the NIS shader activated, it's a particular combination, that's why with gpu-next the driver crashes I guess XD
Thank you for the reply.

Copy link

CrHasher commented Jun 6, 2023

You can disable shaders on rotate like this:

profile-cond=(p["video-params/rotate"] == 0)

Use the glsl-shaders-clr part wisely ;)

Copy link

seamtex commented Jun 18, 2023

How do i reduce the ringing artifacts? FSR has built in de-ringing.

Copy link

agyild commented Jun 18, 2023

How do i reduce the ringing artifacts? FSR has built in de-ringing.

NIS does not have built-in deringing. You might want to try adding antiring.hook after NVScaler.glsl and configuring the radius parameter for both passes to achieve the desired effect.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment