#define LOGO "RemoveGrain 1.0\n"
// An Avisynth plugin for removing grain from progressive video
//
// By Rainer Wittmann <gorw@gmx.de>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// To get a copy of the GNU General Public License write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
// http://www.gnu.org/copyleft/gpl.html .
//#define MODIFYPLUGIN 1 // create Repair plugin instead of RemoveGrain, 0 = compatible with RemoveGrain
//#define SHARPEN 1
//#define BLUR 1
//#define SSE2_TEST // ISSE2 version that can be used side by side with the SSE version
//#define DEBUG_NAME // for debugging
//#define ISSE 2 // P4, Athlon 64, Sempron 3100
//#define ISSE 3 // Prescott P4
//#define CVERSION // for debugging only
#define ALIGNPITCH
#define SMOOTH2
#define DEFAULT_MODE 2
#define DEFAULT_RGLIMIT 0
#define VC_EXTRALEAN
#include <Windows.h>
#include <stdio.h>
#include <stdarg.h>
#include "avisynth.h"
#include "planar.h"
static IScriptEnvironment *AVSenvironment;
#ifdef SSE2_TEST
#ifndef ISSE
#define ISSE 2
#endif
#ifndef DEBUG_NAME
#define DEBUG_NAME
#endif
#endif
#ifndef ISSE
#define ISSE 1
#endif
#if ISSE > 1
#define CPUFLAGS CPUF_SSE2
#else
#define CPUFLAGS CPUF_INTEGER_SSE
#endif
#ifdef MODIFYPLUGIN
#define MAXMODE 18
#elif defined(SHARPEN)
#define MAXMODE 22
#define MAXSTRENGTH 2
#define DEFAULT_STRENGTH 1
#else
#define MAXMODE 28
#endif
#if defined(SHARPEN) && defined(MODIFYPLUGIN)
#error "SHARPEN cannot be combined with MODIFYPLUGIN"
#endif
#if defined(BLUR) && defined(MODIFYPLUGIN)
#error "SHARPEN cannot be combined with MODIFYPLUGIN"
#endif
#if 1
void debug_printf(const char *format, ...)
{
char buffer[200];
va_list args;
va_start(args, format);
_vsnprintf(buffer, sizeof(buffer) - 1, format, args);
buffer[sizeof(buffer) - 1] = 0;
va_end(args);
OutputDebugString(buffer);
}
#endif
#define COMPARE_MASK (~24)
static void CompareVideoInfo(VideoInfo &vi1, const VideoInfo &vi2, const char *progname)
{
if( (vi1.width != vi2.width) || (vi1.height != vi2.height) || ( (vi1.pixel_type & COMPARE_MASK) != (vi2.pixel_type & COMPARE_MASK) ))
{
#if 1
debug_printf("widths = %u, %u, heights = %u, %u, color spaces = %X, %X\n"
, vi1.width, vi2.width, vi1.height, vi2.height, vi1.pixel_type, vi2.pixel_type);
#endif
AVSenvironment->ThrowError("%s: clips must be of equal type", progname);
}
if(vi1.num_frames > vi2.num_frames) vi1.num_frames = vi2.num_frames;
}
#ifdef TESTCOMPARE
unsigned testcompare(const BYTE *dp, int dpitch, const BYTE *pp, int ppitch, int width, int height)
{
int i = height;
--dp; --pp;
unsigned diffsum = 0;
do
{
int j = width;
do
{
int diff = dp[j] - pp[j];
if( diff < 0 ) diff = -diff;
diffsum += diff;
} while( --j );
dp += dpitch;
pp += ppitch;
} while( --i );
return diffsum;
}
#define xpitch 1
void RemoveGrain(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int width, int height, int threshold)
{
int sinc = - (width + 1) * xpitch;
dpitch += sinc;
sinc += spitch;
do
{
dp[0] = sp[0];
dp += xpitch; sp += xpitch;
int i = width;
do
{
unsigned sort1[8];
int leq = 0;
int geq = 0;
unsigned x = sp[0];
if( (sort1[0] = (sp += xpitch)[0]) <= x )
{
if( sort1[0] == x ) ++geq;
++leq;
}
if( (sort1[1] = (sp += spitch)[0]) <= x )
{
if( sort1[1] == x ) ++geq;
++leq;
}
if( (sort1[2] = (sp -= xpitch)[0]) <= x )
{
if( sort1[2] == x ) ++geq;
++leq;
}
if( (sort1[3] = (sp -= xpitch)[0]) <= x )
{
if( sort1[3] >= x ) ++geq;
++leq;
}
if( (sort1[4] = (sp -= spitch)[0]) <= x )
{
if( sort1[4] >= x ) ++geq;
++leq;
}
if( (sort1[5] = (sp -= spitch)[0]) <= x )
{
if( sort1[5] >= x ) ++geq;
++leq;
}
if( (sort1[6] = (sp += xpitch)[0]) <= x )
{
if( sort1[6] >= x ) ++geq;
++leq;
}
if( (sort1[7] = (sp += xpitch)[0]) <= x )
{
if( sort1[7] >= x ) ++geq;
++leq;
}
if( ((geq += 8 - leq) < threshold) || (leq < threshold) )
{ // do a merge sort of sort1[8] as fast as possible
unsigned sort2[8];
if( sort1[1] < sort1[0] )
{
sort2[0] = sort1[1];
sort2[1] = sort1[0];
}
else
{
sort2[0] = sort1[0];
sort2[1] = sort1[1];
}
if( sort1[3] < sort1[2] )
{
sort2[2] = sort1[3];
sort2[3] = sort1[2];
}
else
{
sort2[2] = sort1[2];
sort2[3] = sort1[3];
}
if( sort1[5] < sort1[4] )
{
sort2[4] = sort1[5];
sort2[5] = sort1[4];
}
else
{
sort2[4] = sort1[4];
sort2[5] = sort1[5];
}
if( sort1[7] < sort1[6] )
{
sort2[6] = sort1[7];
sort2[7] = sort1[6];
}
else
{
sort2[6] = sort1[6];
sort2[7] = sort1[7];
}
if( sort2[0] > sort2[2] )
{
sort1[0] = sort2[2];
if( sort2[3] <= sort2[0] )
{
sort1[1] = sort2[3];
sort1[2] = sort2[0];
sort1[3] = sort2[1];
}
else
{
sort1[1] = sort2[0];
if( sort2[1] < sort2[3] )
{
sort1[2] = sort2[1];
sort1[3] = sort2[3];
}
else
{
sort1[2] = sort2[3];
sort1[3] = sort2[1];
}
}
}
else
{
sort1[0] = sort2[0];
if( sort2[1] <= sort2[2] )
{
sort1[1] = sort2[1];
sort1[2] = sort2[2];
sort1[3] = sort2[3];
}
else
{
sort1[1] = sort2[2];
if( sort2[3] < sort2[1] )
{
sort1[2] = sort2[3];
sort1[3] = sort2[1];
}
else
{
sort1[2] = sort2[1];
sort1[3] = sort2[3];
}
}
}
#if 0
if( (sort1[0] > sort1[1]) || (sort1[1] > sort1[2]) || (sort1[2] > sort1[3]) )
debug_printf("merge error: sort = %u, %u, %u, %u\n", sort1[0], sort1[1], sort1[2], sort1[3]);
#endif
if( sort2[4] > sort2[6] )
{
sort1[4] = sort2[6];
if( sort2[7] <= sort2[4] )
{
sort1[5] = sort2[7];
sort1[6] = sort2[4];
sort1[7] = sort2[5];
}
else
{
sort1[5] = sort2[4];
if( sort2[5] < sort2[7] )
{
sort1[6] = sort2[5];
sort1[7] = sort2[7];
}
else
{
sort1[6] = sort2[7];
sort1[7] = sort2[5];
}
}
}
else
{
sort1[4] = sort2[4];
if( sort2[5] <= sort2[6] )
{
sort1[5] = sort2[5];
sort1[6] = sort2[6];
sort1[7] = sort2[7];
}
else
{
sort1[5] = sort2[6];
if( sort2[7] < sort2[5] )
{
sort1[6] = sort2[7];
sort1[7] = sort2[5];
}
else
{
sort1[6] = sort2[5];
sort1[7] = sort2[7];
}
}
}
#if 0
if( (sort1[4] > sort1[5]) || (sort1[5] > sort1[6]) || (sort1[6] > sort1[7]) )
debug_printf("merge error: sort = %u, %u, %u, %u\n", sort1[4], sort1[5], sort1[6], sort1[7]);
#endif
unsigned *s1 = sort1, *s2 = sort1 + 4, *t = sort2;
*t++ = *s1 > *s2 ? *s2++ : *s1++;
*t++ = *s1 > *s2 ? *s2++ : *s1++;
*t++ = *s1 > *s2 ? *s2++ : *s1++;
if( sort1[3] > sort1[7] )
{
do
{
*t++ = *s1 > *s2 ? *s2++ : *s1++;
} while( s2 != sort1 + 8 );
do
{
*t++ = *s1++;
} while( s1 != sort1 + 4 );
}
else
{
do
{
*t++ = *s1 > *s2 ? *s2++ : *s1++;
} while( s1 != sort1 + 4 );
do
{
*t++ = *s2++;
} while( s2 != sort1 + 8 );
}
#if 0
if( (leq > 0) && (sort2[leq - 1] > x) ) debug_printf("leq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", leq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]);
if( (leq < 8) && (sort2[leq] <= x) ) debug_printf("leq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", leq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]);
if( (geq > 0) && (sort2[8 - geq] < x) ) debug_printf("geq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", geq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]);
if( (geq < 8) && (sort2[7 - geq] >= x) ) debug_printf("geq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", geq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]);
#endif
x = leq < threshold ? sort2[threshold - 1] : sort2[8 - threshold];
}
dp[0] = x;
dp += xpitch;
sp += spitch;
} while( --i );
dp[0] = sp[0];
dp += dpitch; sp += sinc;
} while( --height );
}
#undef xpitch
#endif // TESTCOMPARE
#if ISSE > 1
#define SSE_INCREMENT 16
#define SSE_SHIFT 4
#define SSE_MOVE movdqu
#if ISSE > 2
#define SSE3_MOVE lddqu
#else
#define SSE3_MOVE movdqu
#endif
#define SSE_RMOVE movdqa
#define SSE0 xmm0
#define SSE1 xmm1
#define SSE2 xmm2
#define SSE3 xmm3
#define SSE4 xmm4
#define SSE5 xmm5
#define SSE6 xmm6
#define SSE7 xmm7
#define SSE_EMMS
#else
#define SSE_INCREMENT 8
#define SSE_SHIFT 3
#define SSE_MOVE movq
#define SSE3_MOVE movq
#define SSE_RMOVE movq
#define SSE0 mm0
#define SSE1 mm1
#define SSE2 mm2
#define SSE3 mm3
#define SSE4 mm4
#define SSE5 mm5
#define SSE6 mm6
#define SSE7 mm7
#define SSE_EMMS __asm emms
#endif // ISSE
#if defined(SHARPEN) || defined(BLUR)
#define SHLUR
#endif
#if BLUR == 1
#define blur(center, min, max, reg1, reg2)\
__asm SSE_RMOVE reg2, center \
__asm psubusb max, center \
__asm psubusb reg2, min \
__asm SSE_RMOVE reg1, max \
__asm SSE_RMOVE min, reg2 \
__asm psubusb max, reg2 \
__asm psubusb min, reg1 \
__asm psrlw max, 1 \
__asm psrlw min, 1 \
__asm pminub reg2, max \
__asm pminub reg1, min \
__asm paddusb center, reg2 \
__asm psubusb center, reg1
#elif BLUR == 2
// clip, then blur
#define blur(center, min, max, reg1, reg2)\
__asm pminub center, max \
__asm pmaxub center, min \
__asm SSE_RMOVE reg2, center \
__asm psubusb max, center \
__asm psubusb reg2, min \
__asm SSE_RMOVE reg1, max \
__asm SSE_RMOVE min, reg2 \
__asm psubusb max, reg2 \
__asm psubusb min, reg1 \
__asm psrlw max, 1 \
__asm psrlw min, 1 \
__asm pminub reg2, max \
__asm pminub reg1, min \
__asm paddusb center, reg2 \
__asm psubusb center, reg1
#endif
#ifdef SHARPEN
static const __declspec(align(SSE_INCREMENT)) unsigned short rshift[3][SSE_INCREMENT / 2] =
{
{
0,0,0,0
#if SSE_INCREMENT == 16
,0,0,0,0
#endif
},
{
1,0,0,0
#if SSE_INCREMENT == 16
, 0,0,0,0
#endif
},
{
2,0,0,0
#if SSE_INCREMENT == 16
, 0,0,0,0
#endif
}
};
#define SHIFT_MASK0 255
#define SHIFT_MASK1 127
#define SHIFT_MASK2 63
static const __declspec(align(SSE_INCREMENT)) BYTE shift_mask[3][SSE_INCREMENT] =
{
{
SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0
#if SSE_INCREMENT == 16
, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0
#endif
},
{
SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1
#if SSE_INCREMENT == 16
, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1
#endif
},
{
SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2
#if SSE_INCREMENT == 16
, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2
#endif
}
};
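// There is no byte-wise shift in MMX/SSE, so sharpen() shifts 16-bit words
// (psrlw) by rshift[strength] and then ANDs with shift_mask[strength]
// (255/127/63) to clear the bits the word shift drags in from the
// neighbouring byte, turning it into an effective per-byte shift.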
#if SHARPEN == 1
// only sharpen
#define sharpen(center, min, max, rshift, SHIFT_MASK1, reg1, reg2)\
__asm SSE_RMOVE reg2, center \
__asm psubusb max, center \
__asm psubusb reg2, min \
__asm SSE_RMOVE reg1, max \
__asm SSE_RMOVE min, reg2 \
__asm psubusb max, reg2 \
__asm psubusb min, reg1 \
__asm psrlw reg2, rshift \
__asm psrlw reg1, rshift \
__asm pand reg2, SHIFT_MASK1 \
__asm pand reg1, SHIFT_MASK1 \
__asm pminub reg2, max \
__asm pminub reg1, min \
__asm psubusb center, reg2 \
__asm paddusb center, reg1
#elif SHARPEN == 2
// clip and sharpen
#define sharpen(center, min, max, rshift, SHIFT_MASK1, reg1, reg2)\
__asm pminub center, max \
__asm pmaxub center, min \
__asm SSE_RMOVE reg2, center \
__asm psubusb max, center \
__asm psubusb reg2, min \
__asm SSE_RMOVE reg1, max \
__asm SSE_RMOVE min, reg2 \
__asm psubusb max, reg2 \
__asm psubusb min, reg1 \
__asm psrlw reg2, rshift \
__asm psrlw reg1, rshift \
__asm pand reg2, SHIFT_MASK1 \
__asm pand reg1, SHIFT_MASK1 \
__asm pminub reg2, max \
__asm pminub reg1, min \
__asm psubusb center, reg2 \
__asm paddusb center, reg1
#endif
#endif // SHARPEN
#ifdef BLUR
#define sharpen(center, min, max, rshift, SHIFT_MASK1, reg1, reg2) blur(center, min, max, reg1, reg2)
#endif
#ifdef SHARPEN
void do_nothing(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
{
}
void copy_plane(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
{
AVSenvironment->BitBlt(dp, dpitch, sp, spitch, hblocks * SSE_INCREMENT + 2 * (SSE_INCREMENT + 1) + remainder, height);
}
#else // SHARPEN
void do_nothing(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
}
void copy_plane(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
AVSenvironment->BitBlt(dp, dpitch, sp, spitch, hblocks * SSE_INCREMENT + 2 * (SSE_INCREMENT + 1) + remainder, height);
}
#endif // SHARPEN
#define ins2(first, second, reg) \
__asm pmaxub second, reg \
__asm pminub second, first \
__asm pmaxub first, reg
#define ins3(first, second, third, reg) \
__asm pmaxub third, reg \
__asm pminub third, second \
ins2(first, second, reg)
#define ins4(first, second, third, fourth, reg) \
__asm pmaxub fourth, reg \
__asm pminub fourth, third \
ins3(first, second, third, reg)
#define ins5(first, second, third, fourth, fifth, reg) \
__asm pmaxub fifth, reg \
__asm pminub fifth, fourth \
ins4(first, second, third, fourth, reg)
#define ins6(first, second, third, fourth, fifth, sixth, reg) \
__asm pmaxub sixth, reg \
__asm pminub sixth, fifth \
ins5(first, second, third, fourth, fifth, reg)
#define add2(first, second, reg) \
__asm SSE_RMOVE second, reg \
__asm pminub second, first \
__asm pmaxub first, reg
#define add3(first, second, third, reg) \
__asm SSE_RMOVE third, reg \
__asm pminub third, second \
ins2(first, second, reg)
#define add4(first, second, third, fourth, reg) \
__asm SSE_RMOVE fourth, reg \
__asm pminub fourth, third \
ins3(first, second, third, reg)
#define add5(first, second, third, fourth, fifth, reg) \
__asm SSE_RMOVE fifth, reg \
__asm pminub fifth, fourth \
ins4(first, second, third, fourth, reg)
#define add6(first, second, third, fourth, fifth, sixth, reg) \
__asm SSE_RMOVE sixth, reg \
__asm pminub sixth, fifth \
ins5(first, second, third, fourth, fifth, reg)
#define sub2(first, second, val) \
__asm pmaxub second, val \
__asm pminub second, first
#define sub3(first, second, third, reg) \
__asm pmaxub third, reg \
__asm pminub third, second \
sub2(first, second, reg)
#define sub4(first, second, third, fourth, reg) \
__asm pmaxub fourth, reg \
__asm pminub fourth, third \
sub3(first, second, third, reg)
#define sub5(first, second, third, fourth, fifth, reg) \
__asm pmaxub fifth, reg \
__asm pminub fifth, fourth \
sub4(first, second, third, fourth, reg)
#define sub6(first, second, third, fourth, fifth, sixth, reg) \
__asm pmaxub sixth, reg \
__asm pminub sixth, fifth \
sub5(first, second, third, fourth, fifth, reg)
#define minmax1(min, max, val) \
__asm pminub min, val \
__asm pmaxub max, val
#define minmax2(max1, max2, min2, min1, reg) \
__asm pminub min2, reg \
__asm pmaxub max2, reg \
__asm pmaxub min2, min1 \
__asm pminub max2, max1 \
__asm pminub min1, reg \
__asm pmaxub max1, reg
#define minmax3(max1, max2, max3, min3, min2, min1, reg)\
__asm pminub min3, reg \
__asm pmaxub max3, reg \
__asm pmaxub min3, min2 \
__asm pminub max3, max2 \
minmax2(max1, max2, min2, min1, reg)
#define minmax2sub(max1, max2, min2, min1, val) \
__asm pminub min2, val \
__asm pmaxub max2, val \
__asm pmaxub min2, min1 \
__asm pminub max2, max1
#define minmax3sub(max1, max2, max3, min3, min2, min1, reg)\
__asm pminub min3, reg \
__asm pmaxub max3, reg \
__asm pmaxub min3, min2 \
__asm pminub max3, max2 \
minmax2sub(max1, max2, min2, min1, reg)
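// The add*/sub*/minmax* macros above implement small register sorting networks
// over the 8 neighbours of a pixel. As an illustrative reference only
// (hypothetical helper, not used by the plugin), the scalar code below shows
// the per-byte result they are built to produce for modes 1-4
// (SSE_RemoveGrain1..4): clip the centre pixel between the rank-th lowest and
// rank-th highest neighbour.
#if 0
static BYTE clip_to_neighbour_ranks(const BYTE *sp, int spitch, int rank)
{
    BYTE n[8];
    n[0] = sp[-spitch - 1]; n[1] = sp[-spitch]; n[2] = sp[-spitch + 1];
    n[3] = sp[-1];                              n[4] = sp[1];
    n[5] = sp[spitch - 1];  n[6] = sp[spitch];  n[7] = sp[spitch + 1];
    // simple insertion sort of the 8 neighbours (ascending)
    for( int i = 1; i < 8; ++i )
    {
        BYTE v = n[i];
        int j = i;
        while( (j > 0) && (n[j - 1] > v) ) { n[j] = n[j - 1]; --j; }
        n[j] = v;
    }
    BYTE lower = n[rank - 1];   // rank-th lowest neighbour
    BYTE upper = n[8 - rank];   // rank-th highest neighbour
    BYTE x = sp[0];
    return x < lower ? lower : (x > upper ? upper : x);
}
#endif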
#ifdef SHARPEN
void SSE_RemoveGrain4(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_RemoveGrain4(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm movd [edi], SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub4(SSE1, SSE2, SSE3, SSE4, SSE5)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
sub3(SSE2, SSE3, SSE4, SSE7)
#ifdef SHLUR
sharpen(SSE5, SSE4, SSE3, rshift[eax], shift_mask[eax], SSE0, SSE1)
__asm SSE_MOVE [edi + 1], SSE5
#else // SHLUR
#if ISSE > 1
__asm pmaxub SSE4, SSE5
#else
__asm pmaxub SSE4, [esi + ebx + 1]
#endif
__asm pminub SSE3, SSE4
__asm SSE_MOVE [edi + 1], SSE3
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1]
sub6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub5(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#if ISSE > 1
__asm SSE3_MOVE SSE0, [edi]
#endif
sub4(SSE2, SSE3, SSE4, SSE5, SSE7)
#if ISSE > 1
__asm pmaxub SSE5, SSE0
#else
__asm pmaxub SSE5, [edi]
#endif
__asm add esi, SSE_INCREMENT
__asm pminub SSE3, SSE5
__asm SSE_MOVE [edi], SSE3
#else // MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub4(SSE1, SSE2, SSE3, SSE4, SSE5)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
sub3(SSE2, SSE3, SSE4, SSE7)
#ifdef SHLUR
sharpen(SSE5, SSE4, SSE3, rshift[eax], shift_mask[eax], SSE0, SSE1)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE5
#else
#if ISSE > 1
__asm pmaxub SSE4, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE4, [edi]
#else // ISSE > 1
__asm pmaxub SSE4, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm add esi, SSE_INCREMENT
__asm pminub SSE3, SSE4
__asm SSE_MOVE [edi], SSE3
#endif // SHLUR
#endif //MODIFYPLUGIN == 1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE7
#endif
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1]
sub6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub5(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#if ISSE > 1
__asm SSE3_MOVE SSE0, [edi]
#endif
sub4(SSE2, SSE3, SSE4, SSE5, SSE7)
#if ISSE > 1
__asm pmaxub SSE5, SSE0
#else
__asm pmaxub SSE5, [edi]
#endif
__asm pminub SSE3, SSE5
__asm add esi, eax
__asm SSE_MOVE [edi], SSE3
#else // MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub4(SSE1, SSE2, SSE3, SSE4, SSE5)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
sub3(SSE2, SSE3, SSE4, SSE7)
#ifdef SHLUR
sharpen(SSE5, SSE4, SSE3, rshift[eax], shift_mask[eax], SSE0, SSE1)
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm SSE_MOVE [edi], SSE5
#else
#if ISSE > 1
__asm pmaxub SSE4, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE4, [edi]
#else // ISSE > 1
__asm pmaxub SSE4, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE3, SSE4
__asm add esi, eax
__asm SSE_MOVE [edi], SSE3
#endif // SHLUR
#endif //MODIFYPLUGIN == 1
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#ifdef SHARPEN
void SSE_RemoveGrain1(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_RemoveGrain1(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
#ifdef SHARPEN
__asm mov ebx, strength
__asm SSE_RMOVE SSE2, rshift[ebx]
__asm SSE_RMOVE SSE3, shift_mask[ebx]
#endif
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE5, [esi + 1]
__asm SSE_RMOVE SSE1, SSE0
__asm SSE3_MOVE SSE4, [esi + 2]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax1(SSE0, SSE1, SSE4)
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 1]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 2]
minmax1(SSE0, SSE1, SSE4)
__asm SSE3_MOVE SSE4, [esi + ebx]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE5, [esi + ebx + 2]
minmax1(SSE0, SSE1, SSE4)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
__asm movd [edi], SSE4 // only for saving the first byte
minmax1(SSE0, SSE1, SSE5)
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE2, SSE3, SSE5, SSE4)
__asm SSE_MOVE [edi + 1], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm SSE_MOVE [edi + 1], SSE0
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE5, [esi + 1]
__asm SSE_RMOVE SSE1, SSE0
__asm SSE3_MOVE SSE4, [esi + 2]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
minmax1(SSE0, SSE1, SSE4)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
minmax1(SSE0, SSE1, SSE6)
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE6, [esi + ebx]
minmax1(SSE0, SSE1, SSE4)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
minmax1(SSE0, SSE1, SSE6)
__asm SSE3_MOVE SSE4, [esi + ebx + 2]
minmax1(SSE0, SSE1, SSE5)
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
minmax1(SSE0, SSE1, SSE4)
#else // MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [esi + ebx + 2]
minmax1(SSE0, SSE1, SSE6)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
minmax1(SSE0, SSE1, SSE5)
#endif // MODIFYPLUGIN
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE2, SSE3, SSE5, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE0, SSE1
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, SSE_INCREMENT
#if ((MODIFYPLUGIN == 1) && (ISSE > 1)) || defined(SHLUR)
__asm dec ecx
__asm jnz middle_loop
#else
__asm loop middle_loop
#endif
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE5, [esi + 1]
__asm SSE_RMOVE SSE1, SSE0
__asm SSE3_MOVE SSE4, [esi + 2]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
minmax1(SSE0, SSE1, SSE4)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
minmax1(SSE0, SSE1, SSE6)
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE6, [esi + ebx]
minmax1(SSE0, SSE1, SSE4)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
minmax1(SSE0, SSE1, SSE6)
__asm SSE3_MOVE SSE4, [esi + ebx + 2]
minmax1(SSE0, SSE1, SSE5)
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
minmax1(SSE0, SSE1, SSE4)
#else // MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE5, [esi + ebx + 2]
minmax1(SSE0, SSE1, SSE6)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE5 // only for saving the last byte
#endif
minmax1(SSE0, SSE1, SSE5)
#endif // MODIFYPLUGIN == 1
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE2, SSE3, SSE5, SSE4)
__asm add esi, eax
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE0, SSE1
__asm add esi, eax
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#ifdef SHARPEN
void SSE_RemoveGrain2(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_RemoveGrain2(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE7, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm movd [edi], SSE7
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6)
#ifdef SHLUR
sharpen(SSE5, SSE2, SSE1, rshift[eax], shift_mask[eax], SSE6, SSE7)
__asm SSE_MOVE [edi + 1], SSE5
#else
#if ISSE > 1
__asm pmaxub SSE2, SSE5
#else
__asm pmaxub SSE2, [esi + ebx + 1]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi + 1], SSE1
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE7, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#else
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
#endif
add4(SSE0, SSE1, SSE2, SSE3, SSE7)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE4)
#endif
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6)
#ifdef SHLUR
sharpen(SSE5, SSE2, SSE1, rshift[eax], shift_mask[eax], SSE6, SSE7)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE5
#else
#if ISSE > 1
__asm pmaxub SSE2, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE2, [edi]
#else
__asm pmaxub SSE2, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE1, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
#endif // SHLUR
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE7, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#else
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
#endif
add4(SSE0, SSE1, SSE2, SSE3, SSE7)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE4)
#endif
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE6
#endif
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6)
#ifdef SHLUR
sharpen(SSE5, SSE2, SSE1, rshift[eax], shift_mask[eax], SSE6, SSE7)
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm SSE_MOVE [edi], SSE5
#else
#if ISSE > 1
__asm pmaxub SSE2, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE2, [edi]
#else
__asm pmaxub SSE2, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE1, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE1
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#ifdef SHARPEN
void SSE_RemoveGrain3(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_RemoveGrain3(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm movd [edi], SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6)
#ifdef SHLUR
sharpen(SSE7, SSE3, SSE2, rshift[eax], shift_mask[eax], SSE0, SSE5)
__asm SSE_MOVE [edi + 1], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE3, SSE7
#else
__asm pmaxub SSE3, [esi + ebx + 1]
#endif
__asm pminub SSE3, SSE2
__asm SSE_MOVE [edi + 1], SSE3
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1]
minmax3(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#if ISSE > 1
__asm SSE3_MOVE SSE6, [edi]
#endif
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE7)
#if ISSE > 1
__asm pmaxub SSE3, SSE6
#else
__asm pmaxub SSE3, [edi]
#endif
__asm pminub SSE3, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE3
#else // MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE7, [edi]
#else
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#endif
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6)
#ifdef SHLUR
sharpen(SSE7, SSE3, SSE2, rshift[eax], shift_mask[eax], SSE0, SSE5)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE3, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE3, [edi]
#else
__asm pmaxub SSE3, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE3, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE3
#endif // SHLUR
#endif // MODIFYPLUGIN == 1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1]
minmax3(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#if ISSE > 1
__asm SSE3_MOVE SSE6, [edi]
#endif
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE7)
#if ISSE > 1
__asm pmaxub SSE3, SSE6
#else
__asm pmaxub SSE3, [edi]
#endif
__asm pminub SSE3, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE3
#else // MODIFYPLUGIN == 1
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE7
#endif
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE7, [edi]
#else
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#endif // ISSE > 1
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6)
#ifdef SHLUR
sharpen(SSE7, SSE3, SSE2, rshift[eax], shift_mask[eax], SSE0, SSE5)
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE3, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE3, [edi]
#else
__asm pmaxub SSE3, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE3, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE3
#endif // SHLUR
#endif // MODIFYPLUGIN == 1
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
// if( weight2[i] <= weight1[i] ) { value1[i] = value2[i]; weight1[i] = weight2[i]; }
// value2 remains unchanged
// weight2 must be an SSE register; value2 may be a memory operand,
// but value1 and weight1 must be registers because they are written to (and each is used twice)
#define mergeweighted(value1, weight1, value2, weight2) \
__asm pminub weight1, weight2 \
__asm pcmpeqb weight2, weight1 \
__asm psubusb value1, weight2 \
__asm pand weight2, value2 \
__asm por value1, weight2
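// Illustrative scalar equivalent of mergeweighted() for a single byte lane
// (hypothetical helper, not part of the plugin): keep whichever candidate has
// the smaller weight, without branching, exactly as the pminub/pcmpeqb/
// psubusb/pand/por sequence does.
#if 0
static void merge_weighted_scalar(BYTE *value1, BYTE *weight1, BYTE value2, BYTE weight2)
{
    *weight1 = weight2 < *weight1 ? weight2 : *weight1;        // pminub
    BYTE mask = (BYTE)((weight2 == *weight1) ? 0xFF : 0x00);   // pcmpeqb
    BYTE keep = (BYTE)(*value1 <= mask ? 0 : *value1 - mask);  // psubusb: clears value1 where mask is set
    *value1 = (BYTE)(keep | (mask & value2));                  // pand + por: substitute value2 there
}
#endif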
#define merge2weighted(val1, val2, weight1, val1b, val2b, weight2) \
__asm pminub weight1, weight2 \
__asm pcmpeqb weight2, weight1 \
__asm psubusb val1, weight2 \
__asm psubusb val2, weight2 \
__asm pand val1b, weight2 \
__asm pand val2b, weight2 \
__asm por val1, val1b \
__asm por val2, val2b
#ifndef SHLUR
#if MODIFYPLUGIN > 0
#define diagweight5(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm pmaxub newp, center \
__asm pminub reg1, center \
__asm psubusb reg2, newp \
__asm pminub newp, oldp \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2
#else
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped with min(bound1, bound2) and max(bound1, bound2)
// finally weight = |oldp - newp|
// oldp is left unchanged
#define diagweight5(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm pminub newp, oldp \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2
#endif
#ifdef MODIFYPLUGIN
#define diagweightw5(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight5(oldp, newp, weight, center, bound1, bound2, reg1, reg2)
#else
// same as diagweight5, but in addition bound2 is written to wmem
#define diagweightw5(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm pminub newp, oldp \
__asm SSE_MOVE wmem, reg1 \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2
#endif // MODIFYPLUGIN
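// Illustrative scalar sketch of diagweight5() for one byte lane in the
// RemoveGrain build (hypothetical helper, not part of the plugin): newp is
// oldp clipped to the range spanned by a pair of opposing neighbours, and
// weight is how far oldp lies outside that range, i.e. |oldp - newp|.
// diag5() evaluates this for the four opposing neighbour pairs and keeps,
// via mergeweighted(), the clipped value with the smallest weight.
#if 0
static void diagweight5_scalar(BYTE oldp, BYTE bound1, BYTE bound2, BYTE *newp, BYTE *weight)
{
    BYTE lower = bound1 < bound2 ? bound1 : bound2;
    BYTE upper = bound1 > bound2 ? bound1 : bound2;
    *newp = oldp < lower ? lower : (oldp > upper ? upper : oldp);
    *weight = oldp > upper ? (BYTE)(oldp - upper) : (oldp < lower ? (BYTE)(lower - oldp) : 0);
}
#endif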
void diag5(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweight5(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw5(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight5(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight5(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw5(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi], SSE1
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#if MODIFYPLUGIN > 0
#define diagweight6(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm pmaxub newp, center \
__asm pminub weight, center \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, weight \
__asm paddusb weight, reg1
#else
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped with min(bound1, bound2) and max(bound1, bound2)
// finally weight = 2*|oldp - newp| + |bound1 - bound2|
// oldp is left unchanged
#define diagweight6(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, weight \
__asm paddusb weight, reg1
#endif
#ifdef MODIFYPLUGIN
#define diagweightw6(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight6(oldp, newp, weight, center, bound1, bound2, reg1, reg2)
#else
// same as diagweight6, but in addition bound2 is written to wmem
#define diagweightw6(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm SSE_MOVE wmem, reg1 \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, weight \
__asm paddusb weight, reg1
#endif // MODIFYPLUGIN
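// Illustrative scalar sketch of diagweight6() for one byte lane in the
// RemoveGrain build (hypothetical helper, not part of the plugin): the
// clipping is the same as in diagweight5, but the weight also penalises
// neighbour pairs whose bounds lie far apart:
//     weight = saturate(2*|oldp - newp| + |bound1 - bound2|)
// diagweight7 and diagweight8 below are analogous, with
// |oldp - newp| + |bound1 - bound2| and |oldp - newp| + 2*|bound1 - bound2|.
#if 0
static void diagweight6_scalar(BYTE oldp, BYTE bound1, BYTE bound2, BYTE *newp, BYTE *weight)
{
    int lower = bound1 < bound2 ? bound1 : bound2;
    int upper = bound1 > bound2 ? bound1 : bound2;
    int clipped = oldp < lower ? lower : (oldp > upper ? upper : oldp);
    int dist = oldp > clipped ? oldp - clipped : clipped - oldp;
    int w = 2 * dist + (upper - lower);
    *newp = (BYTE)clipped;
    *weight = (BYTE)(w > 255 ? 255 : w);
}
#endif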
void diag6(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweight6(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw6(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight6(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight6(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw6(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi], SSE1
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#if MODIFYPLUGIN > 0
#define diagweight7(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm pmaxub newp, center \
__asm pminub weight, center \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#else
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped with min(bound1, bound2) and max(bound1, bound2)
// finally weight = |oldp - newp| + |bound1 - bound2|
// oldp is left unchanged
#define diagweight7(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#endif
#ifdef MODIFYPLUGIN
#define diagweightw7(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight7(oldp, newp, weight, center, bound1, bound2, reg1, reg2)
#else
// same as diagweight7, but in addition bound2 is written to wmem
#define diagweightw7(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm SSE_MOVE wmem, reg1 \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#endif // MODIFYPLUGIN
void diag7(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw7(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw7(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi], SSE1
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
void diag7b(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
__asm movd SSE6, [esi + ebx]
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm movd [edi], SSE6
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
__asm movd SSE6, [esi + ebx + 6]
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm movd [edi + 5], SSE6
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi], SSE1
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#if MODIFYPLUGIN > 0
#define diagweight8(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm pmaxub newp, center \
__asm pminub weight, center \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm paddusb reg1, reg1 \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#else
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped with min(bound1, bound2) and max(bound1, bound2)
// finally weight = |oldp - newp| + 2*|bound1 - bound2|
// oldp is left unchanged
#define diagweight8(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm paddusb reg1, reg1 \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#endif
#ifdef MODIFYPLUGIN
#define diagweightw8(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight8(oldp, newp, weight, center, bound1, bound2, reg1, reg2)
#else
// same as diagweight8, but in addition bound2 is written to wmem
#define diagweightw8(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm SSE_MOVE wmem, reg1 \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm paddusb reg1, reg1 \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#endif // MODIFYPLUGIN
void diag8(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweight8(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw8(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight8(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight8(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw8(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi], SSE1
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#endif // #ifndef SHLUR
#if MODIFYPLUGIN > 0
#define get_min_weight(min, weight, center, mem1, mem2,reg) \
__asm SSE3_MOVE min, mem1 \
__asm SSE3_MOVE reg, mem2 \
__asm SSE_RMOVE weight, min \
__asm pminub min, center \
__asm pmaxub weight, center \
__asm pminub min, reg \
__asm pmaxub weight, reg \
__asm psubusb weight, min
#else
#define get_min_weight(min, weight, center, mem1, mem2,reg) \
__asm SSE3_MOVE min, mem1 \
__asm SSE3_MOVE reg, mem2 \
__asm SSE_RMOVE weight, min \
__asm pminub min, reg \
__asm pmaxub weight, reg \
__asm psubusb weight, min
#endif
#ifdef MODIFYPLUGIN
#define get_min_weightw(min, weight, center, mem1, mem2, wmem, reg) get_min_weight(min, weight, center, mem1, mem2,reg)
#else
#define get_min_weightw(min, weight, center, mem1, mem2, wmem, reg) \
__asm SSE3_MOVE min, mem1 \
__asm SSE3_MOVE reg, mem2 \
__asm SSE_RMOVE weight, min \
__asm pminub min, reg \
__asm pmaxub weight, reg \
__asm SSE_MOVE wmem, reg \
__asm psubusb weight, min
#endif // MODIFYPLUGIN
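// get_min_weight returns in `min` the smaller of one opposing neighbour pair (for
// MODIFYPLUGIN > 0 the source centre pixel is included in the comparison) and in
// `weight` the spread max - min of that set; the *w variant additionally stores the
// second neighbour so the border pixel survives the later result store.  diag9 below
// picks the pair with the smallest spread (via mergeweighted, as above) and clips the
// centre pixel (the pixel of the second clip in the MODIFYPLUGIN build) into
// [min, min + weight]; in the SHARPEN/BLUR (SHLUR) build the separately defined
// sharpen macro is applied to that interval instead of a plain clip.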
#ifdef SHARPEN
void diag9(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void diag9(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
#ifdef SHARPEN
__asm mov ebx, strength
__asm SSE_RMOVE SSE4, rshift[ebx]
__asm SSE_RMOVE SSE6, shift_mask[ebx]
#endif
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
get_min_weight(SSE0, SSE1, SSE5, [esi], [esi + 2*ebx + 2], SSE7)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx], [esi + 2], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weightw(SSE2, SSE3, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
__asm paddusb SSE1, SSE0
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE4, SSE6, SSE2, SSE3)
__asm SSE_MOVE [edi + 1], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm SSE_MOVE [edi + 1], SSE0
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
get_min_weight(SSE0, SSE1, SSE5,[esi], [esi + 2*ebx + 2], SSE7)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx], [esi + 2], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weight(SSE2, SSE3, SSE5, [esi + ebx + 2], [esi + ebx], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE7, [edi]
#else
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#endif // (ISSE > 1) || defined(SHLUR)
__asm paddusb SSE1, SSE0
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE4, SSE6, SSE2, SSE3)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE0, SSE1
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
get_min_weight(SSE0, SSE1, SSE5, [esi], [esi + 2*ebx + 2], SSE7)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx], [esi + 2], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weightw(SSE2, SSE3, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE7, [edi]
#else
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#endif // (ISSE > 1) || defined(SHLUR)
__asm paddusb SSE1, SSE0
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE4, SSE6, SSE2, SSE3)
__asm add esi, eax
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE0, SSE1
__asm add esi, eax
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#define get_val_weight(val, weight, mem, center, reg) \
__asm SSE3_MOVE val, mem \
__asm SSE_RMOVE weight, center \
__asm SSE_RMOVE reg, center \
__asm pmaxub weight, val \
__asm pminub reg, val \
__asm psubusb weight, reg
#ifdef MODIFYPLUGIN
#define get_val_weightw(val, weight, mem, center, wmem, reg) get_val_weight(val, weight, mem, center, reg)
#else
#define get_val_weightw1(val, weight, mem, center, wmem, reg) \
__asm SSE3_MOVE val, mem \
__asm SSE_RMOVE weight, center \
__asm SSE_RMOVE reg, center \
__asm pmaxub weight, val \
__asm pminub reg, val \
__asm movd wmem, val \
__asm psubusb weight, reg
#define get_val_weightw(val, weight, mem, center, wmem, reg) \
__asm SSE3_MOVE val, mem \
__asm SSE_RMOVE weight, center \
__asm SSE_RMOVE reg, center \
__asm pmaxub weight, val \
__asm pminub reg, val \
__asm SSE_MOVE wmem, val \
__asm psubusb weight, reg
#endif // MODIFYPLUGIN
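// get_val_weight loads one neighbour into `val` and sets weight = |val - center|;
// the *w variants additionally write the neighbour to wmem so that the border pixel
// survives the subsequent result store (the same trick used in diag8/diag9).
// SSE_RemoveGrain10 therefore replaces each pixel with the neighbour closest to it
// in value; in the MODIFYPLUGIN (Repair) build the reference value is the pixel of
// the second clip ([edi]) while the candidates come from the source clip.
// Reference sketch in C (illustrative only, not compiled here):
//   best = 256; out = center;
//   for each of the 8 neighbours n { d = abs(n - center); if (d < best) { best = d; out = n; } }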
void SSE_RemoveGrain10(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
get_val_weightw1(SSE2, SSE3, [esi + ebx], SSE1, [edi], SSE7)
get_val_weight(SSE4, SSE5, [esi + ebx + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
__asm SSE_MOVE SSE4, SSE2
__asm pminub SSE1, SSE2
__asm pmaxub SSE1, SSE4
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE1, [edi]
#else
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
#endif
get_val_weight(SSE2, SSE3, [esi + ebx], SSE1, SSE7)
#if MODIFYPLUGIN == 1
get_val_weight(SSE4, SSE5, [esi + ebx + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
#endif
get_val_weight(SSE4, SSE5, [esi + ebx + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
__asm SSE_MOVE SSE4, SSE2
__asm pminub SSE1, SSE2
__asm add esi, SSE_INCREMENT
__asm pmaxub SSE1, SSE4
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE1, [edi]
#else
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
#endif
get_val_weight(SSE2, SSE3, [esi + ebx], SSE1, SSE7)
#if MODIFYPLUGIN == 1
get_val_weight(SSE4, SSE5, [esi + ebx + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
#endif
get_val_weightw(SSE4, SSE5, [esi + ebx + 2], SSE1, [edi + 1], SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
__asm SSE_MOVE SSE4, SSE2
__asm pminub SSE1, SSE2
__asm pmaxub SSE1, SSE4
__asm add esi, eax
__asm SSE_MOVE [edi], SSE1
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#if !(defined(MODIFYPLUGIN) || defined(SHLUR))
#define neighbourdiff(minus, plus, center1, center2, neighbour, nullreg) \
__asm SSE_RMOVE center1, center2 \
__asm psubusb center2, neighbour \
__asm psubusb neighbour, center1 \
__asm SSE_RMOVE minus, center2 \
__asm SSE_RMOVE plus, neighbour \
__asm pcmpeqb center2, nullreg \
__asm pcmpeqb neighbour, nullreg \
__asm por minus, center2 \
__asm pand center2, neighbour \
__asm por plus, neighbour \
__asm psubusb minus, center2 \
__asm psubusb plus, center2
#define neighbourdiff_w(minus, plus, center1, center2, dest, neighbour, nullreg, mwrite) \
__asm SSE_RMOVE center1, center2 \
__asm mwrite dest, neighbour \
__asm psubusb center2, neighbour \
__asm psubusb neighbour, center1 \
__asm SSE_RMOVE minus, center2 \
__asm SSE_RMOVE plus, neighbour \
__asm pcmpeqb center2, nullreg \
__asm pcmpeqb neighbour, nullreg \
__asm por minus, center2 \
__asm pand center2, neighbour \
__asm por plus, neighbour \
__asm psubusb minus, center2 \
__asm psubusb plus, center2
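// neighbourdiff produces the two one-sided differences between the centre and one
// neighbour with a special encoding: where the centre is larger, `minus` holds
// centre - neighbour and `plus` is forced to 255; where the neighbour is larger,
// `plus` holds neighbour - centre and `minus` is forced to 255; where the two pixels
// are equal, both are forced to 0.  Taking the byte-wise minimum over all eight
// neighbours (as done below) therefore yields the smallest downward and upward steps
// to any neighbour, and a single equal neighbour disables sharpening for that pixel.
// The *_w variant additionally writes the neighbour block to dest for the border trick.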
#define SHIFT_MASK1 127
static const __declspec(align(SSE_INCREMENT)) BYTE shift_mask[SSE_INCREMENT] =
{
SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1
#if SSE_INCREMENT == 16
, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1
#endif
};
#define sharpen(center, minus, plus, reg1, reg2)\
__asm SSE_RMOVE reg1, minus \
__asm SSE_RMOVE reg2, plus \
__asm psubusb reg1, plus \
__asm psubusb reg2, minus \
__asm psrlw plus, 1 \
__asm psrlw minus, 1 \
__asm pand plus, shift_mask \
__asm pand minus, shift_mask \
__asm pminub plus, reg1 \
__asm pminub minus, reg2 \
__asm paddusb center, plus \
__asm psubusb center, minus
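// sharpen nudges the centre towards its nearer neighbour level: it adds
// min(plus/2, minus - plus) and subtracts min(minus/2, plus - minus), all with
// saturating byte arithmetic, so the move is at most half the smaller distance and
// never more than the difference between the two distances.  The psrlw/pand pair with
// SHIFT_MASK1 (0x7F) emulates a per-byte shift right by one, since there is no psrlb
// instruction.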
void nondestructivesharpen(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm mov edx, remainder
__asm pxor SSE0, SSE0
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
__asm SSE3_MOVE SSE3, [esi + ebx]
neighbourdiff_w(SSE4, SSE5, SSE2, SSE1, [edi], SSE3, SSE0, movd)
__asm SSE3_MOVE SSE3, [esi + ebx + 2]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 1]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 1]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 2]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
sharpen(SSE1, SSE4, SSE5, SSE6, SSE7)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
__asm SSE3_MOVE SSE3, [esi + ebx]
neighbourdiff(SSE4, SSE5, SSE2, SSE1, SSE3, SSE0)
__asm SSE3_MOVE SSE3, [esi + ebx + 2]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 1]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 1]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 2]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm add esi, SSE_INCREMENT
sharpen(SSE1, SSE4, SSE5, SSE6, SSE7)
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
__asm SSE3_MOVE SSE3, [esi + ebx]
neighbourdiff(SSE4, SSE5, SSE2, SSE1, SSE3, SSE0)
__asm SSE3_MOVE SSE3, [esi + ebx + 2]
neighbourdiff_w(SSE6, SSE7, SSE1, SSE2, [edi + 1], SSE3, SSE0, SSE_MOVE)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 1]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 1]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 2]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm add esi, eax
sharpen(SSE1, SSE4, SSE5, SSE6, SSE7)
__asm SSE_MOVE [edi], SSE1
__asm add edi, dpitch
__asm dec height
__asm jnz column_loop
}
#endif // !(defined(MODIFYPLUGIN) || defined(SHLUR))
#ifndef MODIFYPLUGIN
#define convolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr + spitch + 1] \
__asm SSE_MOVE reg1, reg0 \
__asm punpcklbw reg0, nullreg \
__asm punpckhbw reg1, nullreg \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm paddusw reg0, reg0 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg1 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 1] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm paddusw reg0, reg0 \
__asm paddusw reg1, reg1 \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg1, reg5 \
__asm paddusw reg0, bias_correction \
__asm paddusw reg1, bias_correction \
__asm psraw reg0, 4 \
__asm psraw reg1, 4 \
__asm packuswb reg0, reg1 \
__asm SSE_MOVE [daddr], reg0
#define convolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr + spitch + 1] \
__asm SSE_MOVE reg1, reg0 \
__asm punpcklbw reg0, nullreg \
__asm punpckhbw reg1, nullreg \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm paddusw reg0, reg0 \
__asm SSE_MOVE reg3, reg2 \
__asm movd [daddr], reg2 \
__asm paddusw reg1, reg1 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 1] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm paddusw reg0, reg0 \
__asm paddusw reg1, reg1 \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg1, reg5 \
__asm paddusw reg0, bias_correction \
__asm paddusw reg1, bias_correction \
__asm psraw reg0, 4 \
__asm psraw reg1, 4 \
__asm packuswb reg0, reg1 \
__asm SSE_MOVE [daddr + 1], reg0
#define convolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr + spitch + 1] \
__asm SSE_MOVE reg1, reg0 \
__asm punpcklbw reg0, nullreg \
__asm punpckhbw reg1, nullreg \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm paddusw reg0, reg0 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg1 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm SSE_MOVE [daddr + 1], reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 1] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm paddusw reg0, reg0 \
__asm paddusw reg1, reg1 \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg1, reg5 \
__asm paddusw reg0, bias_correction \
__asm paddusw reg1, bias_correction \
__asm psraw reg0, 4 \
__asm psraw reg1, 4 \
__asm packuswb reg0, reg1 \
__asm SSE_MOVE [daddr], reg0
static const __declspec(align(SSE_INCREMENT)) unsigned short convolution_bias[SSE_INCREMENT/2] =
{
8,8,8,8
#if SSE_INCREMENT == 16
,8,8,8,8
#endif
};
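// The convolution macros implement the 3x3 binomial kernel (1 2 1 / 2 4 2 / 1 2 1) / 16
// in 16-bit words: the centre is doubled, the four edge neighbours are added, that sum
// is doubled again, the four corners are added once, and convolution_bias (8) makes the
// final shift by 4 round to nearest.  The CVERSION branch of SSE_RemoveGrain11 below is
// the plain C equivalent; the _w1/_w2 variants additionally copy the left/right border
// pixel, as in the other routines.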
void SSE_RemoveGrain11(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
#ifdef CVERSION
_sp -= spitch;
int width = (hblocks + 2) * SSE_INCREMENT + remainder;
int spitch2 = spitch - width;
dpitch -= width;
do
{
int w = width;
dp[0] = _sp[spitch];
do
{
*++dp = (2*(_sp[spitch] + 2 * _sp[spitch + 1] + _sp[spitch + 2] + _sp[1] + _sp[2 * spitch + 1])
+ _sp[0] + _sp[2] + _sp[2 * spitch] + _sp[2 * spitch + 2] + 8) / 16;
++_sp;
} while( --w );
dp[1] = _sp[spitch + 1];
dp += dpitch;
_sp += spitch2;
} while( --height );
#else
__asm SSE_RMOVE SSE7, convolution_bias
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm pxor SSE6, SSE6
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
convolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
convolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
convolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
#endif
}
#define flatconvolution(daddr, saddr, spitch, nullreg, onenineth, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE_RMOVE reg2, reg0 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg0, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + ebx] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + ebx + 1] \
__asm SSE3_MOVE reg1, [saddr + ebx + 2] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2*ebx] \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 1] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 2] \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm paddusw reg0, flatconvolution_bias \
__asm paddusw reg2, flatconvolution_bias \
__asm pmulhuw reg0, onenineth \
__asm pmulhuw reg2, onenineth \
__asm packuswb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#define flatconvolution_w1(daddr, saddr, spitch, nullreg, onenineth, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE_RMOVE reg2, reg0 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg0, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + ebx] \
__asm SSE_RMOVE reg5, reg4 \
__asm movd [daddr], reg1 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + ebx + 1] \
__asm SSE3_MOVE reg1, [saddr + ebx + 2] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2*ebx] \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 1] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 2] \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm paddusw reg0, flatconvolution_bias \
__asm paddusw reg2, flatconvolution_bias \
__asm pmulhuw reg0, onenineth \
__asm pmulhuw reg2, onenineth \
__asm packuswb reg0, reg2 \
__asm SSE_MOVE [daddr + 1], reg0
#define flatconvolution_w2(daddr, saddr, spitch, nullreg, onenineth, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE_RMOVE reg2, reg0 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg0, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + ebx] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + ebx + 1] \
__asm SSE3_MOVE reg1, [saddr + ebx + 2] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_MOVE [daddr + 1], reg1 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2*ebx] \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 1] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 2] \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm paddusw reg0, flatconvolution_bias \
__asm paddusw reg2, flatconvolution_bias \
__asm pmulhuw reg0, onenineth \
__asm pmulhuw reg2, onenineth \
__asm packuswb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#define FLATBIAS 4
static const __declspec(align(SSE_INCREMENT)) unsigned short flatconvolution_bias[SSE_INCREMENT/2] =
{
FLATBIAS, FLATBIAS, FLATBIAS, FLATBIAS
#if SSE_INCREMENT == 16
, FLATBIAS, FLATBIAS, FLATBIAS, FLATBIAS
#endif
};
#define ONENINETH (unsigned short)(((1u << 16) + 4) / 9)
static const __declspec(align(SSE_INCREMENT)) unsigned short onenineth[SSE_INCREMENT/2] =
{
ONENINETH, ONENINETH, ONENINETH, ONENINETH
#if SSE_INCREMENT == 16
, ONENINETH, ONENINETH, ONENINETH, ONENINETH
#endif
};
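// ONENINETH is a fixed-point reciprocal of 9: (2^16 + 4) / 9 = 7282.  pmulhuw keeps
// the high 16 bits of the product, and the nine-pixel sum plus the bias of 4 is at
// most 9*255 + 4 = 2299, so (sum + 4) * 7282 >> 16 equals (sum + 4) / 9 for every
// possible input -- the rounded mean of the 3x3 block that SSE_RemoveGrain20 computes,
// matching its CVERSION branch below.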
void SSE_RemoveGrain20(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
#ifdef CVERSION
_sp -= spitch;
int width = (hblocks + 2) * SSE_INCREMENT + remainder;
int spitch2 = spitch - width;
dpitch -= width;
do
{
int w = width;
dp[0] = _sp[spitch];
do
{
*++dp = (BYTE)((_sp[0] + _sp[1] + _sp[2] + _sp[spitch] + _sp[spitch + 1] + _sp[spitch + 2] + _sp[2 * spitch]
+ _sp[2 * spitch + 1] + _sp[2 * spitch + 2] + 4) / 9);
++_sp;
} while( --w );
dp[1] = _sp[spitch + 1];
dp += dpitch;
_sp += spitch2;
} while( --height );
#else
__asm SSE_RMOVE SSE7, onenineth
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm pxor SSE6, SSE6
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
flatconvolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
flatconvolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
flatconvolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
#endif
}
#if ISSE > 1
#define fconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg1 \
__asm pavgb reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 1]\
__asm pavgb reg0, reg1 \
__asm SSE3_MOVE reg4, [saddr + spitch] \
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + spitch + 1]\
__asm pavgb reg0, reg2 \
__asm pavgb reg4, reg5 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg1, reg4 \
__asm pavgb reg0, reg1 \
__asm SSE_MOVE [daddr], reg0
#define fconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg1 \
__asm pavgb reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 1]\
__asm pavgb reg0, reg1 \
__asm SSE3_MOVE reg4, [saddr + spitch] \
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg3 \
__asm movd [daddr], reg4 \
__asm SSE3_MOVE reg1, [saddr + spitch + 1]\
__asm pavgb reg0, reg2 \
__asm pavgb reg4, reg5 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg1, reg4 \
__asm pavgb reg0, reg1 \
__asm SSE_MOVE [daddr + 1], reg0
#define fconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg1 \
__asm pavgb reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 1]\
__asm pavgb reg0, reg1 \
__asm SSE3_MOVE reg4, [saddr + spitch] \
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + spitch + 1]\
__asm pavgb reg0, reg2 \
__asm pavgb reg4, reg5 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg1, reg4 \
__asm SSE_MOVE [daddr + 1], reg5 \
__asm pavgb reg0, reg1 \
__asm SSE_MOVE [daddr], reg0
#else
#define fconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg2, [saddr + 2*spitch + 2]\
__asm pavgb reg0, [saddr + 1] \
__asm pavgb reg2, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg4, [saddr + spitch] \
__asm pavgb reg0, reg2 \
__asm pavgb reg4, [saddr + spitch + 2]\
__asm psubusb reg0, bias_correction \
__asm pavgb reg4, [saddr + spitch + 1]\
__asm pavgb reg0, reg4 \
__asm SSE_MOVE [daddr], reg0
#define fconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg2, [saddr + 2*spitch + 2]\
__asm pavgb reg0, [saddr + 1] \
__asm pavgb reg2, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg4, [saddr + spitch] \
__asm pavgb reg0, reg2 \
__asm movd [daddr], reg4 \
__asm pavgb reg4, [saddr + spitch + 2]\
__asm psubusb reg0, bias_correction \
__asm pavgb reg4, [saddr + spitch + 1]\
__asm pavgb reg0, reg4 \
__asm SSE_MOVE [daddr + 1], reg0
#define fconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg2, [saddr + 2*spitch + 2]\
__asm pavgb reg0, [saddr + 1] \
__asm pavgb reg2, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg4, [saddr + spitch + 2]\
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr + 1], reg4 \
__asm pavgb reg4, [saddr + spitch] \
__asm psubusb reg0, bias_correction \
__asm pavgb reg4, [saddr + spitch + 1]\
__asm pavgb reg0, reg4 \
__asm SSE_MOVE [daddr], reg0
#endif // ISSE
static const __declspec(align(SSE_INCREMENT)) unsigned char fconvolution_bias[SSE_INCREMENT] =
{
1,1,1,1,1,1,1,1
#if SSE_INCREMENT == 16
,1,1,1,1,1,1,1,1
#endif
};
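// fconvolution approximates the same 1-2-1 binomial kernel as SSE_RemoveGrain11, but
// built entirely from pavgb averages, essentially
//   avg( avg(avg(avg(NW,NE), N), avg(avg(SW,SE), S)) - 1, avg(C, avg(W, E)) );
// the CVERSION branch of SSE_RemoveGrain12 below shows the same nested averaging in C
// (without the bias correction).  Every pavgb rounds upwards, so fconvolution_bias (1)
// is subtracted once to offset the accumulated rounding; the _w1/_w2 variants again
// copy the border pixels.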
void SSE_RemoveGrain12(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
#ifdef CVERSION
_sp -= spitch;
int width = (hblocks + 2) * SSE_INCREMENT + remainder;
int spitch2 = spitch - width;
dpitch -= width;
do
{
int w = width;
dp[0] = _sp[spitch];
do
{
*++dp = ((((_sp[0] + _sp[2] + 1) / 2 + _sp[1] + 1) / 2 + ((_sp[2*spitch] + _sp[2*spitch + 2] + 1) / 2 + _sp[2*spitch + 1] + 1) / 2 + 1)/2
+ ((_sp[spitch] + _sp[spitch + 2] + 1) / 2 + _sp[spitch + 1] + 1) / 2) / 2;
++_sp;
} while( --w );
dp[1] = _sp[spitch + 1];
dp += dpitch;
_sp += spitch2;
} while( --height );
#else
__asm SSE_RMOVE SSE7, fconvolution_bias
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm pxor SSE6, SSE6
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
fconvolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
fconvolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
fconvolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
#endif
}
#if ISSE > 1
#define rconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg2 \
__asm pavgb reg1, reg3 \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + 1] \
__asm pavgb reg0, reg1 \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg4 \
__asm pavgb reg3, reg5 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg2, reg3 \
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#define rconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg2 \
__asm pavgb reg1, reg3 \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + 1] \
__asm pavgb reg0, reg1 \
__asm movd [daddr], reg2 \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg4 \
__asm pavgb reg3, reg5 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg2, reg3 \
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr + 1], reg0
#define rconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg2 \
__asm pavgb reg1, reg3 \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + 1] \
__asm pavgb reg0, reg1 \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg4 \
__asm pavgb reg3, reg5 \
__asm SSE_MOVE [daddr + 1], reg5 \
__asm pavgb reg2, reg3 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#else
#define rconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg1, [saddr + 2*spitch + 2]\
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + 1] \
__asm pavgb reg0, reg1 \
__asm pavgb reg2, [saddr + 2*spitch + 1]\
__asm pavgb reg3, [saddr + spitch + 2]\
__asm psubusb reg0, bias_correction \
__asm pavgb reg2, reg3 \
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#define rconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg1, [saddr + 2*spitch + 2]\
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + 1] \
__asm movd [daddr], reg2 \