Skip to content

Instantly share code, notes, and snippets.

@justinruggles
Created July 30, 2012 13:32
Show Gist options
  • Save justinruggles/3206921 to your computer and use it in GitHub Desktop.
Save justinruggles/3206921 to your computer and use it in GitHub Desktop.
From 926c0121c9fd3b7234f175648db87bdb6e3c15e5 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 20 Jul 2012 19:53:40 -0400
Subject: [PATCH] dsputil: x86: use cpuflags for apply_window_int16()
---
libavcodec/x86/dsputil_mmx.c | 12 +++---
libavcodec/x86/dsputil_yasm.asm | 79 +++++++++++++++++---------------------
2 files changed, 41 insertions(+), 50 deletions(-)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index d9c8e96..a23e167 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2490,13 +2490,13 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
-void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
+void ff_apply_window_int16_mmx2 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
-void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
+void ff_apply_window_int16_ba_mmx2 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
-void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
+void ff_apply_window_int16_ba_sse2 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
@@ -2729,9 +2729,9 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
if (avctx->flags & CODEC_FLAG_BITEXACT) {
- c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
+ c->apply_window_int16 = ff_apply_window_int16_ba_mmx2;
} else {
- c->apply_window_int16 = ff_apply_window_int16_mmxext;
+ c->apply_window_int16 = ff_apply_window_int16_mmx2;
}
#endif
}
@@ -2913,7 +2913,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
if (avctx->flags & CODEC_FLAG_BITEXACT) {
- c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
+ c->apply_window_int16 = ff_apply_window_int16_ba_sse2;
} else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->apply_window_int16 = ff_apply_window_int16_sse2;
}
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index ea07644..291e7ad 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -208,24 +208,22 @@ SCALARPRODUCT_LOOP 0
; const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
-%macro REVERSE_WORDS_MMXEXT 1-2
- pshufw %1, %1, 0x1B
-%endmacro
-
-%macro REVERSE_WORDS_SSE2 1-2
+%macro REVERSE_WORDS 1-2
+%if cpuflag(ssse3) && notcpuflag(atom)
+ pshufb %1, %2
+%elif cpuflag(sse2)
pshuflw %1, %1, 0x1B
pshufhw %1, %1, 0x1B
pshufd %1, %1, 0x4E
-%endmacro
-
-%macro REVERSE_WORDS_SSSE3 2
- pshufb %1, %2
+%elif cpuflag(mmx2)
+ pshufw %1, %1, 0x1B
+%endif
%endmacro
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
-%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
+%macro MUL16FIXED 3 ; dst, src, temp
mova %3, %1
pmulhw %1, %2
pmullw %3, %2
@@ -234,22 +232,26 @@ SCALARPRODUCT_LOOP 0
por %1, %3
%endmacro
-; dst = ((dst * src) + (1<<14)) >> 15
-%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
- pmulhrsw %1, %2
-%endmacro
-
-%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
-cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
+%macro APPLY_WINDOW_INT16 1-2 ; %1=1 to build the bit-exact variant (only affects mmxext/sse2), %2=optional function name suffix
+cglobal apply_window_int16%2, 4,5,6, output, input, window, offset, offset2
lea offset2q, [offsetq-mmsize]
-%if %2
+%if cpuflag(ssse3) && notcpuflag(atom)
+ mova m2, [pb_revwords]
+%elif %1
mova m5, [pd_16384]
-%elifidn %1, ssse3
- mova m5, [pb_revwords]
- ALIGN 16
%endif
.loop:
-%if %2
+%if cpuflag(ssse3)
+ ; This version does the 16x16->16 multiplication in-place without expanding
+ ; to 32-bit. The ssse3 result is bit-identical to the bit-exact version below.
+ mova m0, [windowq+offset2q]
+ mova m1, [ inputq+offset2q]
+ pmulhrsw m1, m0
+ REVERSE_WORDS m0, m2
+ pmulhrsw m0, [ inputq+offsetq ]
+ mova [outputq+offset2q], m1
+ mova [outputq+offsetq ], m0
+%elif %1
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
@@ -285,16 +287,6 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
psrad m2, 15
packssdw m0, m2
mova [outputq+offsetq], m0
-%elif %3
- ; This version does the 16x16->16 multiplication in-place without expanding
- ; to 32-bit. The ssse3 version is bit-identical.
- mova m0, [windowq+offset2q]
- mova m1, [ inputq+offset2q]
- pmulhrsw m1, m0
- REVERSE_WORDS m0, m5
- pmulhrsw m0, [ inputq+offsetq ]
- mova [outputq+offset2q], m1
- mova [outputq+offsetq ], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
@@ -314,21 +306,20 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
REP_RET
%endmacro
-INIT_MMX
-%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
-%define MUL16FIXED MUL16FIXED_MMXEXT
-APPLY_WINDOW_INT16 mmxext, 0, 0
-APPLY_WINDOW_INT16 mmxext_ba, 1, 0
-INIT_XMM
-%define REVERSE_WORDS REVERSE_WORDS_SSE2
-APPLY_WINDOW_INT16 sse2, 0, 0
-APPLY_WINDOW_INT16 sse2_ba, 1, 0
-APPLY_WINDOW_INT16 ssse3_atom, 0, 1
-%define REVERSE_WORDS REVERSE_WORDS_SSSE3
-APPLY_WINDOW_INT16 ssse3, 0, 1
+INIT_MMX mmx2
+APPLY_WINDOW_INT16 0
+APPLY_WINDOW_INT16 1, _ba
+INIT_XMM sse2
+APPLY_WINDOW_INT16 0
+APPLY_WINDOW_INT16 1, _ba
+INIT_XMM ssse3
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3,atom
+APPLY_WINDOW_INT16 1
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
+INIT_MMX
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
movq mm0, [topq]
movq mm2, mm0
--
1.7.1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment