libavutil tx x86 assembly (gist by @luc65r, last active August 20, 2021)
;******************************************************************************
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
%if ARCH_X86_64
%define ptr resq
%else
%define ptr resd
%endif
%assign i 16
%rep 14
cextern cos_ %+ i %+ _double ; resolves to ff_cos_<i>_double
%assign i (i << 1)
%endrep
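; The loop above declares the external cosine tables ff_cos_16_double
; through ff_cos_131072_double (14 doublings starting at 16), which are
; defined on the C side of libavutil's tx code.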
struc AVTXContext
.n: resd 1 ; Non-power-of-two part
.m: resd 1 ; Power-of-two part
.inv: resd 1 ; Is inverse
.type: resd 1 ; Type
.flags: resq 1 ; Flags
.scale: resq 1 ; Scale
.exptab: ptr 1 ; MDCT exptab
.tmp: ptr 1 ; Temporary buffer needed for all compound transforms
.pfatab: ptr 1 ; Input/Output mapping for compound transforms
.revtab: ptr 1 ; Input mapping for power of two transforms
.inplace_idx: ptr 1 ; Required indices to revtab for in-place transforms
.arch_priv: ptr 1 ; Arch-specific private data, will be av_free()'d on exit
endstruc
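; Note: this layout must stay in sync, field for field, with the C
; definition of AVTXContext (libavutil/tx_priv.h); the .revtab loads in
; the entry points below rely on these offsets. "ptr" expands to resq
; or resd above so pointer-sized fields match the target word size.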
SECTION_RODATA 32
%define POS 0x0000000000000000
%define NEG 0x8000000000000000
%define M_SQRT1_2 0.707106781186547524401
%define COS16_1 0.92387950420379638671875
%define COS16_3 0.3826834261417388916015625
mask_ppmm: dq POS, POS, NEG, NEG
mask_pppm: dq POS, POS, POS, NEG
mask_pmpm: dq POS, NEG, POS, NEG
mask_pmpp: dq POS, NEG, POS, POS
mask_mppm: dq NEG, POS, POS, NEG
mult_sqrt: dq M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2
mult_even1: dq 1.0, 1.0, M_SQRT1_2, M_SQRT1_2
mult_even2: dq 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
mult_odd11: dq COS16_1, COS16_1, COS16_3, COS16_3
mult_odd12: dq COS16_1, -COS16_1, COS16_3, -COS16_3
mult_odd21: dq COS16_3, -COS16_3, COS16_1, -COS16_1
mult_odd22: dq -COS16_3, -COS16_3, -COS16_1, -COS16_1
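; COS16_1 = cos(pi/8) and COS16_3 = cos(3*pi/8) = sin(pi/8), the twiddle
; factors needed by the 16-point transform. (The literals above look
; like single-precision roundings of the exact values, e.g.
; cos(pi/8) = 0.9238795325...; worth double-checking for a
; double-precision transform.) The mask_* vectors flip the sign bit of
; selected lanes when combined with xorpd; the names read p (keep) or
; m (negate) from the lowest lane upward.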
SECTION .text
; %1 - in, out
; %2 - in, out
; %3 - tmp
%macro FFT4 3
subpd %3, %1, %2
addpd %1, %2
vinsertf128 %2, %1, xmm%3, 1
vpermpd %3, %3, q3223
vpermpd %1, %1, q3232
vinsertf128 %1, %1, xmm%3, 1
subpd %3, %2, %1
addpd %1, %2
vextractf128 xmm%2, %1, 1
vinsertf128 %1, %1, xmm%3, 1
vinsertf128 %3, %3, xmm%2, 0
vpermpd %2, %3, q1230
%endmacro
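; FFT4 computes a 4-point complex FFT in place over two ymm registers,
; each holding two complex doubles as interleaved re/im pairs. The
; add/sub pairs are the radix-2 butterflies; the vpermpd/vinsertf128
; shuffles move data across the 128-bit lanes and, in effect, turn the
; multiply by -i of the odd difference term into a plain re/im swap.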
; %1 - in, out
; %2 - in, out
; %3 - in, out
; %4 - in, out
; %5 - tmp
; %6 - tmp
; %7 - tmp
%macro FFT8 7
subpd %5, %1, %3 ; r1234
subpd %6, %2, %4 ; r5678
addpd %1, %3 ; q1234
addpd %2, %4 ; q5678
movapd %7, [mask_ppmm]
vinsertf128 %3, %1, xmm%1, 1 ; q1212
vinsertf128 %4, %2, xmm%2, 1 ; q5656
vpermpd %1, %1, q3232 ; q3434
vpermpd %2, %2, q3232 ; q7878
xorpd %1, %7
xorpd %2, %7
addpd %3, %1 ; s1234
addpd %4, %2 ; s5678
shufpd %4, %4, q0012 ; s5687
xorpd %4, [mask_pppm]
addpd %1, %3, %4 ; q1234
subpd %2, %3, %4 ; q5678
movapd %7, [mask_pmpm]
vpermpd %3, %5, q0220
vpermpd %4, %6, q0220
vpermpd %5, %5, q3113
vpermpd %6, %6, q1331
xorpd %5, %7
xorpd %6, %7
addpd %5, %3 ; z1234
addpd %6, %4 ; z5678
vpermpd %4, %6, q0330
vpermpd %6, %6, q1221
xorpd %4, %7
addpd %6, %4 ; t5678
shufpd %5, %5, q0012
xorpd %3, %5, [mask_pmpp] ; z1243
movapd %4, %3
mulpd %6, [mult_sqrt]
addpd %3, %6
subpd %4, %6
%endmacro
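; FFT8 computes an 8-point complex FFT over four ymm registers (two
; complex doubles each). The mask_* sign flips implement the +-1 and
; -+i twiddles of the first two stages, and mult_sqrt applies the
; (1 -+ i)/sqrt(2) twiddles (the +-M_SQRT1_2 factors) needed by the
; odd outputs in the final stage.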
; %1 - in, out
; %2 - in, out
; %3 - in, out
; %4 - in, out
; %5 - in, out
; %6 - in, out
; %7 - in, out
; %8 - in, out
; %9 - tmp
; %10 - tmp
; %11 - tmp
; %12 - tmp
%macro FFT16 12
FFT8 %1, %2, %3, %4, %9, %10, %11
FFT4 %5, %7, %9
FFT4 %6, %8, %9
vperm2f128 %9, %5, %5, q0120
vperm2f128 %10, %6, %6, q0120
vpermilpd %9, %9, 0b0101
vpermilpd %10, %10, 0b0101
xorpd %9, [mask_ppmm]
addsubpd %5, %9
addsubpd %6, %10
vpermilpd %9, %7, 0b0101
vpermilpd %10, %8, 0b0101
mulpd %7, [mult_odd11]
mulpd %8, [mult_odd12]
%if cpuflag(fma3)
fmaddpd %7, %9, [mult_odd21], %7
fmaddpd %8, %10, [mult_odd22], %8
%else
mulpd %9, [mult_odd21]
mulpd %10, [mult_odd22]
addpd %7, %9
addpd %8, %10
%endif
mulpd %5, [mult_even1]
mulpd %6, [mult_even2]
xorpd %10, %6, [mask_pmpm]
xorpd %12, %8, [mask_pmpm]
addsubpd %6, %5
addsubpd %8, %7
addpd %9, %5, %10
addpd %11, %7, %12
vpermilpd %10, %6, 0b0101
vpermilpd %12, %8, 0b0101
addpd %5, %3, %11
addpd %6, %4, %12
subpd %7, %3, %11
subpd %8, %4, %12
subpd %3, %1, %9
subpd %4, %2, %10
addpd %1, %9
addpd %2, %10
%endmacro
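; FFT16 follows the split-radix decomposition: one FFT8 on the even
; half (%1-%4) and two FFT4s on the odd quarters (%5/%7 and %6/%8),
; recombined with the cos(pi/8)/cos(3*pi/8) twiddles from the
; mult_odd* tables and the sqrt(1/2) factors baked into mult_even*.
; The fma3 branch fuses the twiddle multiply-accumulate when available.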
; Split-radix recombination of the sub-transform halves; the macro is
; an empty stub here, apparently still to be implemented (see the note
; at the end of fft32_double below).
%macro SPLIT_RADIX_COMBINE 0
%endmacro
%macro LOAD_LUT 1
%assign i 0
%rep %1 / 2
%assign j 0
%rep 2
mov tmpd, [lutq + (i * 2 + j) * 4]
imul tmpd, 2 * 8
vinsertf128 m %+ i, m %+ i, [inq + tmpq], j
%assign j j+1
%endrep
%assign i i+1
%endrep
%endmacro
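; LOAD_LUT gathers %1 complex doubles from inq in the permuted order
; given by revtab: each 32-bit entry at lutq indexes one complex double
; (16 bytes), and two of them are packed per ymm register, one per
; 128-bit lane.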
%macro EXTRACT_OUT 1
%assign i 0
%rep 2
%assign j 0
%rep %1 / 4
%assign k 0
%rep 2
%assign l j + i * (%1 / 4)
vextractf128 [outq + (2 * j + k) * 4 * 8 + i * 2 * 8], m %+ l, k
%assign k k+1
%endrep
%assign j j+1
%endrep
%assign i i+1
%endrep
%endmacro
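; EXTRACT_OUT is the output-side counterpart of LOAD_LUT: it splits
; each ymm register into its 128-bit halves and stores %1 complex
; doubles to outq, interleaving the register halves so the results land
; in the order the FFT macros produce them.
;
; The entry points below appear to follow libavutil's av_tx_fn calling
; convention (ctx, out, in, stride): the context pointer arrives in the
; register named lutq and is immediately replaced by ctx->revtab, and
; the stride argument is reused as a scratch register (tmp).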
INIT_YMM avx
cglobal fft4_double, 4, 4, 3, lut, out, in, tmp
mov lutq, [lutq + AVTXContext.revtab]
LOAD_LUT 4
FFT4 m0, m1, m2
EXTRACT_OUT 4
RET
INIT_YMM avx
cglobal fft8_double, 4, 4, 7, lut, out, in, tmp
mov lutq, [lutq + AVTXContext.revtab]
LOAD_LUT 8
FFT8 m0, m1, m2, m3, m4, m5, m6
EXTRACT_OUT 8
RET
%macro FFT16_FN 1
INIT_YMM %1
cglobal fft16_double, 4, 4, 12, lut, out, in, tmp
mov lutq, [lutq + AVTXContext.revtab]
LOAD_LUT 16
FFT16 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11
EXTRACT_OUT 16
RET
%endmacro
FFT16_FN avx
FFT16_FN fma3
INIT_YMM avx
cglobal fft32_double, 4, 4, 16, lut, out, in, tmp
mov lutq, [lutq + AVTXContext.revtab]
%assign i 0
%rep 4
%assign j 0
%rep 2
mov tmpd, [lutq + ((i + 8) * 2 + j) * 4]
imul tmpd, 2 * 8
vinsertf128 m %+ i, m %+ i, [inq + tmpq], j
%assign j j+1
%endrep
%assign i i+1
%endrep
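; The loop above is LOAD_LUT with the indices offset by 8: it gathers
; revtab entries 16-23, i.e. the second half of the 32-point input,
; into m0-m3 (8 complex doubles) for the FFT8 below.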
FFT8 m0, m1, m2, m3, m8, m9, m10
vpermilpd m4, m0, 5 ; imm 0b0101 swaps re/im within each 128-bit lane
vpermilpd m5, m1, 5
vpermilpd m6, m2, 5
vpermilpd m7, m3, 5
vbroadcastsd m8, [cos_32_double + 0 * 8] ; t1[0]
vbroadcastsd m9, [cos_32_double + 2 * 8] ; t1[2]
vbroadcastsd m10, [cos_32_double + 4 * 8] ; t1[4]
vbroadcastsd m11, [cos_32_double + 6 * 8] ; t1[6]
mulpd m4, m8
mulpd m6, m9
mulpd m5, m10
mulpd m7, m11
vbroadcastsd m8, [cos_32_double + 8 * 8] ; t2[7]
vbroadcastsd m9, [cos_32_double + 6 * 8] ; t2[5]
vbroadcastsd m10, [cos_32_double + 4 * 8] ; t2[3]
vbroadcastsd m11, [cos_32_double + 2 * 8] ; t2[1]
mulpd m0, m8
mulpd m2, m9
mulpd m1, m10
mulpd m3, m11
xorpd m0, [mask_mppm]
xorpd m1, [mask_mppm]
xorpd m2, [mask_mppm]
xorpd m3, [mask_mppm]
addpd m0, m4 ; w01 j01
addpd m1, m5 ; w45 j45
addpd m2, m6 ; w23 j23
addpd m3, m7 ; w67 j67
vperm2f128 m4, m0, m2, 0x31 ; j0123
vperm2f128 m5, m1, m3, 0x31 ; j4567
vinsertf128 m0, m0, xmm2, 1 ; w0123
vinsertf128 m1, m1, xmm3, 1 ; w4567
addpd m2, m4, m0 ; t1032
addpd m3, m5, m1 ; t5476
vpermilpd m2, m2, 0b0101 ; t0123
vpermilpd m3, m3, 0b0101 ; t4567
subpd m6, m0, m4
subpd m7, m1, m5
xorpd m6, [mask_pmpm] ; r0123
xorpd m7, [mask_pmpm] ; r4567
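; The block above applies the cos_32_double twiddle factors to the FFT8
; results: the broadcast/multiply/xor sequence forms the complex
; products, and the final add/sub/permute steps produce the t (sum) and
; r (difference) terms that the split-radix combine would consume.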
%assign i 8
%rep 8
%assign j 0
%rep 2
mov tmpd, [lutq + ((i - 8) * 2 + j) * 4]
imul tmpd, 2 * 8
vinsertf128 m %+ i, m %+ i, [inq + tmpq], j
%assign j j+1
%endrep
%assign i i+1
%endrep
FFT16 m8, m9, m10, m11, m12, m13, m14, m15, m1, m2, m4, m5
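; m8-m15 now hold the 16-point FFT of revtab entries 0-15. Note that
; FFT16's temporaries clobber m1 (w4567) and m2 (t0123) from the
; twiddle block above, only revtab entries 0-23 are ever loaded (24-31
; are not), and just four registers are stored below; together with the
; empty SPLIT_RADIX_COMBINE stub, this 32-point path looks unfinished.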
movapd [outq + 0 * 4 * 8], m2
movapd [outq + 1 * 4 * 8], m3
movapd [outq + 2 * 4 * 8], m6
movapd [outq + 3 * 4 * 8], m7
RET