Last active
August 20, 2021 19:47
libavutil tx x86 assembly
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;****************************************************************************** | |
;* This file is part of FFmpeg. | |
;* | |
;* FFmpeg is free software; you can redistribute it and/or | |
;* modify it under the terms of the GNU Lesser General Public | |
;* License as published by the Free Software Foundation; either | |
;* version 2.1 of the License, or (at your option) any later version. | |
;* | |
;* FFmpeg is distributed in the hope that it will be useful, | |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
;* Lesser General Public License for more details. | |
;* | |
;* You should have received a copy of the GNU Lesser General Public | |
;* License along with FFmpeg; if not, write to the Free Software | |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
;****************************************************************************** | |
%include "x86util.asm"

; Struct pointer fields are 8 bytes on x86-64 and 4 bytes on x86-32;
; "ptr" reserves the native pointer size inside the struc below.
%if ARCH_X86_64
%define ptr resq
%else
%define ptr resd
%endif

; Declare the external double-precision FFT cosine tables:
; ff_cos_16_double, ff_cos_32_double, ... doubling 14 times up to
; ff_cos_131072_double (cextern applies the ff_ prefix mangling).
%assign i 16
%rep 14
cextern cos_ %+ i %+ _double ; ff_cos_i_double...
%assign i (i << 1)
%endrep
; Field offsets into the C-side transform context. This layout must stay in
; sync with the corresponding C struct definition (libavutil/tx internals) —
; only .revtab is dereferenced in this file, via [lutq + AVTXContext.revtab].
struc AVTXContext
    .n:           resd 1 ; Non-power-of-two part
    .m:           resd 1 ; Power-of-two part
    .inv:         resd 1 ; Is inverse
    .type:        resd 1 ; Type
    .flags:       resq 1 ; Flags
    .scale:       resq 1 ; Scale
    .exptab:      ptr 1  ; MDCT exptab
    .tmp:         ptr 1  ; Temporary buffer needed for all compound transforms
    .pfatab:      ptr 1  ; Input/Output mapping for compound transforms
    .revtab:      ptr 1  ; Input mapping for power of two transforms
    .inplace_idx: ptr 1  ; Required indices to revtab for in-place transforms
    .arch_priv:   ptr 1  ; Arch-specific private data, will be av_free()'d on exit
endstruc
SECTION_RODATA 32           ; 32-byte aligned for ymm loads

; Per-lane sign masks for xorpd: POS (+0.0) leaves a lane untouched,
; NEG (-0.0) flips its sign bit. Mask names spell the lane signs in
; memory order, lane 0 first (p = +, m = -).
%define POS 0x0000000000000000
%define NEG 0x8000000000000000

%define M_SQRT1_2 0.707106781186547524401       ; 1/sqrt(2) == cos(pi/4)
%define COS16_1 0.92387950420379638671875       ; presumably cos(1*pi/8) — TODO confirm
%define COS16_3 0.3826834261417388916015625     ; presumably cos(3*pi/8) — TODO confirm

mask_ppmm:  dq POS, POS, NEG, NEG
mask_pppm:  dq POS, POS, POS, NEG
mask_pmpm:  dq POS, NEG, POS, NEG
mask_pmpp:  dq POS, NEG, POS, POS
mask_mppm:  dq NEG, POS, POS, NEG

; Twiddle-factor multiplier vectors used by FFT8/FFT16 (lane 2 of mult_sqrt
; is negated; the even/odd tables pair each constant with its +/- variant).
mult_sqrt:  dq M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2
mult_even1: dq 1.0, 1.0, M_SQRT1_2, M_SQRT1_2
mult_even2: dq 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
mult_odd11: dq COS16_1, COS16_1, COS16_3, COS16_3
mult_odd12: dq COS16_1, -COS16_1, COS16_3, -COS16_3
mult_odd21: dq COS16_3, -COS16_3, COS16_1, -COS16_1
mult_odd22: dq -COS16_3, -COS16_3, -COS16_1, -COS16_1

SECTION .text
;------------------------------------------------------------------------------
; FFT4 - 4-point double-precision complex transform, fully in-register.
; Data layout: one interleaved complex double (re, im) per 128-bit lane,
; i.e. two complex points per ymm register.
; %1 - in, out
; %2 - in, out
; %3 - tmp (also referenced as xmm%3, so it must be a plain m# register)
;------------------------------------------------------------------------------
%macro FFT4 3
    subpd       %3, %1, %2          ; first butterfly stage: differences
    addpd       %1, %2              ; first butterfly stage: sums

    ; Cross-lane rearrangement to line up the second butterfly stage.
    vinsertf128 %2, %1, xmm%3, 1
    vpermpd     %3, %3, q3223
    vpermpd     %1, %1, q3232
    vinsertf128 %1, %1, xmm%3, 1

    subpd       %3, %2, %1          ; second butterfly stage
    addpd       %1, %2

    ; Interleave the halves back into output order across %1/%2.
    vextractf128 xmm%2, %1, 1
    vinsertf128 %1, %1, xmm%3, 1
    vinsertf128 %3, %3, xmm%2, 0
    vpermpd     %2, %3, q1230
%endmacro
;------------------------------------------------------------------------------
; FFT8 - 8-point double-precision complex transform.
; On entry %1-%4 hold the (already permuted) input, two complex points per
; register. On exit the results are in %1, %2 (from the even/"q" path) and
; %3, %4 (from the odd/"z"/"t" path); %5-%7 are clobbered scratch.
; %1 - in, out
; %2 - in, out
; %3 - in, out
; %4 - in, out
; %5 - tmp
; %6 - tmp
; %7 - tmp
;------------------------------------------------------------------------------
%macro FFT8 7
    ; Stage 1 butterflies: split into sum ("q") and difference ("r") halves.
    subpd       %5, %1, %3          ; r1234
    subpd       %6, %2, %4          ; r5678
    addpd       %1, %3              ; q1234
    addpd       %2, %4              ; q5678

    ; Even half: combine q-pairs with per-lane sign flips.
    movapd      %7, [mask_ppmm]
    vinsertf128 %3, %1, xmm%1, 1    ; q1212
    vinsertf128 %4, %2, xmm%2, 1    ; q5656
    vpermpd     %1, %1, q3232       ; q3434
    vpermpd     %2, %2, q3232       ; q7878
    xorpd       %1, %7
    xorpd       %2, %7
    addpd       %3, %1              ; s1234
    addpd       %4, %2              ; s5678

    shufpd      %4, %4, q0012       ; s5687
    xorpd       %4, [mask_pppm]
    addpd       %1, %3, %4          ; q1234
    subpd       %2, %3, %4          ; q5678

    ; Odd half: rearrange the difference terms and apply sign masks.
    ; NOTE(review): %5 uses q3113 while %6 uses q1331 — asymmetric on purpose
    ; or a transcription slip? Verify against the C reference transform.
    movapd      %7, [mask_pmpm]
    vpermpd     %3, %5, q0220
    vpermpd     %4, %6, q0220
    vpermpd     %5, %5, q3113
    vpermpd     %6, %6, q1331
    xorpd       %5, %7
    xorpd       %6, %7
    addpd       %5, %3              ; z1234
    addpd       %6, %4              ; z5678

    vpermpd     %4, %6, q0330
    vpermpd     %6, %6, q1221
    xorpd       %4, %7
    addpd       %6, %4              ; t5678

    ; Final combine: scale the t-terms by 1/sqrt(2) and fold into z.
    shufpd      %5, %5, q0012
    xorpd       %3, %5, [mask_pmpp] ; z1243
    movapd      %4, %3
    mulpd       %6, [mult_sqrt]
    addpd       %3, %6
    subpd       %4, %6
%endmacro
;------------------------------------------------------------------------------
; FFT16 - 16-point double-precision complex transform, built split-radix
; style from one FFT8 (%1-%4) and two FFT4s (%5/%7 and %6/%8), then a
; twiddle/combine pass using the mult_even*/mult_odd* tables.
; Uses FMA for the odd-twiddle products when assembled with cpuflag(fma3).
; %1  - in, out
; %2  - in, out
; %3  - in, out
; %4  - in, out
; %5  - in, out
; %6  - in, out
; %7  - in, out
; %8  - in, out
; %9  - tmp
; %10 - tmp
; %11 - tmp
; %12 - tmp
;------------------------------------------------------------------------------
%macro FFT16 12
    FFT8        %1, %2, %3, %4, %9, %10, %11
    FFT4        %5, %7, %9
    FFT4        %6, %8, %9

    ; Combine the two FFT4 halves: swap 128-bit lanes, swap re/im within
    ; each lane, sign-flip, then add/sub-merge into %5/%6.
    vperm2f128  %9, %5, %5, q0120
    vperm2f128  %10, %6, %6, q0120
    vpermilpd   %9, %9, 0b0101
    vpermilpd   %10, %10, 0b0101
    xorpd       %9, [mask_ppmm]
    addsubpd    %5, %9
    addsubpd    %6, %10

    ; Odd twiddles: w = a*odd1 + swap(a)*odd2 (fused on FMA3).
    vpermilpd   %9, %7, 0b0101
    vpermilpd   %10, %8, 0b0101
    mulpd       %7, [mult_odd11]
    mulpd       %8, [mult_odd12]
%if cpuflag(fma3)
    fmaddpd     %7, %9, [mult_odd21], %7
    fmaddpd     %8, %10, [mult_odd22], %8
%else
    mulpd       %9, [mult_odd21]
    mulpd       %10, [mult_odd22]
    addpd       %7, %9
    addpd       %8, %10
%endif

    ; Even twiddles and final butterflies against the FFT8 outputs %1-%4.
    mulpd       %5, [mult_even1]
    mulpd       %6, [mult_even2]
    xorpd       %10, %6, [mask_pmpm]
    xorpd       %12, %8, [mask_pmpm]
    addsubpd    %6, %5
    addsubpd    %8, %7
    addpd       %9, %5, %10
    addpd       %11, %7, %12
    vpermilpd   %10, %6, 0b0101
    vpermilpd   %12, %8, 0b0101
    addpd       %5, %3, %11
    addpd       %6, %4, %12
    subpd       %7, %3, %11
    subpd       %8, %4, %12
    subpd       %3, %1, %9
    subpd       %4, %2, %10
    addpd       %1, %9
    addpd       %2, %10
%endmacro
; NOTE(review): empty placeholder — the generic split-radix combine pass
; (needed for sizes above 32) is not implemented in this snapshot.
%macro SPLIT_RADIX_COMBINE 0
%endmacro
;------------------------------------------------------------------------------
; LOAD_LUT - gather %1 complex doubles from [inq] into m0..m(%1/2 - 1),
; permuted through the 32-bit index table at [lutq] (the revtab).
; Two complex points (2 x 16 bytes) are packed per ymm register, one per
; 128-bit lane. Clobbers tmpd/tmpq.
;------------------------------------------------------------------------------
%macro LOAD_LUT 1
%assign i 0
%rep %1 / 2
    %assign j 0
    %rep 2
        mov         tmpd, [lutq + (i * 2 + j) * 4]  ; 32-bit index; zero-extends into tmpq
        imul        tmpd, 2 * 8                     ; index -> byte offset (16 B per complex double)
        vinsertf128 m %+ i, m %+ i, [inq + tmpq], j ; load one complex into lane j of m<i>
    %assign j j+1
    %endrep
%assign i i+1
%endrep
%endmacro
;------------------------------------------------------------------------------
; EXTRACT_OUT - store the %1 complex results held in m0..m(%1/2 - 1) to
; [outq], one 128-bit lane (one complex double) at a time. The address
; (2*j + k)*32 + i*16 interleaves the first-half registers (i = 0) and
; second-half registers (i = 1) into the final output order.
;------------------------------------------------------------------------------
%macro EXTRACT_OUT 1
%assign i 0
%rep 2
    %assign j 0
    %rep %1 / 4
        %assign k 0
        %rep 2
            %assign l j + i * (%1 / 4)          ; source register index
            vextractf128 [outq + (2 * j + k) * 4 * 8 + i * 2 * 8], m %+ l, k
        %assign k k+1
        %endrep
    %assign j j+1
    %endrep
%assign i i+1
%endrep
%endmacro
;------------------------------------------------------------------------------
; 4-point double FFT entry point (AVX). Presumed av_tx_fn-style signature:
; arg0 = AVTXContext* (received in "lut", then replaced by ctx->revtab),
; arg1 = out, arg2 = in — TODO confirm against the C dispatcher.
;------------------------------------------------------------------------------
INIT_YMM avx
cglobal fft4_double, 4, 4, 3, lut, out, in, tmp
    mov     lutq, [lutq + AVTXContext.revtab]   ; lutq: ctx* -> ctx->revtab
    LOAD_LUT 4                                  ; permuted load into m0-m1
    FFT4    m0, m1, m2
    EXTRACT_OUT 4
    RET
;------------------------------------------------------------------------------
; 8-point double FFT entry point (AVX). Same calling convention as
; fft4_double: arg0 is the context, from which revtab is fetched.
;------------------------------------------------------------------------------
INIT_YMM avx
cglobal fft8_double, 4, 4, 7, lut, out, in, tmp
    mov     lutq, [lutq + AVTXContext.revtab]   ; lutq: ctx* -> ctx->revtab
    LOAD_LUT 8                                  ; permuted load into m0-m3
    FFT8    m0, m1, m2, m3, m4, m5, m6
    EXTRACT_OUT 8
    RET
;------------------------------------------------------------------------------
; Instantiate the 16-point entry point for a given instruction set (%1).
; The fma3 variant differs only inside FFT16, where cpuflag(fma3) selects
; fused multiply-adds for the odd-twiddle products.
;------------------------------------------------------------------------------
%macro FFT16_FN 1
INIT_YMM %1
cglobal fft16_double, 4, 4, 12, lut, out, in, tmp
    mov     lutq, [lutq + AVTXContext.revtab]   ; lutq: ctx* -> ctx->revtab
    LOAD_LUT 16                                 ; permuted load into m0-m7
    FFT16   m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11
    EXTRACT_OUT 16
    RET
%endmacro

FFT16_FN avx
FFT16_FN fma3
;------------------------------------------------------------------------------
; 32-point double FFT entry point (AVX, x86-64 only: uses m8-m15).
; Split-radix shape: an FFT8 on revtab entries 16..23 (m0-m3) is twiddled
; with the ff_cos_32_double table, then an FFT16 is run on entries 0..15
; (m8-m15).
;
; NOTE(review): this looks unfinished — revtab entries 24..31 are never
; loaded, only four of the eight output registers (m2, m3, m6, m7) are
; stored, and the FFT16 results in m8-m15 are never written to [outq].
; Also, FFT16's tmp args here are m1/m2/m4/m5, which clobber m1 (w4567)
; and m2 before the stores below; confirm the intended final combine.
;------------------------------------------------------------------------------
INIT_YMM avx
cglobal fft32_double, 4, 4, 16, lut, out, in, tmp
    mov     lutq, [lutq + AVTXContext.revtab]   ; lutq: ctx* -> ctx->revtab

; Load revtab entries 16..23 (8 complex doubles) into m0-m3.
%assign i 0
%rep 4
    %assign j 0
    %rep 2
        mov         tmpd, [lutq + ((i + 8) * 2 + j) * 4]
        imul        tmpd, 2 * 8                 ; index -> 16-byte offset
        vinsertf128 m %+ i, m %+ i, [inq + tmpq], j
    %assign j j+1
    %endrep
%assign i i+1
%endrep

    FFT8    m0, m1, m2, m3, m8, m9, m10

    ; Twiddle the FFT8 outputs: w = swap(a)*t1 + sign(a*t2), with t1/t2
    ; broadcast from ff_cos_32_double.
    vpermilpd   m4, m0, 5                       ; swap re/im in each lane
    vpermilpd   m5, m1, 5
    vpermilpd   m6, m2, 5
    vpermilpd   m7, m3, 5

    vbroadcastsd m8,  [cos_32_double + 0 * 8]   ; t1[0]
    vbroadcastsd m9,  [cos_32_double + 2 * 8]   ; t1[2]
    vbroadcastsd m10, [cos_32_double + 4 * 8]   ; t1[4]
    vbroadcastsd m11, [cos_32_double + 6 * 8]   ; t1[6]

    mulpd   m4, m8
    mulpd   m6, m9
    mulpd   m5, m10
    mulpd   m7, m11

    vbroadcastsd m8,  [cos_32_double + 8 * 8]   ; t2[7]
    vbroadcastsd m9,  [cos_32_double + 6 * 8]   ; t2[5]
    vbroadcastsd m10, [cos_32_double + 4 * 8]   ; t2[3]
    vbroadcastsd m11, [cos_32_double + 2 * 8]   ; t2[1]

    mulpd   m0, m8
    mulpd   m2, m9
    mulpd   m1, m10
    mulpd   m3, m11

    xorpd   m0, [mask_mppm]
    xorpd   m1, [mask_mppm]
    xorpd   m2, [mask_mppm]
    xorpd   m3, [mask_mppm]

    addpd   m0, m4                              ; w01 j01
    addpd   m1, m5                              ; w45 j45
    addpd   m2, m6                              ; w23 j23
    addpd   m3, m7                              ; w67 j67

    ; Regroup the w/j halves across registers, then form t and r terms.
    vperm2f128  m4, m0, m2, 0x31                ; j0123
    vperm2f128  m5, m1, m3, 0x31                ; j4567
    vinsertf128 m0, m0, xmm2, 1                 ; w0123
    vinsertf128 m1, m1, xmm3, 1                 ; w4567

    addpd       m2, m4, m0                      ; t1032
    addpd       m3, m5, m1                      ; t5476
    vpermilpd   m2, m2, 0b0101                  ; t0123
    vpermilpd   m3, m3, 0b0101                  ; t4567
    subpd       m6, m0, m4
    subpd       m7, m1, m5
    xorpd       m6, [mask_pmpm]                 ; r0123
    xorpd       m7, [mask_pmpm]                 ; r4567

; Load revtab entries 0..15 (16 complex doubles) into m8-m15.
%assign i 8
%rep 8
    %assign j 0
    %rep 2
        mov         tmpd, [lutq + ((i - 8) * 2 + j) * 4]
        imul        tmpd, 2 * 8                 ; index -> 16-byte offset
        vinsertf128 m %+ i, m %+ i, [inq + tmpq], j
    %assign j j+1
    %endrep
%assign i i+1
%endrep

    FFT16   m8, m9, m10, m11, m12, m13, m14, m15, m1, m2, m4, m5

    ; Partial store of the first-half combine results (see NOTE above).
    movapd  [outq + 0 * 4 * 8], m2
    movapd  [outq + 1 * 4 * 8], m3
    movapd  [outq + 2 * 4 * 8], m6
    movapd  [outq + 3 * 4 * 8], m7
    RET
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment