Skip to content

Instantly share code, notes, and snippets.

@jeapostrophe
Created October 29, 2015 23:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeapostrophe/d54d3a6a871e5127a6ed to your computer and use it in GitHub Desktop.
Vectorizing structure reads, writes, etc examples
; External declaration of the C exit function (declared argument-less here;
; NOTE(review): libc exit really takes an i32 — tolerated because the call
; is only used as a codegen barrier and its result is ignored).
declare i32 @exit() nounwind
; The three lists below appear to be the cumulative end offset of each
; struct field measured in units of the named type (float = 4 B, i16 = 2 B,
; i8 = 1 B) when the 32-byte struct is viewed as <8 x float> / <16 x i16> /
; <32 x i8> — TODO confirm (the i16 list skips the half-unit i8 boundaries).
; float = 1, 2, 3, 4, 5, 6, 7, 8
; i16 = 2, 4, 6, 8, 10, 12, 13, 14, 15, 16
; i8 = 4, 8, 12, 16, 20, 24, 26, 28, 29, 30, 31, 32
; 32-byte struct: 6 floats (bytes 0-23), 2 i16 (24-27), 4 i8 (28-31).
%athing = type { float, float, float, float, float, float, i16, i16, i8, i8, i8, i8 }
@one = external global %athing
@two = external global %athing
; Driver for a series of codegen experiments: each section stores into @one
; a different way, and each `; RESULT:` comment records what the compiler
; emitted for it.  Every section ends in `call i32 @exit()` purely as a
; codegen barrier, so everything after the first call is formally
; unreachable at runtime.
; NOTE(review): uses the pre-LLVM-3.7 typed-pointer load syntax
; (`load %athing* @two`); modern LLVM requires an explicit result type
; (`load %athing, %athing* @two`).
define i32 @main() {
; Experiment 1: store a struct literal into @one, field by field.
store %athing { float 5.0, float 6.0,
float 1.0, float 2.0,
float 3.0, float 4.0,
i16 25, i16 15,
i8 1,
i8 2, i8 3, i8 4 }, %athing* @one
call i32 @exit()
; RESULT: Very slow with 13 memory stores
; NOTE(review): the emitted assembly appears to contain 12 stores
; (4 movb + 2 movw + 6 movl) — TODO recount.
; Experiment 2: whole-struct copy from another global.
%a = load %athing* @two
store %athing %a, %athing* @one
call i32 @exit()
; RESULT: 6 vector loads, then a lot of parsing, then 12 memory stores, some as vectors
; Experiment 3: copy from an uninitialized stack slot, to see how much
; stack space one %athing costs and how it is aligned.
%gp = alloca %athing
%g = load %athing* %gp
store %athing %g, %athing* @one
; RESULT: %rsp has 40 added to it, but we start at %rsp+8 for alignment
; Experiment 4: reinterpret both globals as <8 x float> (same 32-byte
; size) so the copy can be a single wide vector move (exchange).
%two_vector = bitcast %athing* @two to <8 x float>*
%b = load <8 x float>* %two_vector
%one_vector = bitcast %athing* @one to <8 x float>*
store <8 x float> %b, <8 x float>* %one_vector
call i32 @exit()
; RESULT: 1 vector load and 1 vector store
; Experiment 5: wide copy, but increment float lane 0 in between.
%cv_orig = load <8 x float>* %two_vector
%c_dx_orig = extractelement <8 x float> %cv_orig, i32 0
%c_dx_change = fadd float %c_dx_orig, 1.0
%cv_change = insertelement <8 x float> %cv_orig, float %c_dx_change, i32 0
store <8 x float> %cv_change, <8 x float>* %one_vector
call i32 @exit()
; RESULT: 1 vector load, 1 vector load of the constant, an addition, then a single vector store
; Experiment 6: additionally rewrite one i16 lane and one i8 lane by
; bitcasting the whole vector to <16 x i16> / <32 x i8> around each insert.
%dv_orig = load <8 x float>* %two_vector
;; change the float (lane 0, bytes 0-3)
%d_dx_orig = extractelement <8 x float> %dv_orig, i32 0
%d_dx_change = fadd float %d_dx_orig, 1.0
%dv_change0 = insertelement <8 x float> %dv_orig, float %d_dx_change, i32 0
;; change the short -- NOTE(review): i16 lane 14 is bytes 28-29, which per
;; the layout above is the first two i8 fields, not an i16 field (those are
;; lanes 12-13); possibly off by one -- TODO confirm intent
%dv_change0_s = bitcast <8 x float> %dv_change0 to <16 x i16>
%dv_change1_s = insertelement <16 x i16> %dv_change0_s, i16 5, i32 14
%dv_change1 = bitcast <16 x i16> %dv_change1_s to <8 x float>
;; change a byte (i8 lane 29 = struct byte 29, the second i8 field)
%dv_change1_b = bitcast <8 x float> %dv_change1 to <32 x i8>
%dv_change2_b = insertelement <32 x i8> %dv_change1_b, i8 66, i32 29
%dv_change2 = bitcast <32 x i8> %dv_change2_b to <8 x float>
store <8 x float> %dv_change2, <8 x float>* %one_vector
call i32 @exit()
; RESULT: virtually equivalent to the above, but with some more vector manipulations
; Experiment 7: build the whole initializer in a register: pack the two
; i16s (and the four i8s) into float-sized bit patterns and insert them
; into float lanes 6 and 7 of a constant <8 x float>.
%es_f = bitcast <2 x i16> <i16 25, i16 15> to float
%ev_fs = insertelement <8 x float> <float 5.0, float 6.0, float 1.0, float 2.0, float 3.0, float 4.0, float 0.0, float 0.0 >, float %es_f, i32 6
%eb_f = bitcast <4 x i8> <i8 1, i8 2, i8 3, i8 4> to float
%ev_fsb = insertelement <8 x float> %ev_fs, float %eb_f, i32 7
store <8 x float> %ev_fsb, <8 x float>* %one_vector
call i32 @exit()
; RESULT: lots of complicated constants loaded from memory before a single write
; Experiment 8: same goal, but insert the i16/i8 values directly through
; whole-vector bitcasts instead of packing them into floats first.
; NOTE(review): the i16 inserts land at lanes 13-14 (bytes 26-29); per the
; layout above the i16 fields are lanes 12-13, so this looks off by one
; relative to experiment 1 (which wrote bytes 24 and 26) -- TODO confirm.
%fv_fs_pre = bitcast <8 x float> <float 5.0, float 6.0, float 1.0, float 2.0, float 3.0, float 4.0, float 0.0, float 0.0 > to <16 x i16>
%fv_fs_post0 = insertelement <16 x i16> %fv_fs_pre, i16 25, i8 13
%fv_fs_post1 = insertelement <16 x i16> %fv_fs_post0, i16 15, i8 14
%fv_fsb_pre = bitcast <16 x i16> %fv_fs_post1 to <32 x i8>
%fv_fsb_post0 = insertelement <32 x i8> %fv_fsb_pre, i8 1, i8 28
%fv_fsb_post1 = insertelement <32 x i8> %fv_fsb_post0, i8 2, i8 29
%fv_fsb_post2 = insertelement <32 x i8> %fv_fsb_post1, i8 3, i8 30
%fv_fsb_post3 = insertelement <32 x i8> %fv_fsb_post2, i8 4, i8 31
%fv_fsb = bitcast <32 x i8> %fv_fsb_post3 to <8 x float>
store <8 x float> %fv_fsb, <8 x float>* %one_vector
call i32 @exit()
; RESULTS: one constant for the first four floats, then a trivial sequence of inserts, then a single write
ret i32 0
}
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 14, 5
## Constant pools the compiler emitted for @main.  LCPI0_* are local
## constant-pool labels; Mach-O groups literals into sections by size.
.section __TEXT,__literal4,4byte_literals
.align 2
LCPI0_0:
.long 1065353216 ## float 1
LCPI0_3:
.long 1082130432 ## float 4
LCPI0_4:
.long 1077936128 ## float 3
.section __TEXT,__literal16,16byte_literals
.align 4
## LCPI0_1 holds the two i16 struct values (25, 15) widened to quads;
## LCPI0_2 is the pshufb control mask that packs their low bytes into the
## first dword (a mask byte with bit 7 set, 0x80, zeroes that dest byte).
LCPI0_1:
.quad 25 ## 0x19
.quad 15 ## 0xf
LCPI0_2:
.byte 0 ## 0x0
.byte 1 ## 0x1
.byte 8 ## 0x8
.byte 9 ## 0x9
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
## LCPI0_6 holds the four i8 struct values (1..4) widened to longs;
## LCPI0_7 is the matching pshufb mask gathering byte 0 of each long.
LCPI0_6:
.long 1 ## 0x1
.long 2 ## 0x2
.long 3 ## 0x3
.long 4 ## 0x4
LCPI0_7:
.byte 0 ## 0x0
.byte 4 ## 0x4
.byte 8 ## 0x8
.byte 12 ## 0xc
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.byte 128 ## 0x80
.section __TEXT,__const
.align 5
## LCPI0_5: the full <8 x float> base constant {5,6,1,2,3,4,0,0},
## 32-byte aligned so it can be loaded with vmovaps.
LCPI0_5:
.long 1084227584 ## float 5.000000e+00
.long 1086324736 ## float 6.000000e+00
.long 1065353216 ## float 1.000000e+00
.long 1073741824 ## float 2.000000e+00
.long 1077936128 ## float 3.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 0 ## float 0.000000e+00
.long 0 ## float 0.000000e+00
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## @main
.cfi_startproc
## BB#0:
## Prologue: save callee-saved rbx/r14 and reserve 40 bytes of stack.
## The alloca'd %athing lives at 8(%rsp)..39(%rsp); experiment 7 also
## spills 4-byte values at (%rsp) and 4(%rsp).
pushq %r14
Ltmp0:
.cfi_def_cfa_offset 16
pushq %rbx
Ltmp1:
.cfi_def_cfa_offset 24
subq $40, %rsp
Ltmp2:
.cfi_def_cfa_offset 64
Ltmp3:
.cfi_offset %rbx, -24
Ltmp4:
.cfi_offset %r14, -16
movq _one@GOTPCREL(%rip), %rbx ## rbx = &_one, kept live for the whole function
## IR experiment 1: struct-literal store -> 12 scalar immediate stores
## (4 bytes, 2 words, 6 float dwords).
movb $4, 31(%rbx)
movb $3, 30(%rbx)
movb $2, 29(%rbx)
movb $1, 28(%rbx)
movw $15, 26(%rbx)
movw $25, 24(%rbx)
movl $1082130432, 20(%rbx) ## imm = 0x40800000
movl $1077936128, 16(%rbx) ## imm = 0x40400000
movl $1073741824, 12(%rbx) ## imm = 0x40000000
movl $1065353216, 8(%rbx) ## imm = 0x3F800000
movl $1086324736, 4(%rbx) ## imm = 0x40C00000
movl $1084227584, (%rbx) ## imm = 0x40A00000
callq _exit
## IR experiment 2: aggregate copy _two -> _one, fully scalarized:
## 12 field loads into registers, then 12 field stores.
movq _two@GOTPCREL(%rip), %r14 ## r14 = &_two, also kept live
vmovss (%r14), %xmm0
vmovss 4(%r14), %xmm1
vmovss 8(%r14), %xmm2
vmovss 12(%r14), %xmm3
vmovss 16(%r14), %xmm4
vmovss 20(%r14), %xmm5
movw 24(%r14), %si
movw 26(%r14), %di
movb 28(%r14), %r8b
movb 29(%r14), %al
movb 30(%r14), %cl
movb 31(%r14), %dl
movb %dl, 31(%rbx)
movb %cl, 30(%rbx)
movb %al, 29(%rbx)
movb %r8b, 28(%rbx)
movw %di, 26(%rbx)
movw %si, 24(%rbx)
vmovss %xmm5, 20(%rbx)
vmovss %xmm4, 16(%rbx)
vmovss %xmm3, 12(%rbx)
vmovss %xmm2, 8(%rbx)
vmovss %xmm1, 4(%rbx)
vmovss %xmm0, (%rbx)
callq _exit
## IR experiment 3: same field-by-field copy, but from the stack slot
## (alloca starts at 8(%rsp), i.e. %rsp+8 for alignment).
vmovss 8(%rsp), %xmm0
vmovss 12(%rsp), %xmm1
vmovss 16(%rsp), %xmm2
vmovss 20(%rsp), %xmm3
vmovss 24(%rsp), %xmm4
vmovss 28(%rsp), %xmm5
movw 32(%rsp), %si
movw 34(%rsp), %di
movb 36(%rsp), %r8b
movb 37(%rsp), %al
movb 38(%rsp), %cl
movb 39(%rsp), %dl
movb %dl, 31(%rbx)
movb %cl, 30(%rbx)
movb %al, 29(%rbx)
movb %r8b, 28(%rbx)
movw %di, 26(%rbx)
movw %si, 24(%rbx)
vmovss %xmm5, 20(%rbx)
vmovss %xmm4, 16(%rbx)
vmovss %xmm3, 12(%rbx)
vmovss %xmm2, 8(%rbx)
vmovss %xmm1, 4(%rbx)
vmovss %xmm0, (%rbx)
## IR experiment 4: <8 x float> bitcast copy -> one 32-byte load and one
## 32-byte store (vmovaps: assumes both globals are 32-byte aligned).
vmovaps (%r14), %ymm0
vmovaps %ymm0, (%rbx)
vzeroupper ## clear upper ymm state before calling out (SysV AVX convention)
callq _exit
## IR experiment 5: wide copy with float lane 0 += 1.0 (LCPI0_0 = 1.0f).
vmovaps (%r14), %ymm0
vmovss LCPI0_0(%rip), %xmm1
vaddss %xmm1, %xmm0, %xmm1
vinsertf128 $0, %xmm1, %ymm0, %ymm0
vmovaps %ymm0, (%rbx)
vzeroupper
callq _exit
## IR experiment 6: as above, plus vpinsrw/vpinsrb edits of the high
## 128-bit half, re-merged with vinsertf128.
vmovaps (%r14), %ymm0
vmovss LCPI0_0(%rip), %xmm1
vaddss %xmm1, %xmm0, %xmm1
vinsertf128 $0, %xmm1, %ymm0, %ymm1
vextractf128 $1, %ymm0, %xmm0
movl $5, %eax
vpinsrw $6, %eax, %xmm0, %xmm0 ## i16 lane 14 of the full vector (high half, word 6)
vinsertf128 $1, %xmm0, %ymm1, %ymm1
movl $66, %eax
vpinsrb $13, %eax, %xmm0, %xmm0 ## i8 lane 29 of the full vector (high half, byte 13)
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vmovaps %ymm0, (%rbx)
vzeroupper
callq _exit
## IR experiment 7: build the value from constant pools -- vpshufb packs
## the i16/i8 constants (LCPI0_1/LCPI0_6 with masks LCPI0_2/LCPI0_7),
## vinsertps splices them in via stack spills, base vector is LCPI0_5.
vmovdqa LCPI0_1(%rip), %xmm0
vpshufb LCPI0_2(%rip), %xmm0, %xmm0
vmovd %xmm0, (%rsp)
vmovss LCPI0_3(%rip), %xmm0
vmovss LCPI0_4(%rip), %xmm1
vunpcklps %xmm0, %xmm1, %xmm0 ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
vmovq %xmm0, %xmm0
vinsertps $32, (%rsp), %xmm0, %xmm0
vmovaps LCPI0_5(%rip), %ymm1
vinsertf128 $1, %xmm0, %ymm1, %ymm1
vmovdqa LCPI0_6(%rip), %xmm2
vpshufb LCPI0_7(%rip), %xmm2, %xmm2
vmovd %xmm2, 4(%rsp)
vinsertps $48, 4(%rsp), %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vmovaps %ymm0, (%rbx)
vzeroupper
callq _exit
## IR experiment 8: load the base constant once, then a straight-line run
## of vpinsrw/vpinsrb inserts into the high half before one wide store.
vmovaps LCPI0_5(%rip), %ymm1
vextractf128 $1, %ymm1, %xmm0
movl $25, %eax
vpinsrw $5, %eax, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm1
movl $15, %eax
vpinsrw $6, %eax, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm1
movl $1, %eax
vpinsrb $12, %eax, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm1
movl $2, %eax
vpinsrb $13, %eax, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm1
movl $3, %eax
vpinsrb $14, %eax, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm1
movl $4, %eax
vpinsrb $15, %eax, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vmovaps %ymm0, (%rbx)
vzeroupper
callq _exit
## Epilogue: return 0 (IR: ret i32 0).
xorl %eax, %eax
addq $40, %rsp
popq %rbx
popq %r14
retq
.cfi_endproc
.subsections_via_symbols
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment