Created
September 17, 2018 00:52
-
-
Save lemire/d2047ce1e3b511c54bb47b779a3028f5 to your computer and use it in GitHub Desktop.
assembly from C program at https://lemire.me/blog/2018/03/24/when-shuffling-large-arrays-how-much-time-can-be-attributed-to-random-number-generation/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ---------------------------------------------------------------------------
## File prologue.  Syntax: AT&T.  Target: x86-64 Mach-O, macOS >= 10.13.
## ---------------------------------------------------------------------------
	.section	__TEXT,__text,regular,pure_instructions
	.macosx_version_min 10, 13

## ---------------------------------------------------------------------------
## Constant pool used by timestamp().
##   LCPI0_0 / LCPI0_1 : bit patterns used to turn the unsigned 64-bit value
##                       returned by clock() into a double (the two 32-bit
##                       halves are glued under the exponents of 2^52 and
##                       2^84, then those biases are subtracted again).
##   LCPI0_2           : the divisor 1.0E+6.
## ---------------------------------------------------------------------------
	.section	__TEXT,__literal16,16byte_literals
	.p2align	4
LCPI0_0:
	.long	0x43300000              ## high word of double 2^52
	.long	0x45300000              ## high word of double 2^84
	.long	0x0
	.long	0x0
LCPI0_1:
	.quad	0x4330000000000000      ## double 4503599627370496 (2^52)
	.quad	0x4530000000000000      ## double 1.9342813113834067E+25 (2^84)

	.section	__TEXT,__literal8,8byte_literals
	.p2align	3
LCPI0_2:
	.quad	0x412E848000000000      ## double 1.0E+6

## ---------------------------------------------------------------------------
## double timestamp(void)                          (symbol: _Z9timestampv)
## In:    nothing
## Out:   xmm0 = (double)clock() / 1.0E+6
## Calls: _clock
## ---------------------------------------------------------------------------
	.section	__TEXT,__text,regular,pure_instructions
	.globl	__Z9timestampv
	.p2align	4, 0x90
__Z9timestampv:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp

	callq	_clock                          ## rax = clock()

	## Unsigned 64-bit integer -> double, without a signed cvtsi2sd:
	movq	%rax, %xmm0                     ## xmm0 = [lo32, hi32, 0, 0]
	punpckldq	LCPI0_0(%rip), %xmm0    ## xmm0 = [lo32, 0x43300000, hi32, 0x45300000]
	subpd	LCPI0_1(%rip), %xmm0            ## remove the 2^52 / 2^84 biases
	haddpd	%xmm0, %xmm0                    ## low half + high half

	divsd	LCPI0_2(%rip), %xmm0            ## scale by 1 / 1.0E+6

	popq	%rbp
	retq
	.cfi_endproc
## ---------------------------------------------------------------------------
## uint32_t pcg32_random_r(pcg32_random_t *rng)
##                                (symbol: _Z14pcg32_random_rP14pcg32_random_t)
## In:    rdi = rng; the code reads two 64-bit words, 0(%rdi) and 8(%rdi)
##        (presumably the generator state and its increment - confirm against
##        the C declaration of pcg32_random_t).
## Out:   eax = 32-bit output derived from the *old* value of 0(%rdi)
## Side:  0(%rdi) = old * 0x5851F42D4C957F2D + (8(%rdi) | 1)
## Clobb: rcx, rdx, flags
## ---------------------------------------------------------------------------
	.globl	__Z14pcg32_random_rP14pcg32_random_t
	.p2align	4, 0x90
__Z14pcg32_random_rP14pcg32_random_t:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp

	## --- advance the 64-bit linear congruential state --------------------
	movq	8(%rdi), %rax                   ## rax = second word
	orq	$1, %rax                        ## force the addend to be odd
	movq	(%rdi), %rcx                    ## rcx = old first word
	movabsq	$0x5851F42D4C957F2D, %rdx       ## multiplier 6364136223846793005
	imulq	%rcx, %rdx
	addq	%rdx, %rax
	movq	%rax, (%rdi)                    ## write the new state back

	## --- output permutation on the old state ----------------------------
	movq	%rcx, %rax
	shrq	$18, %rax
	xorq	%rcx, %rax
	shrq	$27, %rax                       ## eax = ((old >> 18) ^ old) >> 27
	shrq	$59, %rcx                       ## cl  = old >> 59 (rotate count)
	movl	%eax, %edx
	shrl	%cl, %edx                       ## edx = x >> rot
	negl	%ecx                            ## cl  = -rot (hardware masks to 5 bits)
	shll	%cl, %eax                       ## eax = x << (-rot & 31)
	orl	%edx, %eax                      ## eax = rotate-right(x, rot)

	popq	%rbp
	retq
	.cfi_endproc
## ---------------------------------------------------------------------------
## uint32_t random_bounded(pcg32_random_t *rng, uint32_t range)
##                               (symbol: _Z14random_boundedP14pcg32_random_tj)
## In:    rdi = rng (two 64-bit words at 0(%rdi) and 8(%rdi) are used)
##        esi = range
## Out:   eax = high 32 bits of (random32 * range), i.e. a value in [0, range)
##        obtained by multiply-and-shift with no rejection step.
## Side:  0(%rdi) advanced exactly as in pcg32_random_r (the body is inlined).
## Clobb: rcx, rdx, flags
## ---------------------------------------------------------------------------
	.globl	__Z14random_boundedP14pcg32_random_tj
	.p2align	4, 0x90
__Z14random_boundedP14pcg32_random_tj:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp

	## --- advance the generator -------------------------------------------
	movq	8(%rdi), %rax
	orq	$1, %rax                        ## addend forced odd
	movq	(%rdi), %rcx                    ## rcx = old state
	movabsq	$0x5851F42D4C957F2D, %rdx       ## multiplier 6364136223846793005
	imulq	%rcx, %rdx
	addq	%rdx, %rax
	movq	%rax, (%rdi)

	## --- 32-bit output from the old state, built in edx ------------------
	movq	%rcx, %rdx
	shrq	$18, %rdx
	xorq	%rcx, %rdx
	shrq	$27, %rdx
	shrq	$59, %rcx                       ## cl = rotate count
	movl	%edx, %eax
	shrl	%cl, %eax
	negl	%ecx
	shll	%cl, %edx
	orl	%eax, %edx                      ## edx = random32 (upper half of rdx is zero)

	## --- scale into [0, range) -------------------------------------------
	movl	%esi, %eax                      ## zero-extend range to 64 bits
	imulq	%rdx, %rax                      ## 64-bit product random32 * range
	shrq	$32, %rax                       ## keep the high 32 bits

	popq	%rbp
	retq
	.cfi_endproc
## ---------------------------------------------------------------------------
## void swap(int *a, int *b)                          (symbol: _Z4swapPiS_)
## In:    rdi = a, rsi = b
## Out:   nothing; the two 32-bit values at *a and *b are exchanged.
##        Both loads are done before either store, so a == b is harmless.
## Clobb: eax, ecx
## ---------------------------------------------------------------------------
	.globl	__Z4swapPiS_
	.p2align	4, 0x90
__Z4swapPiS_:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp

	movl	(%rsi), %ecx                    ## ecx = *b
	movl	(%rdi), %eax                    ## eax = *a
	movl	%ecx, (%rdi)                    ## *a = old *b
	movl	%eax, (%rsi)                    ## *b = old *a

	popq	%rbp
	retq
	.cfi_endproc
## ---------------------------------------------------------------------------
## uint32_t java_random_bounded32(pcg32_random_t *rng, uint32_t bound)
##                        (symbol: _Z21java_random_bounded32P14pcg32_random_tj)
## In:    rdi = rng (two 64-bit words at 0(%rdi) and 8(%rdi) are used)
##        esi = bound; used as a divisor, so bound == 0 faults in divl
## Out:   eax = r % bound for the first generated r that passes the
##        rejection test below
## Side:  0(%rdi) is written once, on exit, with the final generator state
## Clobb: rcx, rdx, r8, r9, r10, r11, flags (rbx is saved and restored)
##
## Register roles inside the loop:
##   r9  = generator state          r10  = addend (8(%rdi) | 1)
##   rbx = multiplier               r11d = -bound  (== 2^32 - bound)
##   r8d = current 32-bit random    edx  = r8d % bound
## ---------------------------------------------------------------------------
	.globl	__Z21java_random_bounded32P14pcg32_random_tj
	.p2align	4, 0x90
__Z21java_random_bounded32P14pcg32_random_tj:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%rbx
	.cfi_offset %rbx, -24

	movabsq	$0x5851F42D4C957F2D, %rbx       ## multiplier 6364136223846793005
	movl	%esi, %r11d
	negl	%r11d                           ## r11d = -bound
	movq	8(%rdi), %r10
	orq	$1, %r10                        ## addend forced odd
	movq	(%rdi), %r9                     ## r9 = current state

	.p2align	4, 0x90
Ljrb32_retry:
	## Step the generator; rcx keeps the pre-step state for the output.
	movq	%r9, %rcx
	imulq	%rbx, %r9
	addq	%r10, %r9

	## r8d = rotate-right(((old >> 18) ^ old) >> 27, old >> 59)
	movq	%rcx, %r8
	shrq	$18, %r8
	xorq	%rcx, %r8
	shrq	$27, %r8
	shrq	$59, %rcx
	movl	%r8d, %eax
	shrl	%cl, %eax
	negl	%ecx
	shll	%cl, %r8d
	orl	%eax, %r8d

	## edx = r8d % bound (edx must be zero before the unsigned divide).
	xorl	%edx, %edx
	movl	%r8d, %eax
	divl	%esi

	## Reject while (r - r % bound) > -bound, compared as unsigned values.
	subl	%edx, %r8d
	cmpl	%r11d, %r8d
	ja	Ljrb32_retry

	movq	%r9, (%rdi)                     ## commit the generator state
	movl	%edx, %eax                      ## return the remainder
	popq	%rbx
	popq	%rbp
	retq
	.cfi_endproc
## ---------------------------------------------------------------------------
## void shuffle_java(int *storage, size_t size)    (symbol: _Z12shuffle_javaPim)
## In:    rdi = storage, rsi = size
## Out:   nothing; storage[0 .. size-1] is permuted in place
## Clobb: rax, rcx, rdx, rsi, r8, r9, r10, r11, flags
##        (rbx and r14 are saved and restored)
##
## Notes grounded in the code below:
##   * Only the low 32 bits of size are used, as a *signed* value
##     (cmpl/jl, movslq); a size whose low 32 bits are < 2 as a signed int
##     makes the function return without touching storage.
##   * The generator state and addend are immediates loaded on every call
##     and are never stored to memory, so each call consumes the same
##     random sequence.
##
## Register roles:
##   r8  = i, counts down from size to 2     r14 = i - 1
##   r9  = generator state                   r10 = multiplier
##   r11 = addend                            ebx = -i (rejection threshold)
## ---------------------------------------------------------------------------
	.globl	__Z12shuffle_javaPim
	.p2align	4, 0x90
__Z12shuffle_javaPim:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%r14
	pushq	%rbx
	.cfi_offset %rbx, -32
	.cfi_offset %r14, -24

	cmpl	$2, %esi
	jl	Lsj_done                        ## fewer than two elements: nothing to do

	movabsq	$0x5851F42D4C957F2D, %r10       ## multiplier 6364136223846793005
	movabsq	$0xDA3E39CB94B95BDB, %r11       ## addend     -2720673578348880933
	movabsq	$0x853C49E6748FEA9B, %r9        ## seed state -8846114313915602277
	movslq	%esi, %r8                       ## i = (int)size

	.p2align	4, 0x90
Lsj_outer:
	leaq	-1(%r8), %r14                   ## r14 = i - 1 (next value of i)
	movl	%r8d, %ebx
	negl	%ebx                            ## ebx = -i

	.p2align	4, 0x90
Lsj_retry:
	## Step the generator; rcx keeps the pre-step state for the output.
	movq	%r9, %rcx
	imulq	%r10, %r9
	addq	%r11, %r9

	## esi = rotate-right(((old >> 18) ^ old) >> 27, old >> 59)
	movq	%rcx, %rsi
	shrq	$18, %rsi
	xorq	%rcx, %rsi
	shrq	$27, %rsi
	shrq	$59, %rcx
	movl	%esi, %eax
	shrl	%cl, %eax
	negl	%ecx
	shll	%cl, %esi
	orl	%eax, %esi

	## edx = esi % i; reject while (esi - edx) > -i (unsigned compare).
	xorl	%edx, %edx
	movl	%esi, %eax
	divl	%r8d
	subl	%edx, %esi
	cmpl	%ebx, %esi
	ja	Lsj_retry

	## Exchange storage[i-1] with storage[edx].
	movl	%edx, %eax                      ## zero-extend the index into rax
	movl	-4(%rdi,%r8,4), %ecx
	movl	(%rdi,%rax,4), %edx
	movl	%edx, -4(%rdi,%r8,4)
	movl	%ecx, (%rdi,%rax,4)

	cmpq	$2, %r8                         ## flags from the old i ...
	movq	%r14, %r8                       ## ... survive this mov (i = i - 1)
	jg	Lsj_outer                       ## continue while the old i > 2

Lsj_done:
	popq	%rbx
	popq	%r14
	popq	%rbp
	retq
	.cfi_endproc
## ---------------------------------------------------------------------------
## void shuffle_precomp(int *storage, size_t size, int *precomputed)
##                                          (symbol: _Z15shuffle_precompPimS_)
## In:    rdi = storage, rsi = size, rdx = precomputed
## Out:   nothing; for i = size down to 2,
##            storage[i-1] is exchanged with storage[precomputed[i-1]]
## Clobb: rax, rcx, rsi, r8, flags
##
## FIX: the index load used to be -4(%rdx,%rax,4), i.e. precomputed[i].
##      On the first iteration that read precomputed[size], one element past
##      a size-element table, and every swap used the entry produced for a
##      different position.  The table producers in main store entry i-1 for
##      position i, and main's second precomputed-shuffle loop reads
##      precomputed[i-1]; this routine now does the same.
##
## Note: only the low 32 bits of size are used, as a signed value
##       (cmpl/jl, movslq); table entries are sign-extended 32-bit indexes.
## ---------------------------------------------------------------------------
	.globl	__Z15shuffle_precompPimS_
	.p2align	4, 0x90
__Z15shuffle_precompPimS_:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp

	cmpl	$2, %esi
	jl	LBB6_3                          ## fewer than two elements: nothing to do

	movslq	%esi, %rax
	incq	%rax                            ## rax = i + 1, starting at i = size

	.p2align	4, 0x90
LBB6_2:
	movslq	-8(%rdx,%rax,4), %rcx           ## rcx = precomputed[i-1]
	movl	-8(%rdi,%rax,4), %r8d           ## r8d = storage[i-1]
	movl	(%rdi,%rcx,4), %esi             ## esi = storage[rcx]
	movl	%esi, -8(%rdi,%rax,4)           ## storage[i-1] = old storage[rcx]
	movl	%r8d, (%rdi,%rcx,4)             ## storage[rcx] = old storage[i-1]
	decq	%rax                            ## i = i - 1
	cmpq	$2, %rax
	jg	LBB6_2                          ## continue while i + 1 > 2

LBB6_3:
	popq	%rbp
	retq
	.cfi_endproc
## ---------------------------------------------------------------------------
## Constant pool used by main().
##   LCPI7_0        : lane indexes [0,1,2,3] for the vectorised fill
##   LCPI7_1..10    : per-store strides 4, 8, ..., 40 (broadcast in all lanes)
##   LCPI7_11/12    : bit patterns for the unsigned 64-bit -> double
##                    conversion of clock() results (2^52 / 2^84 trick)
##   LCPI7_13       : the divisor 1.0E+6
## ---------------------------------------------------------------------------
	.section	__TEXT,__literal16,16byte_literals
	.p2align	4
LCPI7_0:
	.long	0, 1, 2, 3
LCPI7_1:
	.long	4, 4, 4, 4
LCPI7_2:
	.long	8, 8, 8, 8
LCPI7_3:
	.long	12, 12, 12, 12
LCPI7_4:
	.long	16, 16, 16, 16
LCPI7_5:
	.long	20, 20, 20, 20
LCPI7_6:
	.long	24, 24, 24, 24
LCPI7_7:
	.long	28, 28, 28, 28
LCPI7_8:
	.long	32, 32, 32, 32
LCPI7_9:
	.long	36, 36, 36, 36
LCPI7_10:
	.long	40, 40, 40, 40
LCPI7_11:
	.long	1127219200              ## 0x43300000
	.long	1160773632              ## 0x45300000
	.long	0
	.long	0
LCPI7_12:
	.quad	4841369599423283200     ## double 4503599627370496 (2^52)
	.quad	4985484787499139072     ## double 1.9342813113834067E+25 (2^84)
	.section	__TEXT,__literal8,8byte_literals
	.p2align	3
LCPI7_13:
	.quad	4696837146684686336     ## double 1.0E+6

## ---------------------------------------------------------------------------
## int main(void)
## Out:   eax = 0
## Calls: __Znam (operator new[]), _clock, _puts, _printf
##
## Stack frame (rbp-relative; rbp is 16-byte aligned):
##   -8 .. -40  saved r15, r14, r13, r12, rbx
##   -64        16-byte spill: start time of the section being measured
##   -80        8-byte spill: pointer to the second buffer; reused as a
##              16-byte spill for the last measurement, after the pointer's
##              final reload
## Long-lived registers:
##   r14 = first buffer, 100000000 ints (the array being shuffled)
##   r12 = generator addend, r13 = generator multiplier, r15 = seed state
##
## FIX: the first precomputed-shuffle loop (LBB7_13) used to load its swap
##      index from precomputed[i] while swapping storage[i-1].  Its very
##      first load was at byte offset 400000000 of the 400000000-byte second
##      buffer, i.e. past the end, and every swap used the index generated
##      for a different position.  It now reads precomputed[i-1], the slot
##      the producer loops (LBB7_5, LBB7_17) write and the second
##      precomputed-shuffle loop (LBB7_19) already reads.
## ---------------------------------------------------------------------------
	.section	__TEXT,__text,regular,pure_instructions
	.globl	_main
	.p2align	4, 0x90
_main:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	subq	$40, %rsp                       ## 5 pushes + 40 bytes keeps rsp 16-byte aligned at calls
	.cfi_offset %rbx, -56
	.cfi_offset %r12, -48
	.cfi_offset %r13, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24

	## ---- allocate two buffers of 400000000 bytes (100000000 ints) --------
	movl	$400000000, %edi                ## imm = 0x17D78400
	callq	__Znam
	movq	%rax, %r14                      ## r14 = first buffer
	movl	$400000000, %edi                ## imm = 0x17D78400
	callq	__Znam                          ## rax = second buffer (spilled below)

	## ---- fill the first buffer with 0, 1, 2, ... (40 ints per pass) ------
	movdqa	LCPI7_0(%rip), %xmm2            ## xmm2 = [0,1,2,3]
	movl	$36, %ecx                       ## biased index: -144(%r14,%rcx,4) is element 0
	movdqa	LCPI7_1(%rip), %xmm8            ## xmm8 = [4,4,4,4]
	movdqa	LCPI7_2(%rip), %xmm9            ## xmm9 = [8,8,8,8]
	movdqa	LCPI7_3(%rip), %xmm10           ## xmm10 = [12,12,12,12]
	movdqa	LCPI7_4(%rip), %xmm11           ## xmm11 = [16,16,16,16]
	movdqa	LCPI7_5(%rip), %xmm12           ## xmm12 = [20,20,20,20]
	movdqa	LCPI7_6(%rip), %xmm5            ## xmm5 = [24,24,24,24]
	movdqa	LCPI7_7(%rip), %xmm6            ## xmm6 = [28,28,28,28]
	movdqa	LCPI7_8(%rip), %xmm7            ## xmm7 = [32,32,32,32]
	movdqa	LCPI7_9(%rip), %xmm0            ## xmm0 = [36,36,36,36]
	movdqa	LCPI7_10(%rip), %xmm1           ## xmm1 = [40,40,40,40]
	.p2align	4, 0x90
LBB7_1:
	movdqa	%xmm2, %xmm3
	paddd	%xmm8, %xmm3
	movdqu	%xmm2, -144(%r14,%rcx,4)
	movdqu	%xmm3, -128(%r14,%rcx,4)
	movdqa	%xmm2, %xmm3
	paddd	%xmm9, %xmm3
	movdqa	%xmm2, %xmm4
	paddd	%xmm10, %xmm4
	movdqu	%xmm3, -112(%r14,%rcx,4)
	movdqu	%xmm4, -96(%r14,%rcx,4)
	movdqa	%xmm2, %xmm3
	paddd	%xmm11, %xmm3
	movdqa	%xmm2, %xmm4
	paddd	%xmm12, %xmm4
	movdqu	%xmm3, -80(%r14,%rcx,4)
	movdqu	%xmm4, -64(%r14,%rcx,4)
	movdqa	%xmm2, %xmm3
	paddd	%xmm5, %xmm3
	movdqa	%xmm2, %xmm4
	paddd	%xmm6, %xmm4
	movdqu	%xmm3, -48(%r14,%rcx,4)
	movdqu	%xmm4, -32(%r14,%rcx,4)
	movdqa	%xmm2, %xmm3
	paddd	%xmm7, %xmm3
	movdqa	%xmm2, %xmm4
	paddd	%xmm0, %xmm4
	movdqu	%xmm3, -16(%r14,%rcx,4)
	movdqu	%xmm4, (%r14,%rcx,4)
	paddd	%xmm1, %xmm2                    ## advance the lane indexes by 40
	addq	$40, %rcx
	cmpq	$100000036, %rcx                ## imm = 0x5F5E124
	jne	LBB7_1

	## ---- same fill again, still through r14 (the first buffer) ----------
	movq	%rax, -80(%rbp)                 ## 8-byte spill: second buffer pointer
	movdqa	LCPI7_0(%rip), %xmm2            ## xmm2 = [0,1,2,3]
	movl	$36, %eax
	.p2align	4, 0x90
LBB7_3:
	movdqa	%xmm2, %xmm3
	paddd	%xmm8, %xmm3
	movdqu	%xmm2, -144(%r14,%rax,4)
	movdqu	%xmm3, -128(%r14,%rax,4)
	movdqa	%xmm2, %xmm3
	paddd	%xmm9, %xmm3
	movdqa	%xmm2, %xmm4
	paddd	%xmm10, %xmm4
	movdqu	%xmm3, -112(%r14,%rax,4)
	movdqu	%xmm4, -96(%r14,%rax,4)
	movdqa	%xmm2, %xmm3
	paddd	%xmm11, %xmm3
	movdqa	%xmm2, %xmm4
	paddd	%xmm12, %xmm4
	movdqu	%xmm3, -80(%r14,%rax,4)
	movdqu	%xmm4, -64(%r14,%rax,4)
	movdqa	%xmm2, %xmm3
	paddd	%xmm5, %xmm3
	movdqa	%xmm2, %xmm4
	paddd	%xmm6, %xmm4
	movdqu	%xmm3, -48(%r14,%rax,4)
	movdqu	%xmm4, -32(%r14,%rax,4)
	movdqa	%xmm2, %xmm3
	paddd	%xmm7, %xmm3
	movdqa	%xmm2, %xmm4
	paddd	%xmm0, %xmm4
	movdqu	%xmm3, -16(%r14,%rax,4)
	movdqu	%xmm4, (%r14,%rax,4)
	paddd	%xmm1, %xmm2
	addq	$40, %rax
	cmpq	$100000036, %rax                ## imm = 0x5F5E124
	jne	LBB7_3

	## ---- precompute rejection-sampled indexes into the second buffer ----
	## For i = 100000000 down to 2: second[i-1] = r % i for the first
	## generated r that passes the rejection test.
	movabsq	$-2720673578348880933, %r12     ## addend     0xDA3E39CB94B95BDB
	movabsq	$-8846114313915602277, %r15     ## seed state 0x853C49E6748FEA9B
	movabsq	$6364136223846793005, %r13      ## multiplier 0x5851F42D4C957F2D
	callq	_clock                          ## result is not used
	movl	$100000000, %r8d                ## i; imm = 0x5F5E100
	movq	%r15, %rbx                      ## rbx = working generator state
	movq	-80(%rbp), %r9                  ## r9 = second buffer
	.p2align	4, 0x90
LBB7_5:
	movl	%r8d, %edi
	negl	%edi                            ## edi = -i (rejection threshold)
	.p2align	4, 0x90
LBB7_6:
	movq	%rbx, %rcx                      ## rcx = state before the step
	imulq	%r13, %rbx
	addq	%r12, %rbx
	movq	%rcx, %rsi
	shrq	$18, %rsi
	xorq	%rcx, %rsi
	shrq	$27, %rsi
	shrq	$59, %rcx                       ## cl = rotate count
	movl	%esi, %eax
	shrl	%cl, %eax
	negl	%ecx
	shll	%cl, %esi
	orl	%eax, %esi                      ## esi = 32-bit random value
	xorl	%edx, %edx                      ## clear edx for the unsigned divide
	movl	%esi, %eax
	divl	%r8d                            ## edx = esi % i
	subl	%edx, %esi
	cmpl	%edi, %esi
	ja	LBB7_6                          ## reject while (r - r % i) > -i
	movl	%edx, -4(%r9,%r8,4)             ## second[i-1] = index
	cmpq	$2, %r8
	leaq	-1(%r8), %r8                    ## i--, lea leaves the flags alone
	ja	LBB7_5

	## ---- banners ----------------------------------------------------------
	callq	_clock                          ## result is not used
	leaq	L_str(%rip), %rdi
	callq	_puts
	leaq	L_str.9(%rip), %rdi
	callq	_puts

	## ---- timed: shuffle the first buffer with rejection-sampled indexes --
	callq	_clock
	movq	%rax, %xmm0
	punpckldq	LCPI7_11(%rip), %xmm0   ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
	subpd	LCPI7_12(%rip), %xmm0
	haddpd	%xmm0, %xmm0                    ## xmm0 = (double)clock(), start time
	movl	$100000000, %r8d                ## i; imm = 0x5F5E100
	movq	%r15, %rbx                      ## restart the generator from the seed
	.p2align	4, 0x90
LBB7_9:
	movl	%r8d, %edi
	negl	%edi                            ## edi = -i
	.p2align	4, 0x90
LBB7_10:
	movq	%rbx, %rcx
	imulq	%r13, %rbx
	addq	%r12, %rbx
	movq	%rcx, %rsi
	shrq	$18, %rsi
	xorq	%rcx, %rsi
	shrq	$27, %rsi
	shrq	$59, %rcx
	movl	%esi, %eax
	shrl	%cl, %eax
	negl	%ecx
	shll	%cl, %esi
	orl	%eax, %esi
	xorl	%edx, %edx
	movl	%esi, %eax
	divl	%r8d
	subl	%edx, %esi
	cmpl	%edi, %esi
	ja	LBB7_10
	movl	%edx, %eax                      ## zero-extend the index
	movl	-4(%r14,%r8,4), %ecx            ## swap first[i-1] <-> first[index]
	movl	(%r14,%rax,4), %edx
	movl	%edx, -4(%r14,%r8,4)
	movl	%ecx, (%r14,%rax,4)
	cmpq	$2, %r8
	leaq	-1(%r8), %r8
	ja	LBB7_9

	divsd	LCPI7_13(%rip), %xmm0           ## start time / 1.0E+6
	movapd	%xmm0, -64(%rbp)                ## 16-byte spill
	callq	_clock
	movq	%rax, %xmm0
	punpckldq	LCPI7_11(%rip), %xmm0   ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
	subpd	LCPI7_12(%rip), %xmm0
	haddpd	%xmm0, %xmm0
	divsd	LCPI7_13(%rip), %xmm0
	subsd	-64(%rbp), %xmm0                ## elapsed = end - start
	leaq	L_.str.2(%rip), %rdi
	movb	$1, %al                         ## one vector register argument
	callq	_printf

	## ---- timed: shuffle using the precomputed indexes (3 swaps per pass) --
	callq	_clock
	movq	%rax, %xmm1
	punpckldq	LCPI7_11(%rip), %xmm1   ## xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
	subpd	LCPI7_12(%rip), %xmm1
	haddpd	%xmm1, %xmm1                    ## xmm1 = start time
	movl	$100000001, %eax                ## rax = i + 1; imm = 0x5F5E101
	movq	-80(%rbp), %rdi                 ## rdi = second buffer (index table)
	.p2align	4, 0x90
LBB7_13:
	movslq	-8(%rdi,%rax,4), %rcx           ## index = second[i-1]  (fixed: was second[i])
	movl	-8(%r14,%rax,4), %edx           ## swap first[i-1] <-> first[index]
	movl	(%r14,%rcx,4), %esi
	movl	%esi, -8(%r14,%rax,4)
	movl	%edx, (%r14,%rcx,4)
	movslq	-12(%rdi,%rax,4), %rcx          ## index = second[i-2]  (fixed: was second[i-1])
	movl	-12(%r14,%rax,4), %edx
	movl	(%r14,%rcx,4), %esi
	movl	%esi, -12(%r14,%rax,4)
	movl	%edx, (%r14,%rcx,4)
	movslq	-16(%rdi,%rax,4), %rcx          ## index = second[i-3]  (fixed: was second[i-2])
	movl	-16(%r14,%rax,4), %edx
	movl	(%r14,%rcx,4), %esi
	movl	%esi, -16(%r14,%rax,4)
	movl	%edx, (%r14,%rcx,4)
	addq	$-3, %rax
	cmpq	$2, %rax
	ja	LBB7_13

	movsd	LCPI7_13(%rip), %xmm0           ## xmm0 = mem[0],zero
	divsd	%xmm0, %xmm1
	movapd	%xmm1, -64(%rbp)                ## 16-byte spill
	callq	_clock
	movq	%rax, %xmm0
	movdqa	LCPI7_11(%rip), %xmm1           ## xmm1 = [1127219200,1160773632,0,0]
	punpckldq	%xmm1, %xmm0            ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	movapd	LCPI7_12(%rip), %xmm1           ## xmm1 = [4.503600e+15,1.934281e+25]
	subpd	%xmm1, %xmm0
	haddpd	%xmm0, %xmm0
	divsd	LCPI7_13(%rip), %xmm0
	subsd	-64(%rbp), %xmm0                ## elapsed = end - start
	leaq	L_.str.3(%rip), %rdi
	movb	$1, %al
	callq	_printf
	leaq	L_str.10(%rip), %rdi
	callq	_puts

	## ---- timed: shuffle with multiply-shift indexes (no rejection) -------
	callq	_clock
	movq	%rax, %xmm1
	punpckldq	LCPI7_11(%rip), %xmm1   ## xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
	subpd	LCPI7_12(%rip), %xmm1
	haddpd	%xmm1, %xmm1
	movl	$100000001, %eax                ## rax = i + 1; imm = 0x5F5E101
	movq	%r15, %rcx                      ## rcx = state used for this output
	movq	%rcx, %rdx                      ## rdx = state being advanced
	.p2align	4, 0x90
LBB7_15:
	leaq	-1(%rax), %rsi                  ## rsi = i
	imulq	%r13, %rdx
	addq	%r12, %rdx
	movq	%rcx, %rdi
	shrq	$18, %rdi
	xorq	%rcx, %rdi
	shrq	$27, %rdi
	shrq	$59, %rcx
	movl	%edi, %ebx
	shrl	%cl, %ebx
	negl	%ecx
	shll	%cl, %edi
	orl	%ebx, %edi                      ## edi = 32-bit random value
	imulq	%rsi, %rdi
	sarq	$32, %rdi                       ## index = high 32 bits of random * i
	movl	-8(%r14,%rax,4), %ecx           ## swap first[i-1] <-> first[index]
	movl	(%r14,%rdi,4), %ebx
	movl	%ebx, -8(%r14,%rax,4)
	movl	%ecx, (%r14,%rdi,4)
	cmpq	$2, %rsi
	movq	%rsi, %rax                      ## the two movs leave the flags alone
	movq	%rdx, %rcx
	ja	LBB7_15

	movsd	LCPI7_13(%rip), %xmm0           ## xmm0 = mem[0],zero
	divsd	%xmm0, %xmm1
	movapd	%xmm1, -64(%rbp)                ## 16-byte spill
	callq	_clock
	movq	%rax, %xmm0
	movdqa	LCPI7_11(%rip), %xmm1           ## xmm1 = [1127219200,1160773632,0,0]
	punpckldq	%xmm1, %xmm0            ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	movapd	LCPI7_12(%rip), %xmm1           ## xmm1 = [4.503600e+15,1.934281e+25]
	subpd	%xmm1, %xmm0
	haddpd	%xmm0, %xmm0
	divsd	LCPI7_13(%rip), %xmm0
	subsd	-64(%rbp), %xmm0
	leaq	L_.str.5(%rip), %rdi
	movb	$1, %al
	callq	_printf

	## ---- timed: fill the second buffer with multiply-shift indexes -------
	callq	_clock
	movq	%rax, %xmm1
	punpckldq	LCPI7_11(%rip), %xmm1   ## xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
	subpd	LCPI7_12(%rip), %xmm1
	haddpd	%xmm1, %xmm1
	movl	$100000001, %edx                ## rdx = i + 1; imm = 0x5F5E101
	movq	%r15, %rax                      ## rax = state being advanced
	movq	-80(%rbp), %r8                  ## r8 = second buffer
	.p2align	4, 0x90
LBB7_17:
	leaq	-1(%rdx), %rsi                  ## rsi = i
	imulq	%r13, %rax
	addq	%r12, %rax
	movq	%r15, %rdi                      ## r15 doubles as the current state here
	shrq	$18, %rdi
	xorq	%r15, %rdi
	shrq	$27, %rdi
	shrq	$59, %r15
	movl	%edi, %ebx
	movl	%r15d, %ecx
	shrl	%cl, %ebx
	negl	%r15d
	movl	%r15d, %ecx
	shll	%cl, %edi
	orl	%ebx, %edi                      ## edi = 32-bit random value
	imulq	%rsi, %rdi
	shrq	$32, %rdi                       ## index = high 32 bits of random * i
	movl	%edi, -8(%r8,%rdx,4)            ## second[i-1] = index
	cmpq	$2, %rsi
	movq	%rsi, %rdx
	movq	%rax, %r15
	ja	LBB7_17

	movsd	LCPI7_13(%rip), %xmm0           ## xmm0 = mem[0],zero
	divsd	%xmm0, %xmm1
	movapd	%xmm1, -64(%rbp)                ## 16-byte spill
	callq	_clock
	movq	%rax, %xmm0
	movdqa	LCPI7_11(%rip), %xmm1           ## xmm1 = [1127219200,1160773632,0,0]
	punpckldq	%xmm1, %xmm0            ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	movapd	LCPI7_12(%rip), %xmm1           ## xmm1 = [4.503600e+15,1.934281e+25]
	subpd	%xmm1, %xmm0
	haddpd	%xmm0, %xmm0
	divsd	LCPI7_13(%rip), %xmm0
	subsd	-64(%rbp), %xmm0
	leaq	L_.str.6(%rip), %rdi
	movb	$1, %al
	callq	_printf

	## ---- timed: shuffle using the freshly precomputed indexes ------------
	callq	_clock
	movq	-80(%rbp), %rdi                 ## rdi = second buffer; last use of the slot as a pointer
	movq	%rax, %xmm1
	punpckldq	LCPI7_11(%rip), %xmm1   ## xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
	subpd	LCPI7_12(%rip), %xmm1
	haddpd	%xmm1, %xmm1
	movl	$100000001, %eax                ## rax = i + 1; imm = 0x5F5E101
	.p2align	4, 0x90
LBB7_19:
	movslq	-8(%rdi,%rax,4), %rcx           ## index = second[i-1]
	movl	-8(%r14,%rax,4), %edx           ## swap first[i-1] <-> first[index]
	movl	(%r14,%rcx,4), %esi
	movl	%esi, -8(%r14,%rax,4)
	movl	%edx, (%r14,%rcx,4)
	movslq	-12(%rdi,%rax,4), %rcx
	movl	-12(%r14,%rax,4), %edx
	movl	(%r14,%rcx,4), %esi
	movl	%esi, -12(%r14,%rax,4)
	movl	%edx, (%r14,%rcx,4)
	movslq	-16(%rdi,%rax,4), %rcx
	movl	-16(%r14,%rax,4), %edx
	movl	(%r14,%rcx,4), %esi
	movl	%esi, -16(%r14,%rax,4)
	movl	%edx, (%r14,%rcx,4)
	addq	$-3, %rax
	cmpq	$2, %rax
	ja	LBB7_19

	movsd	LCPI7_13(%rip), %xmm0           ## xmm0 = mem[0],zero
	divsd	%xmm0, %xmm1
	movapd	%xmm1, -80(%rbp)                ## 16-byte spill (overwrites the pointer slot)
	callq	_clock
	movq	%rax, %xmm0
	punpckldq	LCPI7_11(%rip), %xmm0   ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
	subpd	LCPI7_12(%rip), %xmm0
	haddpd	%xmm0, %xmm0
	divsd	LCPI7_13(%rip), %xmm0
	subsd	-80(%rbp), %xmm0
	leaq	L_.str.7(%rip), %rdi
	movb	$1, %al
	callq	_printf
	leaq	L_str.11(%rip), %rdi
	callq	_puts

	## ---- return 0 ---------------------------------------------------------
	xorl	%eax, %eax
	addq	$40, %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	retq
	.cfi_endproc
## ---------------------------------------------------------------------------
## NUL-terminated strings used by main().
##   L_.str.N : printf formats, each consuming one double (%f)
##   L_str*   : lines printed with puts
## ---------------------------------------------------------------------------
	.section	__TEXT,__cstring,cstring_literals

L_.str.2:
	.asciz	"java-bound PCG shuffle %f s \n"

L_.str.3:
	.asciz	"precomp shuffle %f s \n"

L_.str.5:
	.asciz	"biased fast PCG shuffle %f s \n"

L_.str.6:
	.asciz	"fast comp. of indexes %f s \n"

L_.str.7:
	.asciz	"precomp shuffle %f s \n"

	.p2align	4
L_str:
	.asciz	"Reproducing the Java numbers from blog post https://lemire.me/blog/2018/03/24/when-shuffling-large-arrays-how-much-time-can-be-attributed-to-random-number-generation/#comments "

	.p2align	4
L_str.9:
	.asciz	"Caveat: we use PCG instead of the LCG from Java."

	.p2align	4
L_str.10:
	.asciz	"\n\n rest is from Arseny Kapoulkine's original code: "

L_str.11:
	.asciz	"\n\n"

	.subsections_via_symbols
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment