Method | Duration (in ms) | Code size (in bytes) |
---|---|---|
sum_scalar | 12253 | 64 |
sum_vector128 | 4865 | 128 |
sum_advsimd | 4867 | 128 |
sum_sve (128-bit vectors) | 4890 | 84 |
sum_sve (256-bit vectors) | 2576 | 84 |
Last active
May 13, 2024 16:17
-
-
Save kunalspathak/93bfb317781ba4f91000df4ef8c9eeaa to your computer and use it in GitHub Desktop.
Loop performance comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Diagnostics; | |
using System.IO; | |
using System.Numerics; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.InteropServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.Arm; | |
using Xunit; | |
namespace CodeGenTests | |
{ | |
/// <summary>
/// Sums an <c>int</c> buffer with four different loop shapes (scalar, <see cref="Vector128"/>,
/// <see cref="AdvSimd"/> and <see cref="Sve"/>) and times each of them.
/// </summary>
/// <remarks>
/// Every kernel accumulates in 32-bit integers, so large inputs wrap around (assuming the default
/// unchecked compilation context). The benchmark relies on all kernels wrapping identically.
/// </remarks>
public static class SveSum
{
    /// <summary>Adds up <paramref name="length"/> ints starting at <paramref name="src"/>, one element per iteration.</summary>
    /// <param name="src">Pointer to the first element.</param>
    /// <param name="length">Number of elements to add; values &lt;= 0 yield 0.</param>
    /// <returns>The (wrapping) 32-bit sum.</returns>
    // [method: MethodImpl(MethodImplOptions.NoInlining)]
    public static unsafe int sum_scalar(int* src, int length)
    {
        int ret = 0;
        for (int i = 0; i < length; i++)
        {
            ret += src[i];
        }
        return ret;
    }

    /// <summary>Adds up the buffer four lanes at a time using the portable <see cref="Vector128"/> API.</summary>
    /// <param name="srcBytes">Pointer to the first element.</param>
    /// <param name="length">Number of elements to add.</param>
    /// <returns>The (wrapping) 32-bit sum.</returns>
    //[method: MethodImpl(MethodImplOptions.NoInlining)]
    public static unsafe int sum_vector128(int* srcBytes, int length)
    {
        Vector128<int> total = Vector128<int>.Zero;
        int* src = srcBytes;
        int vector_length = Vector128<int>.Count;

        for (int i = 0; i + vector_length <= length; i += vector_length)
        {
            total = Vector128.Add(Vector128.Load(src), total);
            src += vector_length;
        }

        // The elements that did not fill a whole vector are handled by the scalar loop.
        return Vector128.Sum(total) + sum_scalar(src, length % vector_length);
    }

    /// <summary>Adds up the buffer four lanes at a time using the Arm64 AdvSimd (NEON) intrinsics directly.</summary>
    /// <param name="srcBytes">Pointer to the first element.</param>
    /// <param name="length">Number of elements to add.</param>
    /// <returns>The (wrapping) 32-bit sum.</returns>
    //[method: MethodImpl(MethodImplOptions.NoInlining)]
    public static unsafe int sum_advsimd(int* srcBytes, int length)
    {
        Vector128<int> total = Vector128<int>.Zero;
        // Walk a local cursor (as sum_vector128 does) instead of mutating the parameter.
        int* src = srcBytes;
        int vector_length = Vector128<int>.Count;

        for (int i = 0; i + vector_length <= length; i += vector_length)
        {
            total = AdvSimd.Add(AdvSimd.LoadVector128(src), total);
            src += vector_length;
        }

        // The elements that did not fill a whole vector are handled by the scalar loop.
        return AdvSimd.Arm64.AddAcross(total).ToScalar() + sum_scalar(src, length % vector_length);
    }

    /// <summary>Adds up the buffer with a fully predicated SVE loop; no scalar tail is needed.</summary>
    /// <param name="srcBytes">Pointer to the first element.</param>
    /// <param name="length">Number of elements to add.</param>
    /// <returns>The (wrapping) 32-bit sum.</returns>
    /// <remarks>
    /// For performance reasons it may be better to use an unpredicated main loop followed by a
    /// single predicated tail iteration; that variant is not implemented here.
    /// </remarks>
    //[method: MethodImpl(MethodImplOptions.NoInlining)]
    public static unsafe int sum_sve(int* srcBytes, int length)
    {
        Vector<int> total = Vector<int>.Zero;
        int* src = srcBytes;
        int elems = (int)Sve.Count32BitElements();

        // Stop as soon as every element has been consumed. The former bound
        // (i < length + elems) always ran at least one extra iteration whose predicate
        // was all-false, and "length + elems" could overflow for very large lengths.
        for (int i = 0; i < length; i += elems)
        {
            // Lanes whose element index (i + lane) is still below length are active.
            Vector<int> mask = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, length);
            Vector<int> vec = Sve.LoadVector(mask, src);
            // Only active lanes are accumulated; inactive lanes keep their running total.
            total = Sve.ConditionalSelect(mask, Sve.Add(total, vec), total);
            src += elems;
        }

        return (int)Sve.AddAcross(total).ToScalar();
    }

    /// <summary>
    /// Runs <paramref name="kernel"/> <paramref name="loop"/> times over the same buffer, checks every
    /// result against <paramref name="expected"/> and prints the elapsed time.
    /// </summary>
    /// <returns><c>true</c> when every call returned <paramref name="expected"/>; otherwise <c>false</c>.</returns>
    private static unsafe bool TimeKernel(string name, delegate*<int*, int, int> kernel, int* src, int length, int loop, int expected)
    {
        Stopwatch sw = Stopwatch.StartNew();
        for (int inner = 0; inner < loop; inner++)
        {
            int actual = kernel(src, length);
            if (actual != expected)
            {
                Console.WriteLine($"{name} returned {actual}, expected {expected}");
                return false;
            }
        }
        sw.Stop();
        Console.WriteLine($"{name} took {sw.ElapsedMilliseconds} msec");
        return true;
    }

    /// <summary>Benchmarks every kernel the current hardware supports against the scalar result.</summary>
    /// <returns>100 when all executed kernels agree with the scalar sum; 0 on the first mismatch.</returns>
    [Fact]
    public static unsafe int TestEntryPoint()
    {
        int loop = 5000;
        int length = 5000000;

        int[] source = new int[length];
        for (int i = 0; i < source.Length; i++)
        {
            source[i] = i;
        }

        fixed (int* srcBytes = source)
        {
            // The scalar kernel is the reference every other kernel must reproduce.
            int expected = sum_scalar(srcBytes, length);

            if (!TimeKernel("sum_scalar", &sum_scalar, srcBytes, length, loop, expected))
            {
                return 0;
            }

            if (!TimeKernel("sum_vector128", &sum_vector128, srcBytes, length, loop, expected))
            {
                return 0;
            }

            // Hardware intrinsics throw PlatformNotSupportedException when the ISA is missing,
            // so only run the kernels the current machine can execute.
            if (AdvSimd.Arm64.IsSupported)
            {
                if (!TimeKernel("sum_advsimd", &sum_advsimd, srcBytes, length, loop, expected))
                {
                    return 0;
                }
            }
            else
            {
                Console.WriteLine("sum_advsimd skipped: AdvSimd.Arm64 is not supported on this machine");
            }

            if (Sve.IsSupported)
            {
                if (!TimeKernel("sum_sve", &sum_sve, srcBytes, length, loop, expected))
                {
                    return 0;
                }
            }
            else
            {
                Console.WriteLine("sum_sve skipped: Sve is not supported on this machine");
            }
        }

        return 100;
    }
}
} |
; Assembly listing for method CodeGenTests.SveSum:sum_scalar(ulong,int):int (FullOpts)
; Scalar baseline: the loop in IG03 consumes one 32-bit element per iteration.
; Emitting BLENDED_CODE for generic ARM64 - Windows
; FullOpts code
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T03] ( 3, 6 ) long -> x0 single-def
; V01 arg1 [V01,T02] ( 4, 7 ) int -> x1 single-def
; V02 loc0 [V02,T01] ( 4, 10 ) int -> x2
; V03 loc1 [V03,T00] ( 5, 17 ) int -> x3
;# V04 OutArgs [V04 ] ( 1, 1 ) struct ( 0) [sp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M36051_IG01: ;; offset=0x0000
stp fp, lr, [sp, #-0x10]!
mov fp, sp
;; size=8 bbWeight=1 PerfScore 1.50
G_M36051_IG02: ;; offset=0x0008
; Zero the accumulator (w2) and the index (w3); skip the loop when length (w1) <= 0.
mov w2, wzr
mov w3, wzr
cmp w1, #0
ble G_M36051_IG04
align [4 bytes for IG03]
align [4 bytes]
align [0 bytes]
align [0 bytes]
;; size=24 bbWeight=1 PerfScore 3.50
G_M36051_IG03: ;; offset=0x0020
; Loop body: w2 += src[w3]; w3 += 1; repeat while w3 < length.
ldr w4, [x0, w3, SXTW #2]
add w2, w4, w2
add w3, w3, #1
cmp w3, w1
blt G_M36051_IG03
;; size=20 bbWeight=4 PerfScore 22.00
G_M36051_IG04: ;; offset=0x0034
; Move the accumulated sum into the return register.
mov w0, w2
;; size=4 bbWeight=1 PerfScore 0.50
G_M36051_IG05: ;; offset=0x0038
ldp fp, lr, [sp], #0x10
ret lr
;; size=8 bbWeight=1 PerfScore 2.00
; Total bytes of code 64, prolog size 8, PerfScore 29.50, instruction count 18, allocated bytes for code 64 (MethodHash=113d732c) for method CodeGenTests.SveSum:sum_scalar(ulong,int):int (FullOpts)
; ============================================================
sum_scalar 501501
; Assembly listing for method CodeGenTests.SveSum:sum_vector128(ulong,int):int (FullOpts)
; 128-bit vector loop (IG03, four ints per iteration) followed by a call to sum_scalar for the tail (IG04).
; Emitting BLENDED_CODE for generic ARM64 - Windows
; FullOpts code
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T03] ( 3, 3 ) long -> x0 single-def
; V01 arg1 [V01,T02] ( 6, 9 ) int -> x1 single-def
; V02 loc0 [V02,T05] ( 4, 10 ) simd16 -> d8 HFA(simd16) <System.Runtime.Intrinsics.Vector128`1[int]>
; V03 loc1 [V03,T00] ( 5, 14 ) long -> x0
;* V04 loc2 [V04,T04] ( 0, 0 ) int -> zero-ref single-def
; V05 loc3 [V05,T01] ( 4, 13 ) int -> x2
;* V06 loc4 [V06 ] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) <System.Runtime.Intrinsics.Vector128`1[int]>
;# V07 OutArgs [V07 ] ( 1, 1 ) struct ( 0) [sp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M37183_IG01: ;; offset=0x0000
stp fp, lr, [sp, #-0x20]!
stp d8, d9, [sp, #0x10]
mov fp, sp
;; size=12 bbWeight=1 PerfScore 2.50
G_M37183_IG02: ;; offset=0x000C
; Zero the vector accumulator (v8) and the index (w2); skip the vector loop when length (w1) < 4.
movi v8.4s, #0
mov w2, wzr
cmp w1, #4
blt G_M37183_IG04
align [4 bytes for IG03]
align [0 bytes]
align [0 bytes]
align [0 bytes]
;; size=20 bbWeight=1 PerfScore 3.00
G_M37183_IG03: ;; offset=0x0020
; Loop body: load 16 bytes, add them lane-wise into v8, advance the pointer by 16 and the index by 4;
; repeat while index + 4 <= length.
ldr q16, [x0]
add v8.4s, v16.4s, v8.4s
add x0, x0, #16
add w2, w2, #4
add w3, w2, #4
cmp w3, w1
ble G_M37183_IG03
;; size=28 bbWeight=4 PerfScore 28.00
G_M37183_IG04: ;; offset=0x003C
; Tail: w1 = length % 4 (signed remainder built from and/negs/csneg), then sum_scalar is called
; through its indirection cell (it is not inlined here). The upper 64 bits of v8 are copied to v9
; before the call and copied back afterwards. Finally addv reduces the four lanes and the scalar
; tail result (w0) is added.
and w2, w1, #3
negs w1, w1
and w1, w1, #3
csneg w1, w2, w1, mi
movz x2, #0xD3C8 // code for CodeGenTests.SveSum:sum_scalar(ulong,int):int
movk x2, #0x3A78 LSL #16
movk x2, #0x7FFE LSL #32
ldr x2, [x2]
mov v9.d[0], v8.d[1]
blr x2
mov v8.d[1], v9.d[0]
addv s16, v8.4s
smov x1, v16.s[0]
add w0, w0, w1
;; size=56 bbWeight=1 PerfScore 13.00
G_M37183_IG05: ;; offset=0x0074
ldp d8, d9, [sp, #0x10]
ldp fp, lr, [sp], #0x20
ret lr
;; size=12 bbWeight=1 PerfScore 3.00
; Total bytes of code 128, prolog size 12, PerfScore 49.50, instruction count 35, allocated bytes for code 128 (MethodHash=8c776ec0) for method CodeGenTests.SveSum:sum_vector128(ulong,int):int (FullOpts)
; ============================================================
sum_vector128 501501
; Assembly listing for method CodeGenTests.SveSum:sum_advsimd(ulong,int):int (FullOpts)
; The instruction sequence below is the same as the one emitted for sum_vector128:
; a 128-bit vector loop (IG03) followed by a call to sum_scalar for the tail (IG04).
; Emitting BLENDED_CODE for generic ARM64 - Windows
; FullOpts code
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 6, 15 ) long -> x0
; V01 arg1 [V01,T02] ( 6, 9 ) int -> x1 single-def
; V02 loc0 [V02,T04] ( 4, 10 ) simd16 -> d8 HFA(simd16) <System.Runtime.Intrinsics.Vector128`1[int]>
;* V03 loc1 [V03 ] ( 0, 0 ) long -> zero-ref
;* V04 loc2 [V04,T03] ( 0, 0 ) int -> zero-ref single-def
; V05 loc3 [V05,T01] ( 4, 13 ) int -> x2
;* V06 loc4 [V06 ] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) <System.Runtime.Intrinsics.Vector128`1[int]>
;# V07 OutArgs [V07 ] ( 1, 1 ) struct ( 0) [sp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M49885_IG01: ;; offset=0x0000
stp fp, lr, [sp, #-0x20]!
stp d8, d9, [sp, #0x10]
mov fp, sp
;; size=12 bbWeight=1 PerfScore 2.50
G_M49885_IG02: ;; offset=0x000C
; Zero the vector accumulator (v8) and the index (w2); skip the vector loop when length (w1) < 4.
movi v8.4s, #0
mov w2, wzr
cmp w1, #4
blt G_M49885_IG04
align [4 bytes for IG03]
align [0 bytes]
align [0 bytes]
align [0 bytes]
;; size=20 bbWeight=1 PerfScore 3.00
G_M49885_IG03: ;; offset=0x0020
; Loop body: load 16 bytes, add them lane-wise into v8, advance the pointer by 16 and the index by 4;
; repeat while index + 4 <= length.
ldr q16, [x0]
add v8.4s, v16.4s, v8.4s
add x0, x0, #16
add w2, w2, #4
add w3, w2, #4
cmp w3, w1
ble G_M49885_IG03
;; size=28 bbWeight=4 PerfScore 28.00
G_M49885_IG04: ;; offset=0x003C
; Tail: w1 = length % 4 (signed remainder), then sum_scalar is called through its indirection cell.
; The upper 64 bits of v8 are copied to v9 before the call and copied back afterwards; addv then
; reduces the four lanes and the scalar tail result (w0) is added.
and w2, w1, #3
negs w1, w1
and w1, w1, #3
csneg w1, w2, w1, mi
movz x2, #0xD3C8 // code for CodeGenTests.SveSum:sum_scalar(ulong,int):int
movk x2, #0x3A78 LSL #16
movk x2, #0x7FFE LSL #32
ldr x2, [x2]
mov v9.d[0], v8.d[1]
blr x2
mov v8.d[1], v9.d[0]
addv s16, v8.4s
smov x1, v16.s[0]
add w0, w0, w1
;; size=56 bbWeight=1 PerfScore 13.00
G_M49885_IG05: ;; offset=0x0074
ldp d8, d9, [sp, #0x10]
ldp fp, lr, [sp], #0x20
ret lr
;; size=12 bbWeight=1 PerfScore 3.00
; Total bytes of code 128, prolog size 12, PerfScore 49.50, instruction count 35, allocated bytes for code 128 (MethodHash=3be93d22) for method CodeGenTests.SveSum:sum_advsimd(ulong,int):int (FullOpts)
; ============================================================
sum_advsimd 501501
; Assembly listing for method CodeGenTests.SveSum:sum_sve(ulong,int):int (FullOpts)
; Fully predicated SVE loop (IG04); there is no scalar tail and no call to sum_scalar.
; Emitting BLENDED_CODE for generic ARM64 - Windows
; FullOpts code
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T06] ( 3, 3 ) long -> x0 single-def
; V01 arg1 [V01,T03] ( 4, 7 ) int -> x1 single-def
; V02 loc0 [V02,T08] ( 5, 14 ) simd16 -> d16 HFA(simd16) ld-addr-op <System.Numerics.Vector`1[int]>
; V03 loc1 [V03,T01] ( 4, 13 ) long -> x0
; V04 loc2 [V04,T04] ( 4, 6.25) int -> x2 single-def
; V05 loc3 [V05,T00] ( 5, 17 ) int -> x3
;* V06 loc4 [V06 ] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) <System.Numerics.Vector`1[int]>
;# V07 OutArgs [V07 ] ( 1, 1 ) struct ( 0) [sp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V08 cse0 [V08,T07] ( 2, 4.25) long -> x5 hoist "CSE #04: aggressive"
; V09 cse1 [V09,T05] ( 3, 6 ) int -> x4 "CSE #01: aggressive"
; V10 cse2 [V10,T02] ( 3, 12 ) mask -> d7 "CSE #02: aggressive"
;
; Lcl frame size = 0
G_M52765_IG01: ;; offset=0x0000
stp fp, lr, [sp, #-0x10]!
mov fp, sp
;; size=8 bbWeight=1 PerfScore 1.50
G_M52765_IG02: ;; offset=0x0008
; Zero the accumulator (v16), read the number of 32-bit lanes into x2 (cntw), zero the index (w3)
; and compute the loop bound w4 = length + lanes; skip the loop when that bound is <= 0.
movi v16.4s, #0
cntw x2, all
mov w3, wzr
add w4, w1, w2
cmp w4, #0
ble G_M52765_IG05
;; size=24 bbWeight=1 PerfScore 5.00
G_M52765_IG03: ;; offset=0x0020
; Hoisted out of the loop: x5 = lanes * 4, the pointer stride in bytes.
sbfiz x5, x2, #2, #32
align [0 bytes for IG04]
align [0 bytes]
align [0 bytes]
align [0 bytes]
;; size=4 bbWeight=0.25 PerfScore 0.25
G_M52765_IG04: ;; offset=0x0024
; Loop body: a single whilelt builds the predicate for both uses in the source (CSE #02), the load
; zeroes inactive lanes (p7/z) and the add merges into the accumulator (p7/m). The pointer advances
; by x5 and the index by the lane count; the loop repeats while index < length + lanes, so the last
; iteration always runs with index >= length.
whilelt p7.s, w3, w1
ld1w { z17.s }, p7/z, [x0]
add z16.s, p7/m, z16.s, z17.s
add x0, x0, x5
add w3, w3, w2
cmp w4, w3
bgt G_M52765_IG04
;; size=28 bbWeight=4 PerfScore 54.00
G_M52765_IG05: ;; offset=0x0040
; Reduce all lanes of the accumulator with saddv under an all-true predicate and return the result in x0.
ptrue p7.s
saddv d16, p7, z16.s
umov x0, v16.d[0]
;; size=12 bbWeight=1 PerfScore 6.00
G_M52765_IG06: ;; offset=0x004C
ldp fp, lr, [sp], #0x10
ret lr
;; size=8 bbWeight=1 PerfScore 2.00
; Total bytes of code 84, prolog size 8, PerfScore 68.75, instruction count 25, allocated bytes for code 84 (MethodHash=eac331e2) for method CodeGenTests.SveSum:sum_sve(ulong,int):int (FullOpts)
; ============================================================
sum_sve 501501
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment