Skip to content

Instantly share code, notes, and snippets.

@kunalspathak
Last active May 13, 2024 16:17
Show Gist options
  • Save kunalspathak/93bfb317781ba4f91000df4ef8c9eeaa to your computer and use it in GitHub Desktop.
Save kunalspathak/93bfb317781ba4f91000df4ef8c9eeaa to your computer and use it in GitHub Desktop.
Loop performance comparison
| Method | Duration (in ms) | Code size (in bytes) |
| --- | --- | --- |
| sum_scalar | 12253 | 64 |
| sum_vector128 | 4865 | 128 |
| sum_advsimd | 4867 | 128 |
| sum_sve (128-bits) | 4890 | 84 |
| sum_sve (256-bits) | 2576 | 84 |
using System;
using System.Diagnostics;
using System.IO;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using Xunit;
namespace CodeGenTests
{
public static class SveSum
{
// [method: MethodImpl(MethodImplOptions.NoInlining)]
/// <summary>
/// Adds up <paramref name="length"/> consecutive 32-bit integers starting at
/// <paramref name="src"/>, one element at a time.
/// </summary>
/// <param name="src">Pointer to the first element to read.</param>
/// <param name="length">Number of elements to read; zero or negative reads nothing.</param>
/// <returns>The sum of the elements, accumulated in an <c>int</c>.</returns>
public static unsafe int sum_scalar(int* src, int length)
{
    int total = 0;
    int index = 0;

    // Walk the buffer front to back, folding each element into the running total.
    while (index < length)
    {
        total += *(src + index);
        index++;
    }

    return total;
}
//[method: MethodImpl(MethodImplOptions.NoInlining)]
/// <summary>
/// Sums <paramref name="length"/> 32-bit integers using the portable
/// <see cref="Vector128"/> API: whole 128-bit vectors are accumulated lane-wise,
/// and the elements left over after the last whole vector are added by
/// <c>sum_scalar</c>.
/// </summary>
/// <param name="srcBytes">Pointer to the first element to read.</param>
/// <param name="length">Number of elements to read.</param>
/// <returns>The horizontal sum of the vector accumulator plus the scalar tail.</returns>
public static unsafe int sum_vector128(int* srcBytes, int length)
{
    // Four 32-bit lanes fit in one 128-bit vector.
    int lanes = Vector128<int>.Count;
    Vector128<int> accumulator = Vector128<int>.Zero;
    int* cursor = srcBytes;
    int processed = 0;

    // Consume the input one whole vector at a time.
    while (processed + lanes <= length)
    {
        accumulator = Vector128.Add(Vector128.LoadUnsafe(ref *cursor), accumulator);
        cursor += lanes;
        processed += lanes;
    }

    // Collapse the lanes, then let the scalar routine handle the remainder.
    int vectorPart = Vector128.Sum(accumulator);
    int tailPart = sum_scalar(cursor, length % lanes);
    return vectorPart + tailPart;
}
//[method: MethodImpl(MethodImplOptions.NoInlining)]
/// <summary>
/// Sums <paramref name="length"/> 32-bit integers using the Arm AdvSimd (NEON)
/// intrinsics: whole 128-bit vectors are accumulated lane-wise, and the elements
/// left over after the last whole vector are added by <c>sum_scalar</c>.
/// </summary>
/// <remarks>
/// Uses <see cref="AdvSimd"/> and <see cref="AdvSimd.Arm64"/> intrinsics directly,
/// so callers should check <c>AdvSimd.Arm64.IsSupported</c> before calling.
/// </remarks>
/// <param name="srcBytes">Pointer to the first element to read.</param>
/// <param name="length">Number of elements to read.</param>
/// <returns>The horizontal sum of the vector accumulator plus the scalar tail.</returns>
public static unsafe int sum_advsimd(int* srcBytes, int length)
{
    Vector128<int> total = Vector128<int>.Zero;

    // Walk a local cursor (as sum_vector128 does) instead of advancing the
    // parameter; previously this local was declared but never used.
    int* src = srcBytes;
    int vector_length = Vector128<int>.Count;

    for (int i = 0; i + vector_length <= length; i += vector_length)
    {
        Vector128<int> vec = AdvSimd.LoadVector128(src);
        total = AdvSimd.Add(vec, total);
        src += vector_length;
    }

    // AddAcross folds the four lanes into one; the scalar routine adds the tail.
    return AdvSimd.Arm64.AddAcross(total).ToScalar() + sum_scalar(src, length % vector_length);
}
//[method: MethodImpl(MethodImplOptions.NoInlining)]
/// <summary>
/// Sums <paramref name="length"/> 32-bit integers with a fully predicated SVE loop.
/// Each pass builds a "while less than" predicate for the lanes that are still
/// inside the input, so the final partial vector needs no separate scalar tail.
/// </summary>
/// <remarks>
/// Uses <see cref="Sve"/> intrinsics directly, so callers should check
/// <c>Sve.IsSupported</c> before calling.
/// </remarks>
/// <param name="srcBytes">Pointer to the first element to read.</param>
/// <param name="length">Number of elements to read; zero or negative reads nothing.</param>
/// <returns>The sum across all lanes of the accumulator, truncated to <c>int</c>.</returns>
public static unsafe int sum_sve(int* srcBytes, int length)
{
    Vector<int> total = Vector<int>.Zero;
    int* src = srcBytes;

    // Number of 32-bit lanes in one SVE vector on this machine.
    int elems = (int)Sve.Count32BitElements();

    // Stop as soon as every element has been covered. The previous bound,
    // `i < length + elems`, always ran one extra pass with an all-false
    // predicate and could overflow for lengths close to int.MaxValue.
    for (int i = 0; i < length; i += elems)
    {
        // Lanes whose index (i + lane) is below length are active; build the
        // predicate once and reuse it for both the load and the accumulate.
        Vector<int> mask = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, length);
        Vector<int> vec = Sve.LoadVector(mask, src);

        // Only active lanes take the new sum; inactive lanes keep their value.
        total = Sve.ConditionalSelect(mask, Sve.Add(total, vec), total);
        src += elems;
    }

    return (int)Sve.AddAcross(total).ToScalar();
}
//// For performance reasons, it may be better to use an unpredicated loop, followed by a tail.
// [method: MethodImpl(MethodImplOptions.NoInlining)]
//public static unsafe int sum_sve_unpredicated_loop(ref int* srcBytes, int length)
//{
// Vector<int> total = new Vector<int>(0);
// int* src = srcBytes;
// int elems = (int)Sve.Count32BitElements();
// Vector<int> vec, pred;
// int i = 0;
// for (i = 0; i < length; i += elems)
// {
// pred = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, length);
// vec = Sve.LoadVector(pred, src);
// total = Sve.ConditionalSelect(pred, Sve.Add(total, vec), total);
// }
// // Predicated tail.
// pred = Sve.CreateWhileLessThanMask32Bit(i, length);
// vec = Sve.LoadUnsafe(pred, ref *src, i);
// total = Sve.MergeAdd(pred, vec, total);
// return Sve.AddAcross(total).ToScalar();
//}
/// <summary>
/// Benchmarks the four summation routines over the same buffer and checks that
/// each one agrees with the scalar reference result.
/// </summary>
/// <remarks>
/// The iteration count and buffer length are hard-coded. Each routine is timed
/// in its own inline loop so that every measurement uses a direct call.
/// The AdvSimd and SVE phases are skipped when the hardware does not report
/// support for them, instead of throwing from the intrinsic calls.
/// NOTE(review): 100 is presumably the "pass" exit code expected by the test
/// harness that runs this entry point - confirm.
/// </remarks>
/// <returns>100 when every executed routine matches the reference sum; 0 on the first mismatch.</returns>
[Fact]
public static unsafe int TestEntryPoint()
{
    const int Pass = 100;
    const int Fail = 0;

    int loop = 5000;
    int length = 5000000;

    // Fill the buffer with 0, 1, 2, ... so every routine sums the same data.
    int[] source = new int[length];
    for (int i = 0; i < source.Length; i++)
    {
        source[i] = i;
    }

    Stopwatch sw = new Stopwatch();
    fixed (int* srcBytes = source)
    {
        // Reference value that every routine must reproduce.
        int expected = sum_scalar(srcBytes, length);

        sw.Restart();
        for (int inner = 0; inner < loop; inner++)
        {
            if (sum_scalar(srcBytes, length) != expected)
            {
                return Fail;
            }
        }
        sw.Stop();
        Console.WriteLine($"sum_scalar took {sw.ElapsedMilliseconds} msec");

        sw.Restart();
        for (int inner = 0; inner < loop; inner++)
        {
            if (sum_vector128(srcBytes, length) != expected)
            {
                return Fail;
            }
        }
        sw.Stop();
        Console.WriteLine($"sum_vector128 took {sw.ElapsedMilliseconds} msec");

        // sum_advsimd uses AdvSimd and AdvSimd.Arm64 intrinsics.
        if (AdvSimd.Arm64.IsSupported)
        {
            sw.Restart();
            for (int inner = 0; inner < loop; inner++)
            {
                if (sum_advsimd(srcBytes, length) != expected)
                {
                    return Fail;
                }
            }
            sw.Stop();
            Console.WriteLine($"sum_advsimd took {sw.ElapsedMilliseconds} msec");
        }
        else
        {
            Console.WriteLine("sum_advsimd skipped: AdvSimd.Arm64 is not supported on this machine");
        }

        // sum_sve uses SVE intrinsics.
        if (Sve.IsSupported)
        {
            sw.Restart();
            for (int inner = 0; inner < loop; inner++)
            {
                if (sum_sve(srcBytes, length) != expected)
                {
                    return Fail;
                }
            }
            sw.Stop();
            Console.WriteLine($"sum_sve took {sw.ElapsedMilliseconds} msec");
        }
        else
        {
            Console.WriteLine("sum_sve skipped: Sve is not supported on this machine");
        }
    }

    return Pass;
}
}
}
; Assembly listing for method CodeGenTests.SveSum:sum_scalar(ulong,int):int (FullOpts)
; Emitting BLENDED_CODE for generic ARM64 - Windows
; FullOpts code
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; Final local variable assignments
;
;  V00 arg0         [V00,T03] (  3,  6   )    long  ->   x0         single-def
;  V01 arg1         [V01,T02] (  4,  7   )     int  ->   x1         single-def
;  V02 loc0         [V02,T01] (  4, 10   )     int  ->   x2        
;  V03 loc1         [V03,T00] (  5, 17   )     int  ->   x3        
;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [sp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

; Register roles, per the local variable table above:
;   x0 = arg0 (src pointer), w1 = arg1 (length), w2 = loc0 (running sum), w3 = loc1 (loop index)
;
; IG01 - prolog: push fp/lr and establish the frame pointer.
G_M36051_IG01:  ;; offset=0x0000
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
; IG02 - zero the running sum and the index, then skip the loop entirely when length <= 0.
G_M36051_IG02:  ;; offset=0x0008
            mov     w2, wzr
            mov     w3, wzr
            cmp     w1, #0
            ble     G_M36051_IG04
            align   [4 bytes for IG03]
            align   [4 bytes]
            align   [0 bytes]
            align   [0 bytes]
						;; size=24 bbWeight=1 PerfScore 3.50
; IG03 - loop body: load the 32-bit element at x0 + (sign-extended index << 2),
; add it to the running sum, bump the index, and repeat while index < length.
G_M36051_IG03:  ;; offset=0x0020
            ldr     w4, [x0, w3, SXTW #2]
            add     w2, w4, w2
            add     w3, w3, #1
            cmp     w3, w1
            blt     G_M36051_IG03
						;; size=20 bbWeight=4 PerfScore 22.00
; IG04 - move the running sum into the return register w0.
G_M36051_IG04:  ;; offset=0x0034
            mov     w0, w2
						;; size=4 bbWeight=1 PerfScore 0.50
; IG05 - epilog: restore fp/lr and return.
G_M36051_IG05:  ;; offset=0x0038
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 64, prolog size 8, PerfScore 29.50, instruction count 18, allocated bytes for code 64 (MethodHash=113d732c) for method CodeGenTests.SveSum:sum_scalar(ulong,int):int (FullOpts)
; ============================================================

sum_scalar 501501
; Assembly listing for method CodeGenTests.SveSum:sum_vector128(ulong,int):int (FullOpts)
; Emitting BLENDED_CODE for generic ARM64 - Windows
; FullOpts code
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; Final local variable assignments
;
;  V00 arg0         [V00,T03] (  3,  3   )    long  ->   x0         single-def
;  V01 arg1         [V01,T02] (  6,  9   )     int  ->   x1         single-def
;  V02 loc0         [V02,T05] (  4, 10   )  simd16  ->   d8         HFA(simd16)  <System.Runtime.Intrinsics.Vector128`1[int]>
;  V03 loc1         [V03,T00] (  5, 14   )    long  ->   x0        
;* V04 loc2         [V04,T04] (  0,  0   )     int  ->  zero-ref    single-def
;  V05 loc3         [V05,T01] (  4, 13   )     int  ->   x2        
;* V06 loc4         [V06    ] (  0,  0   )  simd16  ->  zero-ref    HFA(simd16)  <System.Runtime.Intrinsics.Vector128`1[int]>
;# V07 OutArgs      [V07    ] (  1,  1   )  struct ( 0) [sp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

; Register roles, per the local variable table above:
;   x0 = arg0 / loc1 (source cursor), w1 = arg1 (length), v8 = loc0 (vector accumulator),
;   w2 = loc3 (loop index)
;
; IG01 - prolog: push fp/lr, save d8/d9 (both are written below), set the frame pointer.
G_M37183_IG01:  ;; offset=0x0000
            stp     fp, lr, [sp, #-0x20]!
            stp     d8, d9, [sp, #0x10]
            mov     fp, sp
						;; size=12 bbWeight=1 PerfScore 2.50
; IG02 - zero the accumulator and the index; skip the vector loop when length < 4.
G_M37183_IG02:  ;; offset=0x000C
            movi    v8.4s, #0
            mov     w2, wzr
            cmp     w1, #4
            blt     G_M37183_IG04
            align   [4 bytes for IG03]
            align   [0 bytes]
            align   [0 bytes]
            align   [0 bytes]
						;; size=20 bbWeight=1 PerfScore 3.00
; IG03 - vector loop: load 16 bytes, add the four lanes into the accumulator,
; advance the cursor by 16 bytes and the index by 4; repeat while index + 4 <= length.
G_M37183_IG03:  ;; offset=0x0020
            ldr     q16, [x0]
            add     v8.4s, v16.4s, v8.4s
            add     x0, x0, #16
            add     w2, w2, #4
            add     w3, w2, #4
            cmp     w3, w1
            ble     G_M37183_IG03
						;; size=28 bbWeight=4 PerfScore 28.00
; IG04 - tail and reduction:
;   * and/negs/and/csneg leave the signed remainder length % 4 in w1 (second call argument);
;   * movz/movk/ldr load the target of sum_scalar, which blr then calls with x0 = cursor;
;   * the upper half of v8 is parked in v9.d[0] around the call and restored afterwards
;     (presumably because only the low 64 bits of v8/v9 survive a call - confirm against the ABI);
;   * addv sums the four accumulator lanes, and the result is added to sum_scalar's return value.
G_M37183_IG04:  ;; offset=0x003C
            and     w2, w1, #3
            negs    w1, w1
            and     w1, w1, #3
            csneg   w1, w2, w1, mi
            movz    x2, #0xD3C8      // code for CodeGenTests.SveSum:sum_scalar(ulong,int):int
            movk    x2, #0x3A78 LSL #16
            movk    x2, #0x7FFE LSL #32
            ldr     x2, [x2]
            mov     v9.d[0], v8.d[1]
            blr     x2
            mov     v8.d[1], v9.d[0]
            addv    s16, v8.4s
            smov    x1, v16.s[0]
            add     w0, w0, w1
						;; size=56 bbWeight=1 PerfScore 13.00
; IG05 - epilog: restore d8/d9 and fp/lr, then return.
G_M37183_IG05:  ;; offset=0x0074
            ldp     d8, d9, [sp, #0x10]
            ldp     fp, lr, [sp], #0x20
            ret     lr
						;; size=12 bbWeight=1 PerfScore 3.00
						;; size=12 bbWeight=1 PerfScore 3.00

; Total bytes of code 128, prolog size 12, PerfScore 49.50, instruction count 35, allocated bytes for code 128 (MethodHash=8c776ec0) for method CodeGenTests.SveSum:sum_vector128(ulong,int):int (FullOpts)
; ============================================================

sum_vector128 501501
; Assembly listing for method CodeGenTests.SveSum:sum_advsimd(ulong,int):int (FullOpts)
; Emitting BLENDED_CODE for generic ARM64 - Windows
; FullOpts code
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  6, 15   )    long  ->   x0        
;  V01 arg1         [V01,T02] (  6,  9   )     int  ->   x1         single-def
;  V02 loc0         [V02,T04] (  4, 10   )  simd16  ->   d8         HFA(simd16)  <System.Runtime.Intrinsics.Vector128`1[int]>
;* V03 loc1         [V03    ] (  0,  0   )    long  ->  zero-ref   
;* V04 loc2         [V04,T03] (  0,  0   )     int  ->  zero-ref    single-def
;  V05 loc3         [V05,T01] (  4, 13   )     int  ->   x2        
;* V06 loc4         [V06    ] (  0,  0   )  simd16  ->  zero-ref    HFA(simd16)  <System.Runtime.Intrinsics.Vector128`1[int]>
;# V07 OutArgs      [V07    ] (  1,  1   )  struct ( 0) [sp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

; Register roles, per the local variable table above:
;   x0 = arg0 (source pointer, advanced in place), w1 = arg1 (length),
;   v8 = loc0 (vector accumulator), w2 = loc3 (loop index)
; The instruction sequence below is the same as the one emitted for sum_vector128.
;
; IG01 - prolog: push fp/lr, save d8/d9 (both are written below), set the frame pointer.
G_M49885_IG01:  ;; offset=0x0000
            stp     fp, lr, [sp, #-0x20]!
            stp     d8, d9, [sp, #0x10]
            mov     fp, sp
						;; size=12 bbWeight=1 PerfScore 2.50
; IG02 - zero the accumulator and the index; skip the vector loop when length < 4.
G_M49885_IG02:  ;; offset=0x000C
            movi    v8.4s, #0
            mov     w2, wzr
            cmp     w1, #4
            blt     G_M49885_IG04
            align   [4 bytes for IG03]
            align   [0 bytes]
            align   [0 bytes]
            align   [0 bytes]
						;; size=20 bbWeight=1 PerfScore 3.00
; IG03 - vector loop: load 16 bytes, add the four lanes into the accumulator,
; advance the pointer by 16 bytes and the index by 4; repeat while index + 4 <= length.
G_M49885_IG03:  ;; offset=0x0020
            ldr     q16, [x0]
            add     v8.4s, v16.4s, v8.4s
            add     x0, x0, #16
            add     w2, w2, #4
            add     w3, w2, #4
            cmp     w3, w1
            ble     G_M49885_IG03
						;; size=28 bbWeight=4 PerfScore 28.00
; IG04 - tail and reduction:
;   * and/negs/and/csneg leave the signed remainder length % 4 in w1 (second call argument);
;   * movz/movk/ldr load the target of sum_scalar, which blr then calls with x0 = advanced pointer;
;   * the upper half of v8 is parked in v9.d[0] around the call and restored afterwards
;     (presumably because only the low 64 bits of v8/v9 survive a call - confirm against the ABI);
;   * addv sums the four accumulator lanes, and the result is added to sum_scalar's return value.
G_M49885_IG04:  ;; offset=0x003C
            and     w2, w1, #3
            negs    w1, w1
            and     w1, w1, #3
            csneg   w1, w2, w1, mi
            movz    x2, #0xD3C8      // code for CodeGenTests.SveSum:sum_scalar(ulong,int):int
            movk    x2, #0x3A78 LSL #16
            movk    x2, #0x7FFE LSL #32
            ldr     x2, [x2]
            mov     v9.d[0], v8.d[1]
            blr     x2
            mov     v8.d[1], v9.d[0]
            addv    s16, v8.4s
            smov    x1, v16.s[0]
            add     w0, w0, w1
						;; size=56 bbWeight=1 PerfScore 13.00
; IG05 - epilog: restore d8/d9 and fp/lr, then return.
G_M49885_IG05:  ;; offset=0x0074
            ldp     d8, d9, [sp, #0x10]
            ldp     fp, lr, [sp], #0x20
            ret     lr
						;; size=12 bbWeight=1 PerfScore 3.00
						;; size=12 bbWeight=1 PerfScore 3.00

; Total bytes of code 128, prolog size 12, PerfScore 49.50, instruction count 35, allocated bytes for code 128 (MethodHash=3be93d22) for method CodeGenTests.SveSum:sum_advsimd(ulong,int):int (FullOpts)
; ============================================================

sum_advsimd 501501
; Assembly listing for method CodeGenTests.SveSum:sum_sve(ulong,int):int (FullOpts)
; Emitting BLENDED_CODE for generic ARM64 - Windows
; FullOpts code
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; Final local variable assignments
;
;  V00 arg0         [V00,T06] (  3,  3   )    long  ->   x0         single-def
;  V01 arg1         [V01,T03] (  4,  7   )     int  ->   x1         single-def
;  V02 loc0         [V02,T08] (  5, 14   )  simd16  ->  d16         HFA(simd16)  ld-addr-op <System.Numerics.Vector`1[int]>
;  V03 loc1         [V03,T01] (  4, 13   )    long  ->   x0        
;  V04 loc2         [V04,T04] (  4,  6.25)     int  ->   x2         single-def
;  V05 loc3         [V05,T00] (  5, 17   )     int  ->   x3        
;* V06 loc4         [V06    ] (  0,  0   )  simd16  ->  zero-ref    HFA(simd16)  <System.Numerics.Vector`1[int]>
;# V07 OutArgs      [V07    ] (  1,  1   )  struct ( 0) [sp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;  V08 cse0         [V08,T07] (  2,  4.25)    long  ->   x5         hoist "CSE #04: aggressive"
;  V09 cse1         [V09,T05] (  3,  6   )     int  ->   x4         "CSE #01: aggressive"
;  V10 cse2         [V10,T02] (  3, 12   )    mask  ->   d7         "CSE #02: aggressive"
;
; Lcl frame size = 0

; Register roles, per the local variable table above:
;   x0 = arg0 / loc1 (source cursor), w1 = arg1 (length), z16/v16 = loc0 (vector accumulator),
;   x2 = loc2 (elems), w3 = loc3 (loop index), w4 = cse1 (length + elems),
;   x5 = cse0 (hoisted byte stride), p7 = loop predicate
;
; IG01 - prolog: push fp/lr and establish the frame pointer.
G_M52765_IG01:  ;; offset=0x0000
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
; IG02 - zero the accumulator, read the number of 32-bit lanes per vector (cntw),
; zero the index, compute the loop bound length + elems, and skip the loop when it is <= 0.
G_M52765_IG02:  ;; offset=0x0008
            movi    v16.4s, #0
            cntw    x2, all
            mov     w3, wzr
            add     w4, w1, w2
            cmp     w4, #0
            ble     G_M52765_IG05
						;; size=24 bbWeight=1 PerfScore 5.00
; IG03 - loop pre-header: x5 = sign-extended elems << 2, i.e. the per-iteration byte stride.
G_M52765_IG03:  ;; offset=0x0020
            sbfiz   x5, x2, #2, #32
            align   [0 bytes for IG04]
            align   [0 bytes]
            align   [0 bytes]
            align   [0 bytes]
						;; size=4 bbWeight=0.25 PerfScore 0.25
; IG04 - predicated loop:
;   * whilelt builds p7 once per pass: lanes whose index (w3 + lane) is below w1 are active;
;   * ld1w loads only the active lanes and zeroes the rest (p7/z);
;   * the merging add (p7/m) updates only the active accumulator lanes, which is how the
;     source-level ConditionalSelect(mask, Add(total, vec), total) shows up here;
;   * the cursor advances by the byte stride and the index by elems;
;   * the loop repeats while length + elems > index.
G_M52765_IG04:  ;; offset=0x0024
            whilelt p7.s, w3, w1
            ld1w    { z17.s }, p7/z, [x0]
            add     z16.s, p7/m, z16.s, z17.s
            add     x0, x0, x5
            add     w3, w3, w2
            cmp     w4, w3
            bgt     G_M52765_IG04
						;; size=28 bbWeight=4 PerfScore 54.00
; IG05 - reduction: with an all-true predicate, saddv sums every 32-bit lane of z16
; into the 64-bit scalar d16, which is then moved to the return register x0.
G_M52765_IG05:  ;; offset=0x0040
            ptrue   p7.s
            saddv   d16, p7, z16.s
            umov    x0, v16.d[0]
						;; size=12 bbWeight=1 PerfScore 6.00
; IG06 - epilog: restore fp/lr and return.
G_M52765_IG06:  ;; offset=0x004C
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 84, prolog size 8, PerfScore 68.75, instruction count 25, allocated bytes for code 84 (MethodHash=eac331e2) for method CodeGenTests.SveSum:sum_sve(ulong,int):int (FullOpts)
; ============================================================

sum_sve 501501
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment