Skip to content

Instantly share code, notes, and snippets.

@kunalspathak
Last active April 2, 2020 09:20
Show Gist options
  • Save kunalspathak/398a14edae6367033a6656a8a62c6486 to your computer and use it in GitHub Desktop.
Save kunalspathak/398a14edae6367033a6656a8a62c6486 to your computer and use it in GitHub Desktop.

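Source code: https://source.dot.net/#System.Private.CoreLib/BitOperations.cs,203

ARM64 disassembly of the scalar bit-twiddling popcount (masks 0x55555555 / 0x33333333 / 0x0F0F0F0F, multiply by 0x01010101, shift right by 24), presumably generated from the source linked above. The listing starts after the frame-setup store and stops before the final `ret`: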

00007ff8`119ccce4 910003fd mov         fp,sp
00007ff8`119ccce8 53017c01 lsr         w1,w0,#1
00007ff8`119cccec 1200f021 and         w1,w1,#0x55555555
00007ff8`119cccf0 4b010000 sub         w0,w0,w1
00007ff8`119cccf4 1200e401 and         w1,w0,#0x33333333
00007ff8`119cccf8 53027c00 lsr         w0,w0,#2
00007ff8`119cccfc 1200e400 and         w0,w0,#0x33333333
00007ff8`119ccd00 0b000020 add         w0,w1,w0
00007ff8`119ccd04 53047c01 lsr         w1,w0,#4
00007ff8`119ccd08 0b010000 add         w0,w0,w1
00007ff8`119ccd0c 1200cc00 and         w0,w0,#0xF0F0F0F
00007ff8`119ccd10 3200c3e1 mov         w1,#0x1010101
00007ff8`119ccd14 1b017c00 mul         w0,w0,w1
00007ff8`119ccd18 53187c00 lsr         w0,w0,#0x18
00007ff8`119ccd1c a8c17bfd ldp         fp,lr,[sp],#0x10
/// <summary>
/// Counts the set bits of the static <c>value</c> field with the ARM64 AdvSimd
/// per-byte population count, loading the vector from memory.
/// </summary>
/// <returns>The number of set bits in the 32-bit <c>value</c>.</returns>
public static int Custom_PopCount() {
    // LoadVector64 always reads 8 bytes. The previous version pointed it at the
    // 4-byte array returned by BitConverter.GetBytes, so it read 4 bytes past
    // the end of the array data and counted whatever bits happened to be there
    // (and it allocated a new array on every call).
    // Widening into a 64-bit local gives the load 8 valid bytes whose upper
    // half is zero. The cast through uint keeps the upper half zero even if
    // `value` is declared as a signed int -- NOTE(review): the declaration of
    // `value` is not visible here; confirm its type.
    ulong widened = unchecked((uint)value);

    unsafe
    {
        // A local is a fixed variable, so its address can be taken without `fixed`.
        Vector64<byte> input = AdvSimd.LoadVector64((byte*)&widened);

        // Per-byte population count: each lane holds a value in 0..8.
        Vector64<byte> output = AdvSimd.PopCount(input);

        // Sum all 8 lanes into lane 0. The total is at most 64, so it
        // always fits in a byte.
        Vector64<byte> added = AdvSimd.Arm64.AddAcross(output);

        return added.ToScalar();
    }
}
Assembly code
; Assembly listing for method projs.PopCountTester:Custom_PopCount():int
; Emitting BLENDED_CODE for generic ARM64 CPU - Windows
; optimized code
; fp based frame
; partially interruptible
; Final local variable assignments
;
;  V00 loc0         [V00,T04] (  3,  2   )    long  ->   x0
;  V01 loc1         [V01    ] (  5,  3.50)     ref  ->  [fp+0x28]   must-init pinned class-hnd
;  V02 loc2         [V02    ] (  3,  3   )   simd8  ->  [fp+0x18]   HFA(double)  do-not-enreg[XS] must-init addr-exposed ld-addr-op
;# V03 OutArgs      [V03    ] (  1,  1   )  lclBlk ( 0) [sp+0x00]   "OutgoingArgSpace"
;  V04 tmp1         [V04,T00] (  3,  6   )     ref  ->   x0         class-hnd "dup spill"
;  V05 tmp2         [V05,T03] (  2,  4   )     int  ->  x19         "non-inline candidate call"
;  V06 tmp3         [V06,T05] (  2,  2   )    long  ->   x0         "Cast away GC"
;  V07 tmp4         [V07,T01] (  2,  4   )     ref  ->   x1         "argument with side effect"
;  V08 tmp5         [V08,T02] (  2,  4   )     ref  ->  x20         "argument with side effect"
;
; Lcl frame size = 32

; Prolog: allocate a 64-byte frame, save fp/lr and the callee-saved x19/x20,
; then zero the two must-init stack slots (V01 = pinned array reference at
; [fp+40], V02 = 8-byte SIMD local at [fp+24]).
G_M31327_IG01:
        A9BC7BFD          stp     fp, lr, [sp,#-64]!
        A90353F3          stp     x19, x20, [sp,#48]
        910003FD          mov     fp, sp
        F90017BF          str     xzr, [fp,#40] // [V01 loc1]
        F9000FBF          str     xzr, [fp,#24] // [V02 loc2]
                                                ;; bbWeight=1    PerfScore 4.50
; Materialize a 64-bit constant in x0 (three 16-bit pieces) and call the
; shared non-GC static base helper with w1 = 1. Then load a 32-bit value from
; absolute address 0x7ff7b2bf99ec and pass it to BitConverter.GetBytes(int).
; The returned array reference is stored into the pinned slot; a null
; reference branches to IG04.
G_M31327_IG02:
        D2933700          movz    x0, #0x99b8
        F2B657E0          movk    x0, #0xb2bf LSL #16
        F2CFFEE0          movk    x0, #0x7ff7 LSL #32
        52800021          mov     w1, #1
        97FF4D4F          bl      CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
        D2933D80          movz    x0, #0x99ec
        F2B657E0          movk    x0, #0xb2bf LSL #16
        F2CFFEE0          movk    x0, #0x7ff7 LSL #32
        B9400000          ldr     w0, [x0]
        97FFB3FC          bl      System.BitConverter:GetBytes(int):System.Byte[]
        F90017A0          str     x0, [fp,#40]  // [V01 loc1]
        B4000080          cbz     x0, G_M31327_IG04
                                                ;; bbWeight=1    PerfScore 10.50
; Non-null array: read the 32-bit field at offset 8 of the object (the same
; field is range-checked in IG05, so presumably the array length) and go to
; IG05 when it is non-zero.
G_M31327_IG03:
        F94017A0          ldr     x0, [fp,#40]  // [V01 loc1]
        B9400800          ldr     w0, [x0,#8]
        35000060          cbnz    w0, G_M31327_IG05
                                                ;; bbWeight=0.50 PerfScore 3.00
; Null reference or zero in that field: use 0 as the data pointer.
G_M31327_IG04:
        D2800000          mov     x0, #0
        14000007          b       G_M31327_IG06
                                                ;; bbWeight=0.50 PerfScore 0.75
; Re-read the field at offset 8 and compare it with 0; "bls" (unsigned lower
; or same) sends the failing case to the range-check helper in IG08.
; Otherwise the data pointer is the object address + 16.
G_M31327_IG05:
        F94017A0          ldr     x0, [fp,#40]  // [V01 loc1]
        B9400800          ldr     w0, [x0,#8]
        7100001F          cmp     w0, #0
        54000369          bls     G_M31327_IG08
        F94017A0          ldr     x0, [fp,#40]  // [V01 loc1]
        91004000          add     x0, x0, #16
                                                ;; bbWeight=0.50 PerfScore 4.50
; Main body. ld1 reads 8 bytes from the data pointer into v0; the vector is
; spilled to V02 and reloaded, then cnt produces a per-byte bit count and addv
; sums the 8 lanes into b0. GetElement is emitted as a real call with index 0
; in w0; its result is kept in callee-saved w19.
; The remaining calls (Vector64.ToString, String.Concat, Console.WriteLine)
; have no counterpart in the C# shown for this method, so this listing was
; presumably captured from a variant that also printed the vector.
; Finally the pinned slot is cleared and the saved result is moved to w0.
G_M31327_IG06:
        0C407000          ld1     {v0.8b}, [x0]
        FD000FA0          str     d0, [fp,#24]  // [V02 loc2]
        FD400FA0          ldr     d0, [fp,#24]  // [V02 loc2]
        0E205800          cnt     v0.8b, v0.8b
        0E31B800          addv    b0, v0.8b
        52800000          mov     w0, #0
        97FFE550          bl      System.Runtime.Intrinsics.Vector64:GetElement(System.Runtime.Intrinsics.Vector64`1[Byte],int):ubyte
        2A0003F3          mov     w19, w0
        D2863000          movz    x0, #0x3180
        F2ADA800          movk    x0, #0x6d40 LSL #16
        F2C02D20          movk    x0, #361 LSL #32
        F9400014          ldr     x20, [x0]
        910063A0          add     x0, fp, #24   // [V02 loc2]
        97FFE7DB          bl      System.Runtime.Intrinsics.Vector64`1[Byte][System.Byte]:ToString():System.String:this
        AA0003E1          mov     x1, x0
        AA1403E0          mov     x0, x20
        97FF522E          bl      System.String:Concat(System.String,System.String):System.String
        97FFE5CD          bl      System.Console:WriteLine(System.String)
        D2800000          mov     x0, #0
        F90017A0          str     x0, [fp,#40]  // [V01 loc1]
        2A1303E0          mov     w0, w19
                                                ;; bbWeight=1    PerfScore 21.00
; Epilog: restore x19/x20 and fp/lr, release the 64-byte frame, return.
G_M31327_IG07:
        A94353F3          ldp     x19, x20, [sp,#48]
        A8C47BFD          ldp     fp, lr, [sp],#64
        D65F03C0          ret     lr
                                                ;; bbWeight=1    PerfScore 3.00
; Cold path (bbWeight=0): range-check failure helper, followed by a
; breakpoint instruction.
G_M31327_IG08:
        97FF4D20          bl      CORINFO_HELP_RNGCHKFAIL
        D43E0000          bkpt
                                                ;; bbWeight=0    PerfScore 0.00

; Total bytes of code 216, prolog size 20, PerfScore 68.85, (MethodHash=48b785a0) for method projs.PopCountTester:Custom_PopCount():int
; ============================================================
/// <summary>
/// Counts the set bits of the static <c>value</c> field without touching the
/// heap: the value is placed in a 64-bit vector, bits are counted per byte,
/// and the per-byte counts are summed across the lanes.
/// </summary>
/// <returns>The byte in lane 0 of the summed vector, widened to int.</returns>
public static int Custom2_PopCount() {
    // CreateScalar puts `value` in element 0; the vector is then
    // reinterpreted as 8 byte lanes so the byte-wise PopCount can be applied.
    Vector64<byte> lanes = Vector64.CreateScalar(value).AsByte();

    // One bit count per byte lane.
    Vector64<byte> perByteCounts = AdvSimd.PopCount(lanes);

    // Horizontal add: the sum of all lanes is read back from lane 0.
    Vector64<byte> total = AdvSimd.Arm64.AddAcross(perByteCounts);

    return total.GetElement(0);
}
Assembly code
; Assembly listing for method projs.PopCountTester:Custom2_PopCount():int
; Emitting BLENDED_CODE for generic ARM64 CPU - Windows
; optimized code
; fp based frame
; partially interruptible
; Final local variable assignments
;
;# V00 OutArgs      [V00    ] (  1,  1   )  lclBlk ( 0) [sp+0x00]   "OutgoingArgSpace"
;  V01 tmp1         [V01,T01] (  2,  4   )   simd8  ->  [fp+0x28]   HFA(double)  do-not-enreg[SF] "struct address for call/obj"
;  V02 tmp2         [V02,T02] (  3,  3   )   simd8  ->  [fp+0x18]   HFA(double)  do-not-enreg[SF] ld-addr-op "Inline stloc first use temp"
;  V03 tmp3         [V03,T00] (  2,  4   )     int  ->   x0         "Inlining Arg"
;  V04 tmp4         [V04,T03] (  2,  2   )   simd8  ->   d0         HFA(double)  ld-addr-op "Inline ldloca(s) first use temp"
;
; Lcl frame size = 32

; Prolog: allocate a 48-byte frame, save fp/lr, set up the frame pointer.
G_M62221_IG01:
        A9BD7BFD          stp     fp, lr, [sp,#-48]!
        910003FD          mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
; Load a 32-bit value from absolute address 0x7ff7b2bf99ec into w0.
; Build the vector through the stack: write an all-zero 8-byte vector to
; [fp+24], overwrite its low 4 bytes with w0, and reload it as d0. The vector
; is then copied once more through [fp+40] before use.
; cnt produces a per-byte bit count, addv sums the 8 lanes into b0, and
; GetElement is emitted as a real call with index 0 in w0.
G_M62221_IG02:
        D2933D80          movz    x0, #0x99ec
        F2B657E0          movk    x0, #0xb2bf LSL #16
        F2CFFEE0          movk    x0, #0x7ff7 LSL #32
        B9400000          ldr     w0, [x0]
        0E040FE0          dup     v0.2s, wzr
        FD000FA0          str     d0, [fp,#24]
        B9001BA0          str     w0, [fp,#24]
        FD400FA0          ldr     d0, [fp,#24]
        FD0017A0          str     d0, [fp,#40]
        FD4017A0          ldr     d0, [fp,#40]
        0E205800          cnt     v0.8b, v0.8b
        0E31B800          addv    b0, v0.8b
        52800000          mov     w0, #0
        97FFAE89          bl      System.Runtime.Intrinsics.Vector64:GetElement(System.Runtime.Intrinsics.Vector64`1[Byte],int):ubyte
                                                ;; bbWeight=1    PerfScore 18.00
; Epilog: w0 is not written after the GetElement call, so that call's result
; is what the method returns. Restore fp/lr and release the frame.
G_M62221_IG03:
        A8C37BFD          ldp     fp, lr, [sp],#48
        D65F03C0          ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

; Total bytes of code 72, prolog size 8, PerfScore 28.70, (MethodHash=26090cf2) for method projs.PopCountTester:Custom2_PopCount():int
; ============================================================
|           Method |      Mean |     Error |    StdDev |    Median |       Min |       Max |  Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------------- |----------:|----------:|----------:|----------:|----------:|----------:|-------:|------:|------:|----------:|
| Custom1_PopCount | 44.542 ns | 0.1219 ns | 0.1141 ns | 44.532 ns | 44.372 ns | 44.751 ns | 0.0007 |     - |     - |      32 B |
| Custom2_PopCount | 30.020 ns | 0.0035 ns | 0.0032 ns | 30.022 ns | 30.014 ns | 30.025 ns |      - |     - |     - |         - |
|  Actual_PopCount |  5.004 ns | 0.0016 ns | 0.0015 ns |  5.004 ns |  5.002 ns |  5.006 ns |      - |     - |     - |         - |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment