Skip to content

Instantly share code, notes, and snippets.

@cocowalla
Created May 8, 2019 10:32
Show Gist options
  • Save cocowalla/bb7e735b988a8cfec9770f1a0ee6a4a6 to your computer and use it in GitHub Desktop.
Save cocowalla/bb7e735b988a8cfec9770f1a0ee6a4a6 to your computer and use it in GitHub Desktop.
SIMD XOR Optimisation
; Core CLR v4.6.27615.73 (coreclr.dll) on amd64.
MyClass..ctor()
; JIT-compiled body of the parameterless MyClass constructor.
; Prologue: save rbp, reserve 0x20 bytes of stack and point rbp at the frame.
L0000: push rbp
L0001: sub rsp, 0x20
L0005: lea rbp, [rsp+0x20]
; Spill the incoming rcx to the frame; it is reloaded below as the argument to Object..ctor.
L000a: mov [rbp+0x10], rcx
; Call the helper at 0x7ffc61fad9e0 only when the dword at [rip+0xbffb] is non-zero.
; NOTE(review): presumably a debugger/profiler hook emitted in unoptimized code - confirm.
L000e: cmp dword [rip+0xbffb], 0x0
L0015: jz L001c
L0017: call 0x7ffc61fad9e0
; Chain to the base class constructor with the saved rcx.
L001c: mov rcx, [rbp+0x10]
L0020: call System.Object..ctor()
L0025: nop
L0026: nop
; Epilogue: restore rsp and rbp, then return.
L0027: lea rsp, [rbp]
L002b: pop rbp
L002c: ret
MyClass.MyMethod(Byte[])
; JIT-compiled body of MyMethod. Every intermediate value is spilled to the stack frame
; and reloaded, and Vector256.Create is emitted as a real call rather than expanded inline.
; Prologue: save rbp/rdi/rsi, reserve 0x170 bytes, set rbp = rsp+0x180.
L0000: push rbp
L0001: push rdi
L0002: push rsi
L0003: sub rsp, 0x170
L000a: vzeroupper
L000d: lea rbp, [rsp+0x180]
; Zero 0x2c dwords (176 bytes) starting at rbp-0xc0 with rep stosd.
; rcx (the array argument) is parked in rsi because rep stosd uses ecx as its counter.
L0015: mov rsi, rcx
L0018: lea rdi, [rbp-0xc0]
L001f: mov ecx, 0x2c
L0024: xor eax, eax
L0026: rep stosd
L0028: mov rcx, rsi
; Spill the array argument to [rbp+0x10].
L002b: mov [rbp+0x10], rcx
; Call the helper at 0x7ffc61fad9e0 only when the dword at [rip+0xbf8a] is non-zero.
; NOTE(review): presumably a debugger/profiler hook emitted in unoptimized code - confirm.
L002f: cmp dword [rip+0xbf8a], 0x0
L0036: jz L003d
L0038: call 0x7ffc61fad9e0
L003d: nop
; Vector256.Create(e0, e1, e2, e3): rcx = address of the result buffer at rbp-0xf0,
; rdx/r8/r9 = first three constants, fourth constant passed on the stack at [rsp+0x20].
L003e: mov rcx, 0x1d8e4e27c47d124f
L0048: mov [rsp+0x20], rcx
L004d: lea rcx, [rbp-0xf0]
L0054: mov rdx, 0xe7037ed1a0b428db
L005e: mov r8, 0x8ebc6af09c88c6e3
L0068: mov r9, 0x589965cc75374cc3
L0072: call System.Runtime.Intrinsics.Vector256.Create(UInt64, UInt64, UInt64, UInt64)
; Copy the returned 256-bit value to [rbp-0x30] (primeVector in the C# source).
L0077: vmovupd ymm0, [rbp-0xf0]
L007f: vmovupd [rbp-0x30], ymm0
; fixed (byte* pData = array): keep the array reference in [rbp-0x40] (the pinned local).
L0084: mov rax, [rbp+0x10]
L0088: mov [rbp-0x40], rax
; If the array is null or its length field at [rax+0x8] is zero, the pointer becomes 0.
L008c: cmp qword [rbp+0x10], 0x0
L0091: jz L009d
L0093: mov rax, [rbp-0x40]
L0097: cmp dword [rax+0x8], 0x0
L009b: jnz L00a8
; Null/empty path: store a zero pointer in [rbp-0x38] (pData).
L009d: xor eax, eax
L009f: mov eax, eax
L00a1: mov [rbp-0x38], rax
L00a5: nop
L00a6: jmp L00d1
; Non-empty path: range-check index 0 against the length, then take the address of
; element 0 (array base + 0x10) and store it in [rbp-0x38] (pData).
L00a8: mov rax, [rbp-0x40]
L00ac: xor edx, edx
L00ae: cmp edx, [rax+0x8]
L00b1: jb L00b8
; Reached only if the range check fails.
; NOTE(review): presumably the runtime's range-check-failure helper - confirm.
L00b3: call 0x7ffc61fafd40
L00b8: mov ecx, edx
L00ba: lea rax, [rax+rcx+0x10]
L00bf: mov [rbp-0x158], rax
L00c6: mov rax, [rbp-0x158]
L00cd: mov [rbp-0x38], rax
L00d1: nop
; ptr = pData  ->  [rbp-0x48]
L00d2: mov rax, [rbp-0x38]
L00d6: mov [rbp-0x48], rax
; i = 0  ->  [rbp-0x4c]
L00da: xor eax, eax
L00dc: mov [rbp-0x4c], eax
L00df: nop
; Enter the loop at its condition check (L01cc).
L00e0: jmp L01cc
; ---- loop body ----
L00e5: nop
; Load 32 bytes from ptr + i and store them in [rbp-0x70] (vector).
L00e6: mov rax, [rbp-0x48]
L00ea: mov edx, [rbp-0x4c]
L00ed: movsxd rdx, edx
L00f0: vmovdqu ymm0, [rax+rdx]
L00f5: vmovupd [rbp-0x110], ymm0
L00fd: vmovupd ymm0, [rbp-0x110]
L0105: vmovupd [rbp-0x70], ymm0
; XOR primeVector with vector and store the result in [rbp-0x90] (res).
; NOTE(review): the listing prints xmm operands for vpxor although the surrounding loads
; and stores are 256-bit; possibly a disassembler display issue - confirm.
L010a: vmovupd ymm0, [rbp-0x30]
L010f: vpxor xmm0, xmm0, [rbp-0x70]
L0114: vmovupd [rbp-0x130], ymm0
L011c: vmovupd ymm0, [rbp-0x130]
L0124: vmovupd [rbp-0x90], ymm0
; res.GetElement(0): low 64 bits of the result -> [rbp-0x98] (xor1).
L012c: vmovdqu ymm0, [rbp-0x90]
L0134: vmovd rax, xmm0
L0139: mov [rbp-0x138], rax
L0140: mov rax, [rbp-0x138]
L0147: mov [rbp-0x98], rax
; res.GetElement(1) -> [rbp-0xa0] (xor2).
; The disassembler could not decode the instruction at L0156 ("invalid"); the two lines
; that follow it are likely mis-decoded bytes of the same instruction sequence.
; NOTE(review): presumably an extract of the second 64-bit lane into rax - confirm.
L014e: vmovdqu ymm0, [rbp-0x90]
L0156: invalid
L015a: rol byte [rcx], 0x48
L015d: mov [rbp-0x140], eax
L0163: mov rax, [rbp-0x140]
L016a: mov [rbp-0xa0], rax
; res.GetElement(2): take the upper 128 bits, then their low 64 bits -> [rbp-0xa8] (xor3).
L0171: vmovupd ymm0, [rbp-0x90]
L0179: vextractf128 xmm0, ymm0, 0x1
L017f: vmovd rax, xmm0
L0184: mov [rbp-0x148], rax
L018b: mov rax, [rbp-0x148]
L0192: mov [rbp-0xa8], rax
; res.GetElement(3) -> [rbp-0xb0] (xor4); same undecodable sequence as at L0156.
L0199: vmovupd ymm0, [rbp-0x90]
L01a1: vextractf128 xmm0, ymm0, 0x1
L01a7: invalid
L01ab: rol byte [rcx], 0x48
L01ae: mov [rbp-0x150], eax
L01b4: mov rax, [rbp-0x150]
L01bb: mov [rbp-0xb0], rax
L01c2: nop
; i += 0x20
L01c3: mov eax, [rbp-0x4c]
L01c6: add eax, 0x20
L01c9: mov [rbp-0x4c], eax
; ---- loop condition: (i + 0x20) <= array length ----
; The comparison result is materialized as a 0/1 value in [rbp-0xb4] before branching.
L01cc: mov eax, [rbp-0x4c]
L01cf: add eax, 0x20
L01d2: mov rdx, [rbp+0x10]
L01d6: cmp eax, [rdx+0x8]
L01d9: setle al
L01dc: movzx eax, al
L01df: mov [rbp-0xb4], eax
L01e5: cmp dword [rbp-0xb4], 0x0
; The branch target is printed as the method symbol rather than a label.
; NOTE(review): per the C#/IL loop this presumably jumps back to the loop body at L00e5 - confirm.
L01ec: jnz MyClass.MyMethod(Byte[])
L01f2: nop
; End of the fixed block: clear the pinned array local.
L01f3: xor eax, eax
L01f5: mov [rbp-0x40], rax
; return 0: the value goes through the stack slot [rbp-0xc0] before being loaded into rax.
L01f9: xor eax, eax
L01fb: movsxd rax, eax
L01fe: mov [rbp-0xc0], rax
L0205: nop
L0206: jmp L0208
L0208: mov rax, [rbp-0xc0]
; Epilogue: vzeroupper, restore rsp and the saved registers, return.
L020f: vzeroupper
L0212: lea rsp, [rbp-0x10]
L0216: pop rsi
L0217: pop rdi
L0218: pop rbp
L0219: ret
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
/// <summary>
/// Minimal sample used to inspect the IL and JIT output of an AVX2 XOR loop.
/// </summary>
public class MyClass {
/// <summary>
/// Walks <paramref name="array"/> in 32-byte steps, XORs each 256-bit chunk with a fixed
/// constant vector and extracts the four 64-bit lanes of the result.
/// </summary>
/// <remarks>
/// The extracted lanes are stored in locals that are never read, and the method always
/// returns 0. Bytes after the last full 32-byte chunk are not processed.
/// NOTE(review): the Avx/Avx2 intrinsics are called without an IsSupported check.
/// NOTE(review): a null <paramref name="array"/> still reaches <c>array.Length</c> in the
/// loop condition, because the fixed statement only yields a null pointer for it.
/// </remarks>
/// <param name="array">Bytes to process; pinned while the loop runs.</param>
/// <returns>Always 0.</returns>
public static unsafe ulong MyMethod(byte[] array) {
// Constant 256-bit operand, built once from four 64-bit values before the loop.
var primeVector = Vector256.Create(0xe7037ed1a0b428db, 0x8ebc6af09c88c6e3, 0x589965cc75374cc3, 0x1d8e4e27c47d124f);
// Pin the array so a raw pointer to its data can be used inside the block.
fixed (byte* pData = array)
{
byte* ptr = pData;
// Advance 32 bytes (one 256-bit vector) per iteration; stop before a partial chunk.
for (int i = 0; i + 32 <= array.Length; i += 32)
{
// Load 32 bytes at offset i as four 64-bit lanes.
var vector = Avx.LoadVector256((ulong*)(ptr + i));
var res = Avx2.Xor(primeVector, vector);
// Pull each 64-bit lane out of the vector; none of these locals is used afterwards.
ulong xor1 = res.GetElement(0);
ulong xor2 = res.GetElement(1);
ulong xor3 = res.GetElement(2);
ulong xor4 = res.GetElement(3);
}
}
return 0;
}
}
// Module-level class; it declares no members in this listing.
.class private auto ansi '<Module>'
{
} // end of class <Module>
// IL for MyClass: the static MyMethod(uint8[]) and the default instance constructor.
.class public auto ansi beforefieldinit MyClass
extends [System.Private.CoreLib]System.Object
{
// Methods
// MyMethod: XORs each 32-byte chunk of 'array' with a constant vector, extracts the four
// 64-bit elements of the result into locals, and returns 0.
.method public hidebysig static
uint64 MyMethod (
uint8[] 'array'
) cil managed
{
// Method begins at RVA 0x2050
// Code size 176 (0xb0)
.maxstack 4
// Locals as used by the stloc/ldloc instructions below, matched to the C# source:
//   [0] primeVector   [1] pData          [2] pinned copy of 'array'
//   [3] ptr           [4] i              [5] vector
//   [6] res           [7]-[10] xor1..xor4
//   [11] loop-condition result           [12] return value
.locals init (
[0] valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64>,
[1] uint8*,
[2] uint8[] pinned,
[3] uint8*,
[4] int32,
[5] valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64>,
[6] valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64>,
[7] uint64,
[8] uint64,
[9] uint64,
[10] uint64,
[11] bool,
[12] uint64
)
IL_0000: nop
// primeVector = Vector256.Create(...). The four operands are printed as signed 64-bit
// decimals; presumably the same constants as the hex literals in the C# source.
IL_0001: ldc.i8 -1800455987208640293
IL_000a: ldc.i8 -8161530843051276573
IL_0013: ldc.i8 6384245875588680899
IL_001c: ldc.i8 2129725606500045391
IL_0025: call valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64> [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256::Create(uint64, uint64, uint64, uint64)
IL_002a: stloc.0
// fixed (byte* pData = array): store the array in the pinned local, then branch to the
// null-pointer path when the array is null or has length 0.
IL_002b: ldarg.0
IL_002c: dup
IL_002d: stloc.2
IL_002e: brfalse.s IL_0035
IL_0030: ldloc.2
IL_0031: ldlen
IL_0032: conv.i4
IL_0033: brtrue.s IL_003a
// Null or empty array: pData = 0.
IL_0035: ldc.i4.0
IL_0036: conv.u
IL_0037: stloc.1
IL_0038: br.s IL_0043
// Non-empty array: pData = address of element 0.
IL_003a: ldloc.2
IL_003b: ldc.i4.0
IL_003c: ldelema [System.Private.CoreLib]System.Byte
IL_0041: conv.u
IL_0042: stloc.1
IL_0043: nop
// ptr = pData
IL_0044: ldloc.1
IL_0045: stloc.3
// i = 0
IL_0046: ldc.i4.0
IL_0047: stloc.s 4
// sequence point: hidden
IL_0049: br.s IL_0091
// loop start (head: IL_0091)
IL_004b: nop
// vector = Avx.LoadVector256(ptr + i)
IL_004c: ldloc.3
IL_004d: ldloc.s 4
IL_004f: add
IL_0050: call valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64> [System.Private.CoreLib]System.Runtime.Intrinsics.X86.Avx::LoadVector256(uint64*)
IL_0055: stloc.s 5
// res = Avx2.Xor(primeVector, vector)
IL_0057: ldloc.0
IL_0058: ldloc.s 5
IL_005a: call valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64> [System.Private.CoreLib]System.Runtime.Intrinsics.X86.Avx2::Xor(valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64>, valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64>)
IL_005f: stloc.s 6
// xor1 = res.GetElement(0)
IL_0061: ldloc.s 6
IL_0063: ldc.i4.0
IL_0064: call !!0 [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256::GetElement<uint64>(valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<!!0>, int32)
IL_0069: stloc.s 7
// xor2 = res.GetElement(1)
IL_006b: ldloc.s 6
IL_006d: ldc.i4.1
IL_006e: call !!0 [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256::GetElement<uint64>(valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<!!0>, int32)
IL_0073: stloc.s 8
// xor3 = res.GetElement(2)
IL_0075: ldloc.s 6
IL_0077: ldc.i4.2
IL_0078: call !!0 [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256::GetElement<uint64>(valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<!!0>, int32)
IL_007d: stloc.s 9
// xor4 = res.GetElement(3)
IL_007f: ldloc.s 6
IL_0081: ldc.i4.3
IL_0082: call !!0 [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256::GetElement<uint64>(valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<!!0>, int32)
IL_0087: stloc.s 10
IL_0089: nop
// i += 32
IL_008a: ldloc.s 4
IL_008c: ldc.i4.s 32
IL_008e: add
IL_008f: stloc.s 4
// Loop condition: !(i + 32 > array.Length), stored in local 11 and then tested.
IL_0091: ldloc.s 4
IL_0093: ldc.i4.s 32
IL_0095: add
IL_0096: ldarg.0
IL_0097: ldlen
IL_0098: conv.i4
IL_0099: cgt
IL_009b: ldc.i4.0
IL_009c: ceq
IL_009e: stloc.s 11
// sequence point: hidden
IL_00a0: ldloc.s 11
IL_00a2: brtrue.s IL_004b
// end loop
IL_00a4: nop
// sequence point: hidden
// End of the fixed block: clear the pinned local.
IL_00a5: ldnull
IL_00a6: stloc.2
// return 0 (via local 12)
IL_00a7: ldc.i4.0
IL_00a8: conv.i8
IL_00a9: stloc.s 12
IL_00ab: br.s IL_00ad
IL_00ad: ldloc.s 12
IL_00af: ret
} // end of method MyClass::MyMethod
// Default constructor: only calls System.Object::.ctor().
.method public hidebysig specialname rtspecialname
instance void .ctor () cil managed
{
// Method begins at RVA 0x210c
// Code size 8 (0x8)
.maxstack 8
IL_0000: ldarg.0
IL_0001: call instance void [System.Private.CoreLib]System.Object::.ctor()
IL_0006: nop
IL_0007: ret
} // end of method MyClass::.ctor
} // end of class MyClass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment