Skip to content

Instantly share code, notes, and snippets.

@tannergooding
Last active August 6, 2016 17:41
Show Gist options
  • Save tannergooding/08702b99b26447b9e30e2126bba2c966 to your computer and use it in GitHub Desktop.
MemMove using XMM and optimized block copy.
00007FF953D04AC3 mov rdx,r8
00007FF953D04AC6 cmp rdx,10h
00007FF953D04ACA ja 00007FF953D04C43
00007FF953D04AD0 movsxd rdx,edx
00007FF953D04AD3 cmp rdx,10h
00007FF953D04AD7 ja 00007FF953D04C43
00007FF953D04ADD lea r8,[7FF953D04E18h]
00007FF953D04AE4 mov r8d,dword ptr [r8+rdx*4]
00007FF953D04AE8 lea rax,[7FF953D04AC3h]
00007FF953D04AEF add r8,rax
00007FF953D04AF2 jmp r8
00007FF953D04AF5 jmp 00007FF953D04E11
00007FF953D04AFA movzx r9d,byte ptr [r9]
00007FF953D04AFE mov byte ptr [rcx],r9b
00007FF953D04B01 jmp 00007FF953D04E11
00007FF953D04B06 movzx r9d,word ptr [r9]
00007FF953D04B0A mov word ptr [rcx],r9w
00007FF953D04B0E jmp 00007FF953D04E11
00007FF953D04B13 movzx edx,word ptr [r9]
00007FF953D04B17 mov word ptr [rcx],dx
00007FF953D04B1A movzx r9d,byte ptr [r9+2]
00007FF953D04B1F mov byte ptr [rcx+2],r9b
00007FF953D04B23 jmp 00007FF953D04E11
00007FF953D04B28 mov r9d,dword ptr [r9]
00007FF953D04B2B mov dword ptr [rcx],r9d
00007FF953D04B2E jmp 00007FF953D04E11
00007FF953D04B33 mov edx,dword ptr [r9]
00007FF953D04B36 mov dword ptr [rcx],edx
00007FF953D04B38 movzx r9d,byte ptr [r9+4]
00007FF953D04B3D mov byte ptr [rcx+4],r9b
00007FF953D04B41 jmp 00007FF953D04E11
00007FF953D04B46 mov edx,dword ptr [r9]
00007FF953D04B49 mov dword ptr [rcx],edx
00007FF953D04B4B movzx r9d,word ptr [r9+4]
00007FF953D04B50 mov word ptr [rcx+4],r9w
00007FF953D04B55 jmp 00007FF953D04E11
00007FF953D04B5A mov edx,dword ptr [r9]
00007FF953D04B5D mov dword ptr [rcx],edx
00007FF953D04B5F movzx edx,word ptr [r9+4]
00007FF953D04B64 mov word ptr [rcx+4],dx
00007FF953D04B68 movzx r9d,byte ptr [r9+6]
00007FF953D04B6D mov byte ptr [rcx+6],r9b
00007FF953D04B71 jmp 00007FF953D04E11
00007FF953D04B76 mov r9,qword ptr [r9]
00007FF953D04B79 mov qword ptr [rcx],r9
00007FF953D04B7C jmp 00007FF953D04E11
00007FF953D04B81 mov rdx,qword ptr [r9]
00007FF953D04B84 mov qword ptr [rcx],rdx
00007FF953D04B87 movzx r9d,byte ptr [r9+8]
00007FF953D04B8C mov byte ptr [rcx+8],r9b
00007FF953D04B90 jmp 00007FF953D04E11
00007FF953D04B95 mov rdx,qword ptr [r9]
00007FF953D04B98 mov qword ptr [rcx],rdx
00007FF953D04B9B movzx r9d,word ptr [r9+8]
00007FF953D04BA0 mov word ptr [rcx+8],r9w
00007FF953D04BA5 jmp 00007FF953D04E11
00007FF953D04BAA mov rdx,qword ptr [r9]
00007FF953D04BAD mov qword ptr [rcx],rdx
00007FF953D04BB0 movzx edx,word ptr [r9+8]
00007FF953D04BB5 mov word ptr [rcx+8],dx
00007FF953D04BB9 movzx r9d,byte ptr [r9+0Ah]
00007FF953D04BBE mov byte ptr [rcx+0Ah],r9b
00007FF953D04BC2 jmp 00007FF953D04E11
00007FF953D04BC7 mov rdx,qword ptr [r9]
00007FF953D04BCA mov qword ptr [rcx],rdx
00007FF953D04BCD mov r9d,dword ptr [r9+8]
00007FF953D04BD1 mov dword ptr [rcx+8],r9d
00007FF953D04BD5 jmp 00007FF953D04E11
00007FF953D04BDA mov rdx,qword ptr [r9]
00007FF953D04BDD mov qword ptr [rcx],rdx
00007FF953D04BE0 mov edx,dword ptr [r9+8]
00007FF953D04BE4 mov dword ptr [rcx+8],edx
00007FF953D04BE7 movzx r9d,byte ptr [r9+0Ch]
00007FF953D04BEC mov byte ptr [rcx+0Ch],r9b
00007FF953D04BF0 jmp 00007FF953D04E11
00007FF953D04BF5 mov rdx,qword ptr [r9]
00007FF953D04BF8 mov qword ptr [rcx],rdx
00007FF953D04BFB mov edx,dword ptr [r9+8]
00007FF953D04BFF mov dword ptr [rcx+8],edx
00007FF953D04C02 movzx r9d,word ptr [r9+0Ch]
00007FF953D04C07 mov word ptr [rcx+0Ch],r9w
00007FF953D04C0C jmp 00007FF953D04E11
00007FF953D04C11 mov rdx,qword ptr [r9]
00007FF953D04C14 mov qword ptr [rcx],rdx
00007FF953D04C17 mov edx,dword ptr [r9+8]
00007FF953D04C1B mov dword ptr [rcx+8],edx
00007FF953D04C1E movzx edx,word ptr [r9+0Ch]
00007FF953D04C23 mov word ptr [rcx+0Ch],dx
00007FF953D04C27 movzx r9d,byte ptr [r9+0Eh]
00007FF953D04C2C mov byte ptr [rcx+0Eh],r9b
00007FF953D04C30 jmp 00007FF953D04E11
00007FF953D04C35 movdqu xmm0,xmmword ptr [r9]
00007FF953D04C3A movdqu xmmword ptr [rcx],xmm0
00007FF953D04C3E jmp 00007FF953D04E11
00007FF953D04C43 cmp r8,20h
00007FF953D04C47 ja 00007FF953D04C69
00007FF953D04C49 movdqu xmm0,xmmword ptr [r9]
00007FF953D04C4E movdqu xmmword ptr [rcx],xmm0
00007FF953D04C52 lea rdx,[r9+r8-10h]
00007FF953D04C57 lea rcx,[rcx+r8-10h]
00007FF953D04C5C movdqu xmm0,xmmword ptr [rdx]
00007FF953D04C60 movdqu xmmword ptr [rcx],xmm0
00007FF953D04C64 jmp 00007FF953D04E11
00007FF953D04C69 mov rax,rcx
00007FF953D04C6C xor edx,edx
00007FF953D04C6E mov rdx,rcx
00007FF953D04C71 and rdx,0Fh
00007FF953D04C75 test rdx,rdx
00007FF953D04C78 je 00007FF953D04CA1
00007FF953D04C7A movdqu xmm0,xmmword ptr [r9]
00007FF953D04C7F movdqu xmmword ptr [rcx],xmm0
00007FF953D04C83 lea rax,[r9+rdx]
00007FF953D04C87 lea r10,[rcx+rdx]
00007FF953D04C8B movdqu xmm0,xmmword ptr [rax]
00007FF953D04C8F movdqu xmmword ptr [r10],xmm0
00007FF953D04C94 add rdx,10h
00007FF953D04C98 sub r8,rdx
00007FF953D04C9B add r9,rdx
00007FF953D04C9E add rcx,rdx
00007FF953D04CA1 cmp r8,40h
00007FF953D04CA5 jbe 00007FF953D04CFB
00007FF953D04CA7 mov rax,r8
00007FF953D04CAA xor edx,edx
00007FF953D04CAC mov rdx,r8
00007FF953D04CAF shr rdx,6
00007FF953D04CB3 xor eax,eax
00007FF953D04CB5 test rdx,rdx
00007FF953D04CB8 jbe 00007FF953D04CF4
00007FF953D04CBA movdqu xmm0,xmmword ptr [r9]
00007FF953D04CBF movdqu xmmword ptr [rcx],xmm0
00007FF953D04CC3 movdqu xmm0,xmmword ptr [r9+10h]
00007FF953D04CC9 movdqu xmmword ptr [rcx+10h],xmm0
00007FF953D04CCE movdqu xmm0,xmmword ptr [r9+20h]
00007FF953D04CD4 movdqu xmmword ptr [rcx+20h],xmm0
00007FF953D04CD9 movdqu xmm0,xmmword ptr [r9+30h]
00007FF953D04CDF movdqu xmmword ptr [rcx+30h],xmm0
00007FF953D04CE4 add r9,40h
00007FF953D04CE8 add rcx,40h
00007FF953D04CEC inc rax
00007FF953D04CEF cmp rax,rdx
00007FF953D04CF2 jb 00007FF953D04CBA
00007FF953D04CF4 shl rdx,6
00007FF953D04CF8 sub r8,rdx
00007FF953D04CFB test r8,r8
00007FF953D04CFE jne 00007FF953D04D05
00007FF953D04D00 jmp 00007FF953D04E11
00007FF953D04D05 mov rax,r8
00007FF953D04D08 xor edx,edx
00007FF953D04D0A mov rax,r8
00007FF953D04D0D shr rax,4
00007FF953D04D11 mov rdx,rax
00007FF953D04D14 shl rdx,4
00007FF953D04D18 sub r8,rdx
00007FF953D04D1B mov rdx,r8
00007FF953D04D1E cmp rax,4
00007FF953D04D22 jbe 00007FF953D04D29
00007FF953D04D24 jmp 00007FF953D04E11
00007FF953D04D29 movsxd rax,eax
00007FF953D04D2C cmp rax,4
00007FF953D04D30 ja 00007FF953D04D4A
00007FF953D04D32 lea r8,[7FF953D04E5Ch]
00007FF953D04D39 mov r8d,dword ptr [r8+rax*4]
00007FF953D04D3D lea r10,[7FF953D04AC3h]
00007FF953D04D44 add r8,r10
00007FF953D04D47 jmp r8
00007FF953D04D4A jmp 00007FF953D04E11
00007FF953D04D4F lea r9,[r9+rdx-10h]
00007FF953D04D54 lea rcx,[rcx+rdx-10h]
00007FF953D04D59 movdqu xmm0,xmmword ptr [r9]
00007FF953D04D5E movdqu xmmword ptr [rcx],xmm0
00007FF953D04D62 jmp 00007FF953D04E11
00007FF953D04D67 movdqu xmm0,xmmword ptr [r9]
00007FF953D04D6C movdqu xmmword ptr [rcx],xmm0
00007FF953D04D70 add r9,rdx
00007FF953D04D73 add rcx,rdx
00007FF953D04D76 movdqu xmm0,xmmword ptr [r9]
00007FF953D04D7B movdqu xmmword ptr [rcx],xmm0
00007FF953D04D7F jmp 00007FF953D04E11
00007FF953D04D84 movdqu xmm0,xmmword ptr [r9]
00007FF953D04D89 movdqu xmmword ptr [rcx],xmm0
00007FF953D04D8D movdqu xmm0,xmmword ptr [r9+10h]
00007FF953D04D93 movdqu xmmword ptr [rcx+10h],xmm0
00007FF953D04D98 lea r9,[r9+rdx+10h]
00007FF953D04D9D lea rcx,[rcx+rdx+10h]
00007FF953D04DA2 movdqu xmm0,xmmword ptr [r9]
00007FF953D04DA7 movdqu xmmword ptr [rcx],xmm0
00007FF953D04DAB jmp 00007FF953D04E11
00007FF953D04DAD movdqu xmm0,xmmword ptr [r9]
00007FF953D04DB2 movdqu xmmword ptr [rcx],xmm0
00007FF953D04DB6 movdqu xmm0,xmmword ptr [r9+10h]
00007FF953D04DBC movdqu xmmword ptr [rcx+10h],xmm0
00007FF953D04DC1 lea rax,[r9+20h]
00007FF953D04DC5 lea r8,[rcx+20h]
00007FF953D04DC9 movdqu xmm0,xmmword ptr [rax]
00007FF953D04DCD movdqu xmmword ptr [r8],xmm0
00007FF953D04DD2 lea r9,[r9+rdx+20h]
00007FF953D04DD7 lea rcx,[rcx+rdx+20h]
00007FF953D04DDC movdqu xmm0,xmmword ptr [r9]
00007FF953D04DE1 movdqu xmmword ptr [rcx],xmm0
00007FF953D04DE5 jmp 00007FF953D04E11
00007FF953D04DE7 movdqu xmm0,xmmword ptr [r9]
00007FF953D04DEC movdqu xmmword ptr [rcx],xmm0
00007FF953D04DF0 movdqu xmm0,xmmword ptr [r9+10h]
00007FF953D04DF6 movdqu xmmword ptr [rcx+10h],xmm0
00007FF953D04DFB movdqu xmm0,xmmword ptr [r9+20h]
00007FF953D04E01 movdqu xmmword ptr [rcx+20h],xmm0
00007FF953D04E06 movdqu xmm0,xmmword ptr [r9+30h]
00007FF953D04E0C movdqu xmmword ptr [rcx+30h],xmm0
00007FF953D04E11 ret
switch (len)
00007FF953D04AC3 mov rdx,r8
00007FF953D04AC6 cmp rdx,10h
00007FF953D04ACA ja 00007FF953D04C43
00007FF953D04AD0 movsxd rdx,edx
00007FF953D04AD3 cmp rdx,10h
00007FF953D04AD7 ja 00007FF953D04C43
00007FF953D04ADD lea r8,[7FF953D04E18h]
00007FF953D04AE4 mov r8d,dword ptr [r8+rdx*4]
00007FF953D04AE8 lea rax,[7FF953D04AC3h]
00007FF953D04AEF add r8,rax
00007FF953D04AF2 jmp r8
{
case 0:
{
return;
00007FF953D04AF5 jmp 00007FF953D04E11
}
case 1:
{
*dst = *src;
00007FF953D04AFA movzx r9d,byte ptr [r9]
00007FF953D04AFE mov byte ptr [rcx],r9b
return;
00007FF953D04B01 jmp 00007FF953D04E11
}
case 2:
{
*(ushort*)(dst) = *(ushort*)(src);
00007FF953D04B06 movzx r9d,word ptr [r9]
00007FF953D04B0A mov word ptr [rcx],r9w
return;
00007FF953D04B0E jmp 00007FF953D04E11
}
case 3:
{
*(ushort*)(dst) = *(ushort*)(src);
00007FF953D04B13 movzx edx,word ptr [r9]
00007FF953D04B17 mov word ptr [rcx],dx
*(dst + sizeof(ushort)) = *(src + sizeof(ushort));
00007FF953D04B1A movzx r9d,byte ptr [r9+2]
00007FF953D04B1F mov byte ptr [rcx+2],r9b
return;
00007FF953D04B23 jmp 00007FF953D04E11
}
case 4:
{
*(uint*)(dst) = *(uint*)(src);
00007FF953D04B28 mov r9d,dword ptr [r9]
00007FF953D04B2B mov dword ptr [rcx],r9d
return;
00007FF953D04B2E jmp 00007FF953D04E11
}
case 5:
{
*(uint*)(dst) = *(uint*)(src);
00007FF953D04B33 mov edx,dword ptr [r9]
00007FF953D04B36 mov dword ptr [rcx],edx
*(dst + sizeof(uint)) = *(src + sizeof(uint));
00007FF953D04B38 movzx r9d,byte ptr [r9+4]
00007FF953D04B3D mov byte ptr [rcx+4],r9b
return;
00007FF953D04B41 jmp 00007FF953D04E11
}
case 6:
{
*(uint*)(dst) = *(uint*)(src);
00007FF953D04B46 mov edx,dword ptr [r9]
00007FF953D04B49 mov dword ptr [rcx],edx
*(ushort*)(dst + sizeof(uint)) = *(ushort*)(src + sizeof(uint));
00007FF953D04B4B movzx r9d,word ptr [r9+4]
00007FF953D04B50 mov word ptr [rcx+4],r9w
return;
00007FF953D04B55 jmp 00007FF953D04E11
}
case 7:
{
*(uint*)(dst) = *(uint*)(src);
00007FF953D04B5A mov edx,dword ptr [r9]
00007FF953D04B5D mov dword ptr [rcx],edx
*(ushort*)(dst + sizeof(uint)) = *(ushort*)(src + sizeof(uint));
00007FF953D04B5F movzx edx,word ptr [r9+4]
00007FF953D04B64 mov word ptr [rcx+4],dx
*(dst + sizeof(uint) + sizeof(ushort)) = *(src + sizeof(uint) + sizeof(ushort));
00007FF953D04B68 movzx r9d,byte ptr [r9+6]
00007FF953D04B6D mov byte ptr [rcx+6],r9b
return;
00007FF953D04B71 jmp 00007FF953D04E11
}
case 8:
{
*(ulong*)(dst) = *(ulong*)(src);
00007FF953D04B76 mov r9,qword ptr [r9]
00007FF953D04B79 mov qword ptr [rcx],r9
return;
00007FF953D04B7C jmp 00007FF953D04E11
}
case 9:
{
*(ulong*)(dst) = *(ulong*)(src);
00007FF953D04B81 mov rdx,qword ptr [r9]
00007FF953D04B84 mov qword ptr [rcx],rdx
*(dst + sizeof(ulong)) = *(src + sizeof(ulong));
00007FF953D04B87 movzx r9d,byte ptr [r9+8]
00007FF953D04B8C mov byte ptr [rcx+8],r9b
return;
00007FF953D04B90 jmp 00007FF953D04E11
}
case 10:
{
*(ulong*)(dst) = *(ulong*)(src);
00007FF953D04B95 mov rdx,qword ptr [r9]
00007FF953D04B98 mov qword ptr [rcx],rdx
*(ushort*)(dst + sizeof(ulong)) = *(ushort*)(src + sizeof(ulong));
00007FF953D04B9B movzx r9d,word ptr [r9+8]
00007FF953D04BA0 mov word ptr [rcx+8],r9w
return;
00007FF953D04BA5 jmp 00007FF953D04E11
}
case 11:
{
*(ulong*)(dst) = *(ulong*)(src);
00007FF953D04BAA mov rdx,qword ptr [r9]
00007FF953D04BAD mov qword ptr [rcx],rdx
*(ushort*)(dst + sizeof(ulong)) = *(ushort*)(src + sizeof(ulong));
00007FF953D04BB0 movzx edx,word ptr [r9+8]
00007FF953D04BB5 mov word ptr [rcx+8],dx
*(dst + sizeof(ulong) + sizeof(ushort)) = *(src + sizeof(ulong) + sizeof(ushort));
00007FF953D04BB9 movzx r9d,byte ptr [r9+0Ah]
00007FF953D04BBE mov byte ptr [rcx+0Ah],r9b
return;
00007FF953D04BC2 jmp 00007FF953D04E11
}
case 12:
{
*(ulong*)(dst) = *(ulong*)(src);
00007FF953D04BC7 mov rdx,qword ptr [r9]
00007FF953D04BCA mov qword ptr [rcx],rdx
*(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
00007FF953D04BCD mov r9d,dword ptr [r9+8]
00007FF953D04BD1 mov dword ptr [rcx+8],r9d
return;
00007FF953D04BD5 jmp 00007FF953D04E11
}
case 13:
{
*(ulong*)(dst) = *(ulong*)(src);
00007FF953D04BDA mov rdx,qword ptr [r9]
00007FF953D04BDD mov qword ptr [rcx],rdx
*(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
00007FF953D04BE0 mov edx,dword ptr [r9+8]
00007FF953D04BE4 mov dword ptr [rcx+8],edx
*(dst + sizeof(ulong) + sizeof(uint)) = *(src + sizeof(ulong) + sizeof(uint));
00007FF953D04BE7 movzx r9d,byte ptr [r9+0Ch]
00007FF953D04BEC mov byte ptr [rcx+0Ch],r9b
return;
00007FF953D04BF0 jmp 00007FF953D04E11
}
case 14:
{
*(ulong*)(dst) = *(ulong*)(src);
00007FF953D04BF5 mov rdx,qword ptr [r9]
00007FF953D04BF8 mov qword ptr [rcx],rdx
*(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
00007FF953D04BFB mov edx,dword ptr [r9+8]
00007FF953D04BFF mov dword ptr [rcx+8],edx
*(ushort*)(dst + sizeof(ulong) + sizeof(uint)) = *(ushort*)(src + sizeof(ulong) + sizeof(uint));
00007FF953D04C02 movzx r9d,word ptr [r9+0Ch]
00007FF953D04C07 mov word ptr [rcx+0Ch],r9w
return;
00007FF953D04C0C jmp 00007FF953D04E11
}
case 15:
{
*(ulong*)(dst) = *(ulong*)(src);
00007FF953D04C11 mov rdx,qword ptr [r9]
00007FF953D04C14 mov qword ptr [rcx],rdx
*(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
00007FF953D04C17 mov edx,dword ptr [r9+8]
00007FF953D04C1B mov dword ptr [rcx+8],edx
*(ushort*)(dst + sizeof(ulong) + sizeof(uint)) = *(ushort*)(src + sizeof(ulong) + sizeof(uint));
00007FF953D04C1E movzx edx,word ptr [r9+0Ch]
00007FF953D04C23 mov word ptr [rcx+0Ch],dx
*(dst + sizeof(ulong) + sizeof(uint) + sizeof(ushort)) = *(src + sizeof(ulong) + sizeof(uint) + sizeof(ushort));
00007FF953D04C27 movzx r9d,byte ptr [r9+0Eh]
00007FF953D04C2C mov byte ptr [rcx+0Eh],r9b
return;
00007FF953D04C30 jmp 00007FF953D04E11
}
case 16:
{
*(UInt128*)(dst) = *(UInt128*)(src);
00007FF953D04C35 movdqu xmm0,xmmword ptr [r9]
00007FF953D04C3A movdqu xmmword ptr [rcx],xmm0
return;
00007FF953D04C3E jmp 00007FF953D04E11
}
}
if (len <= 32)
00007FF953D04C43 cmp r8,20h
00007FF953D04C47 ja 00007FF953D04C69
{
// We can do this in two writes. Note that one or both of these writes may be misaligned
*(UInt128*)(dst) = *(UInt128*)(src);
00007FF953D04C49 movdqu xmm0,xmmword ptr [r9]
00007FF953D04C4E movdqu xmmword ptr [rcx],xmm0
*(UInt128*)(dst + len - sizeof_UInt128) = *(UInt128*)(src + len - sizeof_UInt128);
00007FF953D04C52 lea rdx,[r9+r8-10h]
00007FF953D04C57 lea rcx,[rcx+r8-10h]
00007FF953D04C5C movdqu xmm0,xmmword ptr [rdx]
00007FF953D04C60 movdqu xmmword ptr [rcx],xmm0
return;
00007FF953D04C64 jmp 00007FF953D04E11
}
var misalignment = ((nuint)(dst) % sizeof_UInt128);
00007FF953D04C69 mov rax,rcx
00007FF953D04C6C xor edx,edx
00007FF953D04C6E mov rdx,rcx
00007FF953D04C71 and rdx,0Fh
if (misalignment != 0)
00007FF953D04C75 test rdx,rdx
00007FF953D04C78 je 00007FF953D04CA1
{
*(UInt128*)(dst) = *(UInt128*)(src);
00007FF953D04C7A movdqu xmm0,xmmword ptr [r9]
00007FF953D04C7F movdqu xmmword ptr [rcx],xmm0
*(UInt128*)(dst + misalignment) = *(UInt128*)(src + misalignment);
00007FF953D04C83 lea rax,[r9+rdx]
00007FF953D04C87 lea r10,[rcx+rdx]
00007FF953D04C8B movdqu xmm0,xmmword ptr [rax]
00007FF953D04C8F movdqu xmmword ptr [r10],xmm0
var initialOffset = (sizeof_UInt128 + misalignment);
00007FF953D04C94 add rdx,10h
len -= initialOffset;
00007FF953D04C98 sub r8,rdx
src += initialOffset;
00007FF953D04C9B add r9,rdx
dst += initialOffset;
00007FF953D04C9E add rcx,rdx
}
#if BIT64
const nuint blockSize = sizeof_UInt512;
#else
const nuint blockSize = sizeof_UInt256;
#endif
if (len > blockSize)
00007FF953D04CA1 cmp r8,40h
00007FF953D04CA5 jbe 00007FF953D04CFB
{
var iterations = (len / blockSize);
00007FF953D04CA7 mov rax,r8
00007FF953D04CAA xor edx,edx
00007FF953D04CAC mov rdx,r8
00007FF953D04CAF shr rdx,6
for (var iteration = 0ul; iteration < iterations; iteration++)
00007FF953D04CB3 xor eax,eax
00007FF953D04CB5 test rdx,rdx
00007FF953D04CB8 jbe 00007FF953D04CF4
{
#if BIT64
*(UInt512*)(dst) = *(UInt512*)(src);
00007FF953D04CBA movdqu xmm0,xmmword ptr [r9]
00007FF953D04CBF movdqu xmmword ptr [rcx],xmm0
00007FF953D04CC3 movdqu xmm0,xmmword ptr [r9+10h]
00007FF953D04CC9 movdqu xmmword ptr [rcx+10h],xmm0
00007FF953D04CCE movdqu xmm0,xmmword ptr [r9+20h]
00007FF953D04CD4 movdqu xmmword ptr [rcx+20h],xmm0
00007FF953D04CD9 movdqu xmm0,xmmword ptr [r9+30h]
00007FF953D04CDF movdqu xmmword ptr [rcx+30h],xmm0
#else
*(UInt256*)(dst) = *(UInt256*)(src);
#endif
src += blockSize;
00007FF953D04CE4 add r9,40h
dst += blockSize;
00007FF953D04CE8 add rcx,40h
for (var iteration = 0ul; iteration < iterations; iteration++)
00007FF953D04CEC inc rax
00007FF953D04CEF cmp rax,rdx
00007FF953D04CF2 jb 00007FF953D04CBA
}
len -= (iterations * blockSize);
00007FF953D04CF4 shl rdx,6
00007FF953D04CF8 sub r8,rdx
}
if (len == 0)
00007FF953D04CFB test r8,r8
00007FF953D04CFE jne 00007FF953D04D05
{
return;
00007FF953D04D00 jmp 00007FF953D04E11
}
var remainingBlocks = (len / sizeof_UInt128);
00007FF953D04D05 mov rax,r8
00007FF953D04D08 xor edx,edx
00007FF953D04D0A mov rax,r8
00007FF953D04D0D shr rax,4
var remainingBytes = (len - (remainingBlocks * sizeof_UInt128));
00007FF953D04D11 mov rdx,rax
00007FF953D04D14 shl rdx,4
00007FF953D04D18 sub r8,rdx
00007FF953D04D1B mov rdx,r8
switch (remainingBlocks)
00007FF953D04D1E cmp rax,4
00007FF953D04D22 jbe 00007FF953D04D29
00007FF953D04D24 jmp 00007FF953D04E11
00007FF953D04D29 movsxd rax,eax
00007FF953D04D2C cmp rax,4
00007FF953D04D30 ja 00007FF953D04D4A
00007FF953D04D32 lea r8,[7FF953D04E5Ch]
00007FF953D04D39 mov r8d,dword ptr [r8+rax*4]
00007FF953D04D3D lea r10,[7FF953D04AC3h]
00007FF953D04D44 add r8,r10
00007FF953D04D47 jmp r8
00007FF953D04D4A jmp 00007FF953D04E11
{
case 0:
{
*(UInt128*)(dst - sizeof_UInt128 + remainingBytes) = *(UInt128*)(src - sizeof_UInt128 + remainingBytes);
00007FF953D04D4F lea r9,[r9+rdx-10h]
00007FF953D04D54 lea rcx,[rcx+rdx-10h]
00007FF953D04D59 movdqu xmm0,xmmword ptr [r9]
00007FF953D04D5E movdqu xmmword ptr [rcx],xmm0
return;
00007FF953D04D62 jmp 00007FF953D04E11
}
case 1:
{
*(UInt128*)(dst) = *(UInt128*)(src);
00007FF953D04D67 movdqu xmm0,xmmword ptr [r9]
00007FF953D04D6C movdqu xmmword ptr [rcx],xmm0
*(UInt128*)(dst + remainingBytes) = *(UInt128*)(src + remainingBytes);
00007FF953D04D70 add r9,rdx
00007FF953D04D73 add rcx,rdx
00007FF953D04D76 movdqu xmm0,xmmword ptr [r9]
00007FF953D04D7B movdqu xmmword ptr [rcx],xmm0
return;
00007FF953D04D7F jmp 00007FF953D04E11
}
#if BIT64
case 2:
{
*(UInt256*)(dst) = *(UInt256*)(src);
00007FF953D04D84 movdqu xmm0,xmmword ptr [r9]
00007FF953D04D89 movdqu xmmword ptr [rcx],xmm0
00007FF953D04D8D movdqu xmm0,xmmword ptr [r9+10h]
00007FF953D04D93 movdqu xmmword ptr [rcx+10h],xmm0
*(UInt128*)(dst + sizeof_UInt128 + remainingBytes) = *(UInt128*)(src + sizeof_UInt128 + remainingBytes);
00007FF953D04D98 lea r9,[r9+rdx+10h]
00007FF953D04D9D lea rcx,[rcx+rdx+10h]
00007FF953D04DA2 movdqu xmm0,xmmword ptr [r9]
00007FF953D04DA7 movdqu xmmword ptr [rcx],xmm0
return;
00007FF953D04DAB jmp 00007FF953D04E11
}
case 3:
{
*(UInt256*)(dst) = *(UInt256*)(src);
00007FF953D04DAD movdqu xmm0,xmmword ptr [r9]
00007FF953D04DB2 movdqu xmmword ptr [rcx],xmm0
00007FF953D04DB6 movdqu xmm0,xmmword ptr [r9+10h]
00007FF953D04DBC movdqu xmmword ptr [rcx+10h],xmm0
*(UInt128*)(dst + sizeof_UInt256) = *(UInt128*)(src + sizeof_UInt256);
00007FF953D04DC1 lea rax,[r9+20h]
00007FF953D04DC5 lea r8,[rcx+20h]
00007FF953D04DC9 movdqu xmm0,xmmword ptr [rax]
00007FF953D04DCD movdqu xmmword ptr [r8],xmm0
*(UInt128*)(dst + sizeof_UInt256 + remainingBytes) = *(UInt128*)(src + sizeof_UInt256 + remainingBytes);
00007FF953D04DD2 lea r9,[r9+rdx+20h]
00007FF953D04DD7 lea rcx,[rcx+rdx+20h]
00007FF953D04DDC movdqu xmm0,xmmword ptr [r9]
00007FF953D04DE1 movdqu xmmword ptr [rcx],xmm0
return;
00007FF953D04DE5 jmp 00007FF953D04E11
}
case 4:
{
*(UInt512*)(dst) = *(UInt512*)(src);
00007FF953D04DE7 movdqu xmm0,xmmword ptr [r9]
00007FF953D04DEC movdqu xmmword ptr [rcx],xmm0
00007FF953D04DF0 movdqu xmm0,xmmword ptr [r9+10h]
00007FF953D04DF6 movdqu xmmword ptr [rcx+10h],xmm0
00007FF953D04DFB movdqu xmm0,xmmword ptr [r9+20h]
00007FF953D04E01 movdqu xmmword ptr [rcx+20h],xmm0
00007FF953D04E06 movdqu xmm0,xmmword ptr [r9+30h]
00007FF953D04E0C movdqu xmmword ptr [rcx+30h],xmm0
00007FF953D04E11 ret
00AA2FA4 push esi
00AA2FA5 push ebx
00AA2FA6 sub esp,18h
00AA2FA9 mov dword ptr [ebp-10h],edx
00AA2FAC mov ebx,ecx
00AA2FAE mov eax,dword ptr [ebp+8]
00AA2FB1 cmp eax,11h
00AA2FB4 jae 00AA3177
00AA2FBA jmp dword ptr [eax*4+0AA3320h]
00AA2FC1 jmp 00AA3311
00AA2FC6 mov eax,dword ptr [ebp-10h]
00AA2FC9 movzx eax,byte ptr [eax]
00AA2FCC mov byte ptr [ebx],al
00AA2FCE jmp 00AA3311
00AA2FD3 mov eax,dword ptr [ebp-10h]
00AA2FD6 movzx eax,word ptr [eax]
00AA2FD9 mov word ptr [ebx],ax
00AA2FDC jmp 00AA3311
00AA2FE1 mov eax,dword ptr [ebp-10h]
00AA2FE4 movzx eax,word ptr [eax]
00AA2FE7 mov word ptr [ebx],ax
00AA2FEA mov eax,dword ptr [ebp-10h]
00AA2FED movzx eax,byte ptr [eax+2]
00AA2FF1 mov byte ptr [ebx+2],al
00AA2FF4 jmp 00AA3311
00AA2FF9 mov eax,dword ptr [ebp-10h]
00AA2FFC mov eax,dword ptr [eax]
00AA2FFE mov dword ptr [ebx],eax
00AA3000 jmp 00AA3311
00AA3005 mov eax,dword ptr [ebp-10h]
00AA3008 mov eax,dword ptr [eax]
00AA300A mov dword ptr [ebx],eax
00AA300C mov eax,dword ptr [ebp-10h]
00AA300F movzx eax,byte ptr [eax+4]
00AA3013 mov byte ptr [ebx+4],al
00AA3016 jmp 00AA3311
00AA301B mov eax,dword ptr [ebp-10h]
00AA301E mov eax,dword ptr [eax]
00AA3020 mov dword ptr [ebx],eax
00AA3022 mov eax,dword ptr [ebp-10h]
00AA3025 movzx eax,word ptr [eax+4]
00AA3029 mov word ptr [ebx+4],ax
00AA302D jmp 00AA3311
00AA3032 mov eax,dword ptr [ebp-10h]
00AA3035 mov eax,dword ptr [eax]
00AA3037 mov dword ptr [ebx],eax
00AA3039 mov eax,dword ptr [ebp-10h]
00AA303C movzx eax,word ptr [eax+4]
00AA3040 mov word ptr [ebx+4],ax
00AA3044 mov eax,dword ptr [ebp-10h]
00AA3047 movzx eax,byte ptr [eax+6]
00AA304B mov byte ptr [ebx+6],al
00AA304E jmp 00AA3311
00AA3053 mov eax,dword ptr [ebp-10h]
00AA3056 mov edx,dword ptr [eax+4]
00AA3059 mov eax,dword ptr [eax]
00AA305B mov dword ptr [ebx],eax
00AA305D mov dword ptr [ebx+4],edx
00AA3060 jmp 00AA3311
00AA3065 mov eax,dword ptr [ebp-10h]
00AA3068 mov edx,dword ptr [eax+4]
00AA306B mov eax,dword ptr [eax]
00AA306D mov dword ptr [ebx],eax
00AA306F mov dword ptr [ebx+4],edx
00AA3072 mov eax,dword ptr [ebp-10h]
00AA3075 movzx eax,byte ptr [eax+8]
00AA3079 mov byte ptr [ebx+8],al
00AA307C jmp 00AA3311
00AA3081 mov eax,dword ptr [ebp-10h]
00AA3084 mov edx,dword ptr [eax+4]
00AA3087 mov eax,dword ptr [eax]
00AA3089 mov dword ptr [ebx],eax
00AA308B mov dword ptr [ebx+4],edx
00AA308E mov eax,dword ptr [ebp-10h]
00AA3091 movzx eax,word ptr [eax+8]
00AA3095 mov word ptr [ebx+8],ax
00AA3099 jmp 00AA3311
00AA309E mov eax,dword ptr [ebp-10h]
00AA30A1 mov edx,dword ptr [eax+4]
00AA30A4 mov eax,dword ptr [eax]
00AA30A6 mov dword ptr [ebx],eax
00AA30A8 mov dword ptr [ebx+4],edx
00AA30AB mov eax,dword ptr [ebp-10h]
00AA30AE movzx eax,word ptr [eax+8]
00AA30B2 mov word ptr [ebx+8],ax
00AA30B6 mov eax,dword ptr [ebp-10h]
00AA30B9 movzx eax,byte ptr [eax+0Ah]
00AA30BD mov byte ptr [ebx+0Ah],al
00AA30C0 jmp 00AA3311
00AA30C5 mov eax,dword ptr [ebp-10h]
00AA30C8 mov edx,dword ptr [eax+4]
00AA30CB mov eax,dword ptr [eax]
00AA30CD mov dword ptr [ebx],eax
00AA30CF mov dword ptr [ebx+4],edx
00AA30D2 mov eax,dword ptr [ebp-10h]
00AA30D5 mov eax,dword ptr [eax+8]
00AA30D8 mov dword ptr [ebx+8],eax
00AA30DB jmp 00AA3311
00AA30E0 mov eax,dword ptr [ebp-10h]
00AA30E3 mov edx,dword ptr [eax+4]
00AA30E6 mov eax,dword ptr [eax]
00AA30E8 mov dword ptr [ebx],eax
00AA30EA mov dword ptr [ebx+4],edx
00AA30ED mov eax,dword ptr [ebp-10h]
00AA30F0 mov eax,dword ptr [eax+8]
00AA30F3 mov dword ptr [ebx+8],eax
00AA30F6 mov eax,dword ptr [ebp-10h]
00AA30F9 movzx eax,byte ptr [eax+0Ch]
00AA30FD mov byte ptr [ebx+0Ch],al
00AA3100 jmp 00AA3311
00AA3105 mov eax,dword ptr [ebp-10h]
00AA3108 mov edx,dword ptr [eax+4]
00AA310B mov eax,dword ptr [eax]
00AA310D mov dword ptr [ebx],eax
00AA310F mov dword ptr [ebx+4],edx
00AA3112 mov eax,dword ptr [ebp-10h]
00AA3115 mov eax,dword ptr [eax+8]
00AA3118 mov dword ptr [ebx+8],eax
00AA311B mov eax,dword ptr [ebp-10h]
00AA311E movzx eax,word ptr [eax+0Ch]
00AA3122 mov word ptr [ebx+0Ch],ax
00AA3126 jmp 00AA3311
00AA312B mov eax,dword ptr [ebp-10h]
00AA312E mov edx,dword ptr [eax+4]
00AA3131 mov eax,dword ptr [eax]
00AA3133 mov dword ptr [ebx],eax
00AA3135 mov dword ptr [ebx+4],edx
00AA3138 mov eax,dword ptr [ebp-10h]
00AA313B mov eax,dword ptr [eax+8]
00AA313E mov dword ptr [ebx+8],eax
00AA3141 mov eax,dword ptr [ebp-10h]
00AA3144 movzx eax,word ptr [eax+0Ch]
00AA3148 mov word ptr [ebx+0Ch],ax
00AA314C mov eax,dword ptr [ebp-10h]
00AA314F movzx eax,byte ptr [eax+0Eh]
00AA3153 mov byte ptr [ebx+0Eh],al
00AA3156 jmp 00AA3311
00AA315B mov edi,ebx
00AA315D mov esi,dword ptr [ebp-10h]
00AA3160 movq xmm0,mmword ptr [esi]
00AA3164 movq mmword ptr [edi],xmm0
00AA3168 movq xmm0,mmword ptr [esi+8]
00AA316D movq mmword ptr [edi+8],xmm0
00AA3172 jmp 00AA3311
00AA3177 cmp dword ptr [ebp+8],20h
00AA317B ja 00AA31B9
00AA317D mov edi,ebx
00AA317F mov esi,dword ptr [ebp-10h]
00AA3182 movq xmm0,mmword ptr [esi]
00AA3186 movq mmword ptr [edi],xmm0
00AA318A movq xmm0,mmword ptr [esi+8]
00AA318F movq mmword ptr [edi+8],xmm0
00AA3194 mov edi,dword ptr [ebp+8]
00AA3197 lea edi,[ebx+edi-10h]
00AA319B mov eax,dword ptr [ebp+8]
00AA319E lea esi,[esi+eax-10h]
00AA31A2 movq xmm0,mmword ptr [esi]
00AA31A6 movq mmword ptr [edi],xmm0
00AA31AA movq xmm0,mmword ptr [esi+8]
00AA31AF movq mmword ptr [edi+8],xmm0
00AA31B4 jmp 00AA3311
00AA31B9 mov eax,ebx
00AA31BB and eax,0Fh
00AA31BE test eax,eax
00AA31C0 je 00AA31FB
00AA31C2 mov edi,ebx
00AA31C4 mov esi,dword ptr [ebp-10h]
00AA31C7 movq xmm0,mmword ptr [esi]
00AA31CB movq mmword ptr [edi],xmm0
00AA31CF movq xmm0,mmword ptr [esi+8]
00AA31D4 movq mmword ptr [edi+8],xmm0
00AA31D9 lea edi,[ebx+eax]
00AA31DC add esi,eax
00AA31DE movq xmm0,mmword ptr [esi]
00AA31E2 movq mmword ptr [edi],xmm0
00AA31E6 movq xmm0,mmword ptr [esi+8]
00AA31EB movq mmword ptr [edi+8],xmm0
00AA31F0 add eax,10h
00AA31F3 sub dword ptr [ebp+8],eax
00AA31F6 add dword ptr [ebp-10h],eax
00AA31F9 add ebx,eax
00AA31FB cmp dword ptr [ebp+8],20h
00AA31FF jbe 00AA328F
00AA3205 mov eax,dword ptr [ebp+8]
00AA3208 shr eax,5
00AA320B mov dword ptr [ebp-14h],eax
00AA320E mov dword ptr [ebp-1Ch],0
00AA3215 mov dword ptr [ebp-18h],0
00AA321C mov dword ptr [ebp-24h],eax
00AA321F mov dword ptr [ebp-20h],0
00AA3226 mov eax,dword ptr [ebp-24h]
00AA3229 mov edx,dword ptr [ebp-20h]
00AA322C test edx,edx
00AA322E ja 00AA3236
00AA3230 jb 00AA3286
00AA3232 test eax,eax
00AA3234 jbe 00AA3286
00AA3236 mov edi,ebx
00AA3238 mov esi,dword ptr [ebp-10h]
00AA323B movq xmm0,mmword ptr [esi]
00AA323F movq mmword ptr [edi],xmm0
00AA3243 movq xmm0,mmword ptr [esi+8]
00AA3248 movq mmword ptr [edi+8],xmm0
00AA324D movq xmm0,mmword ptr [esi+10h]
00AA3252 movq mmword ptr [edi+10h],xmm0
00AA3257 movq xmm0,mmword ptr [esi+18h]
00AA325C movq mmword ptr [edi+18h],xmm0
00AA3261 add dword ptr [ebp-10h],20h
00AA3265 add ebx,20h
00AA3268 mov eax,dword ptr [ebp-1Ch]
00AA326B mov edx,dword ptr [ebp-18h]
00AA326E add eax,1
00AA3271 adc edx,0
00AA3274 mov dword ptr [ebp-1Ch],eax
00AA3277 mov dword ptr [ebp-18h],edx
00AA327A cmp edx,dword ptr [ebp-20h]
00AA327D ja 00AA3286
00AA327F jb 00AA3236
00AA3281 cmp eax,dword ptr [ebp-24h]
00AA3284 jb 00AA3236
00AA3286 mov eax,dword ptr [ebp-14h]
00AA3289 shl eax,5
00AA328C sub dword ptr [ebp+8],eax
00AA328F cmp dword ptr [ebp+8],0
00AA3293 jne 00AA3297
00AA3295 jmp 00AA3311
00AA3297 mov ecx,dword ptr [ebp+8]
00AA329A shr ecx,4
00AA329D mov edx,dword ptr [ebp+8]
00AA32A0 mov eax,ecx
00AA32A2 shl eax,4
00AA32A5 sub edx,eax
00AA32A7 cmp ecx,3
00AA32AA jae 00AA32B3
00AA32AC jmp dword ptr [ecx*4+0AA3364h]
00AA32B3 jmp 00AA3311
00AA32B5 lea edi,[ebx+edx-10h]
00AA32B9 mov esi,dword ptr [ebp-10h]
00AA32BC lea esi,[esi+edx-10h]
00AA32C0 movq xmm0,mmword ptr [esi]
00AA32C4 movq mmword ptr [edi],xmm0
00AA32C8 movq xmm0,mmword ptr [esi+8]
00AA32CD movq mmword ptr [edi+8],xmm0
00AA32D2 jmp 00AA3311
00AA32D4 mov edi,ebx
00AA32D6 mov esi,dword ptr [ebp-10h]
00AA32D9 movq xmm0,mmword ptr [esi]
00AA32DD movq mmword ptr [edi],xmm0
00AA32E1 movq xmm0,mmword ptr [esi+8]
00AA32E6 movq mmword ptr [edi+8],xmm0
00AA32EB add ebx,edx
00AA32ED mov edi,ebx
00AA32EF add esi,edx
00AA32F1 movq xmm0,mmword ptr [esi]
00AA32F5 movq mmword ptr [edi],xmm0
00AA32F9 movq xmm0,mmword ptr [esi+8]
00AA32FE movq mmword ptr [edi+8],xmm0
00AA3303 jmp 00AA3311
00AA3305 mov edi,ebx
00AA3307 mov esi,dword ptr [ebp-10h]
00AA330A mov ecx,8
00AA330F rep movs dword ptr es:[edi],dword ptr [esi]
00AA3311 lea esp,[ebp-0Ch]
00AA3314 pop ebx
00AA3315 pop esi
00AA3316 pop edi
00AA3317 pop ebp
00AA3318 ret 4
switch (len)
00AA2FA4 push esi
00AA2FA5 push ebx
00AA2FA6 sub esp,18h
00AA2FA9 mov dword ptr [ebp-10h],edx
00AA2FAC mov ebx,ecx
00AA2FAE mov eax,dword ptr [ebp+8]
00AA2FB1 cmp eax,11h
00AA2FB4 jae 00AA3177
00AA2FBA jmp dword ptr [eax*4+0AA3320h]
{
case 0:
{
return;
00AA2FC1 jmp 00AA3311
}
case 1:
{
*dst = *src;
00AA2FC6 mov eax,dword ptr [ebp-10h]
00AA2FC9 movzx eax,byte ptr [eax]
00AA2FCC mov byte ptr [ebx],al
return;
00AA2FCE jmp 00AA3311
}
case 2:
{
*(ushort*)(dst) = *(ushort*)(src);
00AA2FD3 mov eax,dword ptr [ebp-10h]
00AA2FD6 movzx eax,word ptr [eax]
00AA2FD9 mov word ptr [ebx],ax
return;
00AA2FDC jmp 00AA3311
}
case 3:
{
*(ushort*)(dst) = *(ushort*)(src);
00AA2FE1 mov eax,dword ptr [ebp-10h]
00AA2FE4 movzx eax,word ptr [eax]
00AA2FE7 mov word ptr [ebx],ax
*(dst + sizeof(ushort)) = *(src + sizeof(ushort));
00AA2FEA mov eax,dword ptr [ebp-10h]
00AA2FED movzx eax,byte ptr [eax+2]
00AA2FF1 mov byte ptr [ebx+2],al
return;
00AA2FF4 jmp 00AA3311
}
case 4:
{
*(uint*)(dst) = *(uint*)(src);
00AA2FF9 mov eax,dword ptr [ebp-10h]
00AA2FFC mov eax,dword ptr [eax]
00AA2FFE mov dword ptr [ebx],eax
return;
00AA3000 jmp 00AA3311
}
case 5:
{
*(uint*)(dst) = *(uint*)(src);
00AA3005 mov eax,dword ptr [ebp-10h]
00AA3008 mov eax,dword ptr [eax]
00AA300A mov dword ptr [ebx],eax
*(dst + sizeof(uint)) = *(src + sizeof(uint));
00AA300C mov eax,dword ptr [ebp-10h]
00AA300F movzx eax,byte ptr [eax+4]
00AA3013 mov byte ptr [ebx+4],al
return;
00AA3016 jmp 00AA3311
}
case 6:
{
*(uint*)(dst) = *(uint*)(src);
00AA301B mov eax,dword ptr [ebp-10h]
00AA301E mov eax,dword ptr [eax]
00AA3020 mov dword ptr [ebx],eax
*(ushort*)(dst + sizeof(uint)) = *(ushort*)(src + sizeof(uint));
00AA3022 mov eax,dword ptr [ebp-10h]
00AA3025 movzx eax,word ptr [eax+4]
00AA3029 mov word ptr [ebx+4],ax
return;
00AA302D jmp 00AA3311
}
case 7:
{
*(uint*)(dst) = *(uint*)(src);
00AA3032 mov eax,dword ptr [ebp-10h]
00AA3035 mov eax,dword ptr [eax]
00AA3037 mov dword ptr [ebx],eax
*(ushort*)(dst + sizeof(uint)) = *(ushort*)(src + sizeof(uint));
00AA3039 mov eax,dword ptr [ebp-10h]
00AA303C movzx eax,word ptr [eax+4]
00AA3040 mov word ptr [ebx+4],ax
*(dst + sizeof(uint) + sizeof(ushort)) = *(src + sizeof(uint) + sizeof(ushort));
00AA3044 mov eax,dword ptr [ebp-10h]
00AA3047 movzx eax,byte ptr [eax+6]
00AA304B mov byte ptr [ebx+6],al
return;
00AA304E jmp 00AA3311
}
case 8:
{
*(ulong*)(dst) = *(ulong*)(src);
00AA3053 mov eax,dword ptr [ebp-10h]
00AA3056 mov edx,dword ptr [eax+4]
00AA3059 mov eax,dword ptr [eax]
00AA305B mov dword ptr [ebx],eax
00AA305D mov dword ptr [ebx+4],edx
return;
00AA3060 jmp 00AA3311
}
case 9:
{
*(ulong*)(dst) = *(ulong*)(src);
00AA3065 mov eax,dword ptr [ebp-10h]
00AA3068 mov edx,dword ptr [eax+4]
00AA306B mov eax,dword ptr [eax]
00AA306D mov dword ptr [ebx],eax
00AA306F mov dword ptr [ebx+4],edx
*(dst + sizeof(ulong)) = *(src + sizeof(ulong));
00AA3072 mov eax,dword ptr [ebp-10h]
00AA3075 movzx eax,byte ptr [eax+8]
00AA3079 mov byte ptr [ebx+8],al
return;
00AA307C jmp 00AA3311
}
case 10:
{
*(ulong*)(dst) = *(ulong*)(src);
00AA3081 mov eax,dword ptr [ebp-10h]
00AA3084 mov edx,dword ptr [eax+4]
00AA3087 mov eax,dword ptr [eax]
00AA3089 mov dword ptr [ebx],eax
00AA308B mov dword ptr [ebx+4],edx
*(ushort*)(dst + sizeof(ulong)) = *(ushort*)(src + sizeof(ulong));
00AA308E mov eax,dword ptr [ebp-10h]
00AA3091 movzx eax,word ptr [eax+8]
00AA3095 mov word ptr [ebx+8],ax
return;
00AA3099 jmp 00AA3311
}
case 11:
{
*(ulong*)(dst) = *(ulong*)(src);
00AA309E mov eax,dword ptr [ebp-10h]
00AA30A1 mov edx,dword ptr [eax+4]
00AA30A4 mov eax,dword ptr [eax]
00AA30A6 mov dword ptr [ebx],eax
00AA30A8 mov dword ptr [ebx+4],edx
*(ushort*)(dst + sizeof(ulong)) = *(ushort*)(src + sizeof(ulong));
00AA30AB mov eax,dword ptr [ebp-10h]
00AA30AE movzx eax,word ptr [eax+8]
00AA30B2 mov word ptr [ebx+8],ax
*(dst + sizeof(ulong) + sizeof(ushort)) = *(src + sizeof(ulong) + sizeof(ushort));
00AA30B6 mov eax,dword ptr [ebp-10h]
00AA30B9 movzx eax,byte ptr [eax+0Ah]
00AA30BD mov byte ptr [ebx+0Ah],al
return;
00AA30C0 jmp 00AA3311
}
case 12:
{
*(ulong*)(dst) = *(ulong*)(src);
00AA30C5 mov eax,dword ptr [ebp-10h]
00AA30C8 mov edx,dword ptr [eax+4]
00AA30CB mov eax,dword ptr [eax]
00AA30CD mov dword ptr [ebx],eax
00AA30CF mov dword ptr [ebx+4],edx
*(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
00AA30D2 mov eax,dword ptr [ebp-10h]
00AA30D5 mov eax,dword ptr [eax+8]
00AA30D8 mov dword ptr [ebx+8],eax
return;
00AA30DB jmp 00AA3311
}
case 13:
{
*(ulong*)(dst) = *(ulong*)(src);
00AA30E0 mov eax,dword ptr [ebp-10h]
00AA30E3 mov edx,dword ptr [eax+4]
00AA30E6 mov eax,dword ptr [eax]
00AA30E8 mov dword ptr [ebx],eax
00AA30EA mov dword ptr [ebx+4],edx
*(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
00AA30ED mov eax,dword ptr [ebp-10h]
00AA30F0 mov eax,dword ptr [eax+8]
00AA30F3 mov dword ptr [ebx+8],eax
*(dst + sizeof(ulong) + sizeof(uint)) = *(src + sizeof(ulong) + sizeof(uint));
00AA30F6 mov eax,dword ptr [ebp-10h]
00AA30F9 movzx eax,byte ptr [eax+0Ch]
00AA30FD mov byte ptr [ebx+0Ch],al
return;
00AA3100 jmp 00AA3311
}
case 14:
{
*(ulong*)(dst) = *(ulong*)(src);
00AA3105 mov eax,dword ptr [ebp-10h]
00AA3108 mov edx,dword ptr [eax+4]
00AA310B mov eax,dword ptr [eax]
00AA310D mov dword ptr [ebx],eax
00AA310F mov dword ptr [ebx+4],edx
*(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
00AA3112 mov eax,dword ptr [ebp-10h]
00AA3115 mov eax,dword ptr [eax+8]
00AA3118 mov dword ptr [ebx+8],eax
*(ushort*)(dst + sizeof(ulong) + sizeof(uint)) = *(ushort*)(src + sizeof(ulong) + sizeof(uint));
00AA311B mov eax,dword ptr [ebp-10h]
00AA311E movzx eax,word ptr [eax+0Ch]
00AA3122 mov word ptr [ebx+0Ch],ax
return;
00AA3126 jmp 00AA3311
}
case 15:
{
*(ulong*)(dst) = *(ulong*)(src);
00AA312B mov eax,dword ptr [ebp-10h]
00AA312E mov edx,dword ptr [eax+4]
00AA3131 mov eax,dword ptr [eax]
00AA3133 mov dword ptr [ebx],eax
00AA3135 mov dword ptr [ebx+4],edx
*(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
00AA3138 mov eax,dword ptr [ebp-10h]
00AA313B mov eax,dword ptr [eax+8]
00AA313E mov dword ptr [ebx+8],eax
*(ushort*)(dst + sizeof(ulong) + sizeof(uint)) = *(ushort*)(src + sizeof(ulong) + sizeof(uint));
00AA3141 mov eax,dword ptr [ebp-10h]
00AA3144 movzx eax,word ptr [eax+0Ch]
00AA3148 mov word ptr [ebx+0Ch],ax
*(dst + sizeof(ulong) + sizeof(uint) + sizeof(ushort)) = *(src + sizeof(ulong) + sizeof(uint) + sizeof(ushort));
00AA314C mov eax,dword ptr [ebp-10h]
00AA314F movzx eax,byte ptr [eax+0Eh]
00AA3153 mov byte ptr [ebx+0Eh],al
return;
00AA3156 jmp 00AA3311
}
case 16:
{
*(UInt128*)(dst) = *(UInt128*)(src);
00AA315B mov edi,ebx
00AA315D mov esi,dword ptr [ebp-10h]
00AA3160 movq xmm0,mmword ptr [esi]
00AA3164 movq mmword ptr [edi],xmm0
00AA3168 movq xmm0,mmword ptr [esi+8]
00AA316D movq mmword ptr [edi+8],xmm0
return;
00AA3172 jmp 00AA3311
}
}
if (len <= 32)
00AA3177 cmp dword ptr [ebp+8],20h
00AA317B ja 00AA31B9
{
// We can do this in two writes. Note that one or both of these writes may be misaligned
*(UInt128*)(dst) = *(UInt128*)(src);
00AA317D mov edi,ebx
00AA317F mov esi,dword ptr [ebp-10h]
00AA3182 movq xmm0,mmword ptr [esi]
00AA3186 movq mmword ptr [edi],xmm0
00AA318A movq xmm0,mmword ptr [esi+8]
00AA318F movq mmword ptr [edi+8],xmm0
*(UInt128*)(dst + len - sizeof_UInt128) = *(UInt128*)(src + len - sizeof_UInt128);
00AA3194 mov edi,dword ptr [ebp+8]
00AA3197 lea edi,[ebx+edi-10h]
00AA319B mov eax,dword ptr [ebp+8]
00AA319E lea esi,[esi+eax-10h]
00AA31A2 movq xmm0,mmword ptr [esi]
00AA31A6 movq mmword ptr [edi],xmm0
00AA31AA movq xmm0,mmword ptr [esi+8]
00AA31AF movq mmword ptr [edi+8],xmm0
return;
00AA31B4 jmp 00AA3311
}
var misalignment = ((nuint)(dst) % sizeof_UInt128);
00AA31B9 mov eax,ebx
00AA31BB and eax,0Fh
if (misalignment != 0)
00AA31BE test eax,eax
if (misalignment != 0)
00AA31C0 je 00AA31FB
{
*(UInt128*)(dst) = *(UInt128*)(src);
00AA31C2 mov edi,ebx
00AA31C4 mov esi,dword ptr [ebp-10h]
00AA31C7 movq xmm0,mmword ptr [esi]
00AA31CB movq mmword ptr [edi],xmm0
00AA31CF movq xmm0,mmword ptr [esi+8]
00AA31D4 movq mmword ptr [edi+8],xmm0
*(UInt128*)(dst + misalignment) = *(UInt128*)(src + misalignment);
00AA31D9 lea edi,[ebx+eax]
00AA31DC add esi,eax
00AA31DE movq xmm0,mmword ptr [esi]
00AA31E2 movq mmword ptr [edi],xmm0
00AA31E6 movq xmm0,mmword ptr [esi+8]
00AA31EB movq mmword ptr [edi+8],xmm0
var initialOffset = (sizeof_UInt128 + misalignment);
00AA31F0 add eax,10h
len -= initialOffset;
00AA31F3 sub dword ptr [ebp+8],eax
src += initialOffset;
00AA31F6 add dword ptr [ebp-10h],eax
dst += initialOffset;
00AA31F9 add ebx,eax
}
#if BIT64
const nuint blockSize = sizeof_UInt512;
#else
const nuint blockSize = sizeof_UInt256;
#endif
if (len > blockSize)
00AA31FB cmp dword ptr [ebp+8],20h
00AA31FF jbe 00AA328F
{
var iterations = (len / blockSize);
00AA3205 mov eax,dword ptr [ebp+8]
00AA3208 shr eax,5
00AA320B mov dword ptr [ebp-14h],eax
for (var iteration = 0ul; iteration < iterations; iteration++)
00AA320E mov dword ptr [ebp-1Ch],0
00AA3215 mov dword ptr [ebp-18h],0
for (var iteration = 0ul; iteration < iterations; iteration++)
00AA321C mov dword ptr [ebp-24h],eax
00AA321F mov dword ptr [ebp-20h],0
00AA3226 mov eax,dword ptr [ebp-24h]
00AA3229 mov edx,dword ptr [ebp-20h]
00AA322C test edx,edx
00AA322E ja 00AA3236
00AA3230 jb 00AA3286
00AA3232 test eax,eax
00AA3234 jbe 00AA3286
{
#if BIT64
*(UInt512*)(dst) = *(UInt512*)(src);
#else
*(UInt256*)(dst) = *(UInt256*)(src);
00AA3236 mov edi,ebx
{
#if BIT64
*(UInt512*)(dst) = *(UInt512*)(src);
#else
*(UInt256*)(dst) = *(UInt256*)(src);
00AA3238 mov esi,dword ptr [ebp-10h]
00AA323B movq xmm0,mmword ptr [esi]
00AA323F movq mmword ptr [edi],xmm0
00AA3243 movq xmm0,mmword ptr [esi+8]
00AA3248 movq mmword ptr [edi+8],xmm0
00AA324D movq xmm0,mmword ptr [esi+10h]
00AA3252 movq mmword ptr [edi+10h],xmm0
00AA3257 movq xmm0,mmword ptr [esi+18h]
00AA325C movq mmword ptr [edi+18h],xmm0
#endif
src += blockSize;
00AA3261 add dword ptr [ebp-10h],20h
dst += blockSize;
00AA3265 add ebx,20h
for (var iteration = 0ul; iteration < iterations; iteration++)
00AA3268 mov eax,dword ptr [ebp-1Ch]
00AA326B mov edx,dword ptr [ebp-18h]
00AA326E add eax,1
00AA3271 adc edx,0
00AA3274 mov dword ptr [ebp-1Ch],eax
00AA3277 mov dword ptr [ebp-18h],edx
00AA327A cmp edx,dword ptr [ebp-20h]
00AA327D ja 00AA3286
00AA327F jb 00AA3236
00AA3281 cmp eax,dword ptr [ebp-24h]
00AA3284 jb 00AA3236
}
len -= (iterations * blockSize);
00AA3286 mov eax,dword ptr [ebp-14h]
00AA3289 shl eax,5
00AA328C sub dword ptr [ebp+8],eax
}
if (len == 0)
00AA328F cmp dword ptr [ebp+8],0
00AA3293 jne 00AA3297
{
return;
00AA3295 jmp 00AA3311
}
var remainingBlocks = (len / sizeof_UInt128);
00AA3297 mov ecx,dword ptr [ebp+8]
00AA329A shr ecx,4
var remainingBytes = (len - (remainingBlocks * sizeof_UInt128));
00AA329D mov edx,dword ptr [ebp+8]
00AA32A0 mov eax,ecx
00AA32A2 shl eax,4
00AA32A5 sub edx,eax
switch (remainingBlocks)
00AA32A7 cmp ecx,3
00AA32AA jae 00AA32B3
00AA32AC jmp dword ptr [ecx*4+0AA3364h]
00AA32B3 jmp 00AA3311
{
case 0:
{
*(UInt128*)(dst - sizeof_UInt128 + remainingBytes) = *(UInt128*)(src - sizeof_UInt128 + remainingBytes);
00AA32B5 lea edi,[ebx+edx-10h]
00AA32B9 mov esi,dword ptr [ebp-10h]
00AA32BC lea esi,[esi+edx-10h]
00AA32C0 movq xmm0,mmword ptr [esi]
00AA32C4 movq mmword ptr [edi],xmm0
00AA32C8 movq xmm0,mmword ptr [esi+8]
00AA32CD movq mmword ptr [edi+8],xmm0
return;
00AA32D2 jmp 00AA3311
}
case 1:
{
*(UInt128*)(dst) = *(UInt128*)(src);
00AA32D4 mov edi,ebx
00AA32D6 mov esi,dword ptr [ebp-10h]
00AA32D9 movq xmm0,mmword ptr [esi]
00AA32DD movq mmword ptr [edi],xmm0
00AA32E1 movq xmm0,mmword ptr [esi+8]
00AA32E6 movq mmword ptr [edi+8],xmm0
*(UInt128*)(dst + remainingBytes) = *(UInt128*)(src + remainingBytes);
00AA32EB add ebx,edx
00AA32ED mov edi,ebx
00AA32EF add esi,edx
00AA32F1 movq xmm0,mmword ptr [esi]
00AA32F5 movq mmword ptr [edi],xmm0
00AA32F9 movq xmm0,mmword ptr [esi+8]
00AA32FE movq mmword ptr [edi+8],xmm0
return;
00AA3303 jmp 00AA3311
}
#if BIT64
case 2:
{
*(UInt256*)(dst) = *(UInt256*)(src);
*(UInt128*)(dst + sizeof_UInt128 + remainingBytes) = *(UInt128*)(src + sizeof_UInt128 + remainingBytes);
return;
}
case 3:
{
*(UInt256*)(dst) = *(UInt256*)(src);
*(UInt128*)(dst + sizeof_UInt256) = *(UInt128*)(src + sizeof_UInt256);
*(UInt128*)(dst + sizeof_UInt256 + remainingBytes) = *(UInt128*)(src + sizeof_UInt256 + remainingBytes);
return;
}
case 4:
{
*(UInt512*)(dst) = *(UInt512*)(src);
return;
}
#else
case 2:
{
*(UInt256*)(dst) = *(UInt256*)(src);
00AA3305 mov edi,ebx
00AA3307 mov esi,dword ptr [ebp-10h]
00AA330A mov ecx,8
00AA330F rep movs dword ptr es:[edi],dword ptr [esi]
00AA3311 lea esp,[ebp-0Ch]
00AA3314 pop ebx
00AA3315 pop esi
00AA3316 pop edi
00AA3317 pop ebp
00AA3318 ret 4
#if BIT64
using nuint = System.UInt64;
#else // BIT64
using nuint = System.UInt32;
#endif // BIT64
// Size in bytes of UInt128: two ulong values, i.e. 16 bytes.
const int sizeof_UInt128 = sizeof(ulong) * 2;
// 16-byte block of raw memory. CopyUp moves data 16 bytes at a time by assigning through UInt128 pointers;
// the fields exist to give the struct its size and are not read individually by the copy routines.
[StructLayout(LayoutKind.Sequential, Pack = 16, Size = sizeof_UInt128)]
struct UInt128
{
// First 8 bytes of the block (sequential layout, declared first).
public ulong loPart;
// Last 8 bytes of the block.
public ulong hiPart;
}
// Size in bytes of UInt256: two UInt128 blocks, i.e. 32 bytes.
const int sizeof_UInt256 = sizeof_UInt128 * 2;
// 32-byte block of raw memory built from two UInt128 halves. CopyUp assigns through UInt256 pointers to
// move 32 bytes per statement (it is the main-loop block when BIT64 is not defined).
[StructLayout(LayoutKind.Sequential, Pack = 16, Size = sizeof_UInt256)]
struct UInt256
{
// First 16 bytes of the block.
public UInt128 loPart;
// Last 16 bytes of the block.
public UInt128 hiPart;
}
// The 64-byte block is only declared when BIT64 is defined; CopyUp uses it as its main-loop block there.
#if BIT64
// Size in bytes of UInt512: two UInt256 blocks, i.e. 64 bytes.
const int sizeof_UInt512 = sizeof_UInt256 * 2;
// 64-byte block of raw memory built from two UInt256 halves.
[StructLayout(LayoutKind.Sequential, Pack = 16, Size = sizeof_UInt512)]
struct UInt512
{
// First 32 bytes of the block.
public UInt256 loPart;
// Last 32 bytes of the block.
public UInt256 hiPart;
}
#endif
/// <summary>
/// Copies <paramref name="len"/> bytes from <paramref name="src"/> to <paramref name="dst"/>.
/// The two ranges are allowed to overlap in either direction.
/// </summary>
/// <param name="dst">Address of the first destination byte.</param>
/// <param name="src">Address of the first source byte.</param>
/// <param name="len">Number of bytes to copy.</param>
[CLSCompliant(false)]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[SecurityCritical]
internal unsafe static void Memmove(byte* dst, byte* src, nuint len)
{
    // CopyUp covers odd lengths with 16-byte copies that overlap each other, so it reads source bytes
    // after it has already stored to nearby destination bytes. That is only correct when the ranges do
    // not overlap at all -- including the case where dst lies below src (e.g. dst == src - 1, len == 17
    // would otherwise produce dst[1] == src[2]). Any overlap therefore takes the Buffer.MemoryCopy path,
    // which the backward-overlap case already relied on.
    var dstStartsInsideSrc = (src < dst) && ((src + len) > dst);
    var srcStartsInsideDst = (dst < src) && ((dst + len) > src);

    if (dstStartsInsideSrc || srcStartsInsideDst)
    {
        CopyDown(dst, src, len);
    }
    else
    {
        CopyUp(dst, src, len);
    }
}
/// <summary>
/// Copies <paramref name="len"/> bytes from <paramref name="src"/> to <paramref name="dst"/>, working
/// from the lowest address upwards.
/// </summary>
/// <remarks>
/// Lengths 0-16 are dispatched through a switch to fixed-size stores. Longer copies use 16-byte and
/// larger struct assignments and cover lengths that are not a multiple of 16 with a final 16-byte copy
/// that overlaps bytes already written. Because of those overlapping copies, source bytes are read after
/// nearby destination bytes were stored, so the two ranges must not overlap.
/// </remarks>
/// <param name="dst">Address of the first destination byte.</param>
/// <param name="src">Address of the first source byte.</param>
/// <param name="len">Number of bytes to copy.</param>
private unsafe static void CopyUp(byte* dst, byte* src, nuint len)
{
    // Small copies: one case per length, built from 8/4/2/1-byte stores.
    switch (len)
    {
        case 0:
            return;
        case 1:
            *dst = *src;
            return;
        case 2:
            *(ushort*)(dst) = *(ushort*)(src);
            return;
        case 3:
            *(ushort*)(dst) = *(ushort*)(src);
            *(dst + sizeof(ushort)) = *(src + sizeof(ushort));
            return;
        case 4:
            *(uint*)(dst) = *(uint*)(src);
            return;
        case 5:
            *(uint*)(dst) = *(uint*)(src);
            *(dst + sizeof(uint)) = *(src + sizeof(uint));
            return;
        case 6:
            *(uint*)(dst) = *(uint*)(src);
            *(ushort*)(dst + sizeof(uint)) = *(ushort*)(src + sizeof(uint));
            return;
        case 7:
            *(uint*)(dst) = *(uint*)(src);
            *(ushort*)(dst + sizeof(uint)) = *(ushort*)(src + sizeof(uint));
            *(dst + sizeof(uint) + sizeof(ushort)) = *(src + sizeof(uint) + sizeof(ushort));
            return;
        case 8:
            *(ulong*)(dst) = *(ulong*)(src);
            return;
        case 9:
            *(ulong*)(dst) = *(ulong*)(src);
            *(dst + sizeof(ulong)) = *(src + sizeof(ulong));
            return;
        case 10:
            *(ulong*)(dst) = *(ulong*)(src);
            *(ushort*)(dst + sizeof(ulong)) = *(ushort*)(src + sizeof(ulong));
            return;
        case 11:
            *(ulong*)(dst) = *(ulong*)(src);
            *(ushort*)(dst + sizeof(ulong)) = *(ushort*)(src + sizeof(ulong));
            *(dst + sizeof(ulong) + sizeof(ushort)) = *(src + sizeof(ulong) + sizeof(ushort));
            return;
        case 12:
            *(ulong*)(dst) = *(ulong*)(src);
            *(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
            return;
        case 13:
            *(ulong*)(dst) = *(ulong*)(src);
            *(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
            *(dst + sizeof(ulong) + sizeof(uint)) = *(src + sizeof(ulong) + sizeof(uint));
            return;
        case 14:
            *(ulong*)(dst) = *(ulong*)(src);
            *(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
            *(ushort*)(dst + sizeof(ulong) + sizeof(uint)) = *(ushort*)(src + sizeof(ulong) + sizeof(uint));
            return;
        case 15:
            *(ulong*)(dst) = *(ulong*)(src);
            *(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
            *(ushort*)(dst + sizeof(ulong) + sizeof(uint)) = *(ushort*)(src + sizeof(ulong) + sizeof(uint));
            *(dst + sizeof(ulong) + sizeof(uint) + sizeof(ushort)) = *(src + sizeof(ulong) + sizeof(uint) + sizeof(ushort));
            return;
        case 16:
            *(UInt128*)(dst) = *(UInt128*)(src);
            return;
    }

    if (len <= 32)
    {
        // 17-32 bytes: one 16-byte copy for the head and one that ends exactly on the last byte.
        // The two copies may overlap each other, and either may be misaligned.
        *(UInt128*)(dst) = *(UInt128*)(src);
        *(UInt128*)(dst + len - sizeof_UInt128) = *(UInt128*)(src + len - sizeof_UInt128);
        return;
    }

    var misalignment = ((nuint)(dst) % sizeof_UInt128);

    if (misalignment != 0)
    {
        // Copy one unaligned 16-byte block, then step forward only as far as the next 16-byte boundary
        // of dst. (Stepping by 16 + misalignment instead would leave dst at 2 * misalignment mod 16,
        // which is aligned only when misalignment is 8.) The bytes between that boundary and the end of
        // this block are written a second time by the copies below. len is above 32 here and the step
        // is at most 15, so at least 18 bytes remain afterwards.
        *(UInt128*)(dst) = *(UInt128*)(src);

        var initialOffset = (sizeof_UInt128 - misalignment);
        len -= initialOffset;
        src += initialOffset;
        dst += initialOffset;
    }

#if BIT64
    const nuint blockSize = sizeof_UInt512;
#else
    const nuint blockSize = sizeof_UInt256;
#endif

    if (len > blockSize)
    {
        var iterations = (len / blockSize);

        // The counter has the same width as len; a ulong counter would force 64-bit compare/add
        // sequences in 32-bit builds.
        for (nuint iteration = 0; iteration < iterations; iteration++)
        {
#if BIT64
            *(UInt512*)(dst) = *(UInt512*)(src);
#else
            *(UInt256*)(dst) = *(UInt256*)(src);
#endif
            src += blockSize;
            dst += blockSize;
        }

        len -= (iterations * blockSize);
    }

    if (len == 0)
    {
        return;
    }

    // What is left is at most one block: some whole 16-byte units plus 0-15 trailing bytes.
    var remainingBlocks = (len / sizeof_UInt128);
    var remainingBytes = (len - (remainingBlocks * sizeof_UInt128));

    switch (remainingBlocks)
    {
        case 0:
            // Fewer than 16 bytes remain: copy the 16 bytes that end on the final byte. This reaches back
            // into bytes that were already copied, which is valid because a remainder below 16 can only
            // occur after the block loop has advanced both pointers by at least one block.
            *(UInt128*)(dst - sizeof_UInt128 + remainingBytes) = *(UInt128*)(src - sizeof_UInt128 + remainingBytes);
            return;
        case 1:
            // 16-31 bytes: one block at the start, one ending on the final byte.
            *(UInt128*)(dst) = *(UInt128*)(src);
            *(UInt128*)(dst + remainingBytes) = *(UInt128*)(src + remainingBytes);
            return;
#if BIT64
        case 2:
            // 32-47 bytes.
            *(UInt256*)(dst) = *(UInt256*)(src);
            *(UInt128*)(dst + sizeof_UInt128 + remainingBytes) = *(UInt128*)(src + sizeof_UInt128 + remainingBytes);
            return;
        case 3:
            // 48-63 bytes.
            *(UInt256*)(dst) = *(UInt256*)(src);
            *(UInt128*)(dst + sizeof_UInt256) = *(UInt128*)(src + sizeof_UInt256);
            *(UInt128*)(dst + sizeof_UInt256 + remainingBytes) = *(UInt128*)(src + sizeof_UInt256 + remainingBytes);
            return;
        case 4:
            // Exactly 64 bytes (the loop above only runs for more than one block).
            *(UInt512*)(dst) = *(UInt512*)(src);
            return;
#else
        case 2:
            // Exactly 32 bytes (the loop above only runs for more than one block).
            *(UInt256*)(dst) = *(UInt256*)(src);
            return;
#endif
    }
}
/// <summary>
/// Copies <paramref name="len"/> bytes from <paramref name="src"/> to <paramref name="dst"/> by handing
/// the whole request to <see cref="Buffer.MemoryCopy(void*, void*, long, long)"/>, with the destination
/// capacity stated as exactly <paramref name="len"/>.
/// </summary>
/// <remarks>
/// NOTE(review): the name suggests a high-to-low copy, but the copy direction is decided inside
/// Buffer.MemoryCopy, not here.
/// </remarks>
private unsafe static void CopyDown(byte* dst, byte* src, nuint len)
{
    var byteCount = len;

    Buffer.MemoryCopy(
        source: src,
        destination: dst,
        destinationSizeInBytes: byteCount,
        sourceBytesToCopy: byteCount);
}
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Security;
#if BIT64
using nuint = System.UInt64;
#else // BIT64
using nuint = System.UInt32;
#endif // BIT64
namespace ConsoleApplication1
{
    /// <summary>
    /// Console harness for a managed <c>Memmove</c> built from fixed-size struct copies. <c>Main</c> copies
    /// buffers of many sizes and offsets and breaks into the debugger on the first byte that differs.
    /// </summary>
    unsafe class Program
    {
        // Size in bytes of UInt128: two ulong values, i.e. 16 bytes.
        const int sizeof_UInt128 = sizeof(ulong) * 2;

        // 16-byte block of raw memory; CopyUp moves data by assigning through pointers to these structs.
        [StructLayout(LayoutKind.Sequential, Pack = 16, Size = sizeof_UInt128)]
        struct UInt128
        {
            public ulong loPart;
            public ulong hiPart;
        }

        // Size in bytes of UInt256: two UInt128 blocks, i.e. 32 bytes.
        const int sizeof_UInt256 = sizeof_UInt128 * 2;

        // 32-byte block of raw memory (the main-loop block when BIT64 is not defined).
        [StructLayout(LayoutKind.Sequential, Pack = 16, Size = sizeof_UInt256)]
        struct UInt256
        {
            public UInt128 loPart;
            public UInt128 hiPart;
        }

#if BIT64
        // Size in bytes of UInt512: two UInt256 blocks, i.e. 64 bytes.
        const int sizeof_UInt512 = sizeof_UInt256 * 2;

        // 64-byte block of raw memory (the main-loop block when BIT64 is defined).
        [StructLayout(LayoutKind.Sequential, Pack = 16, Size = sizeof_UInt512)]
        struct UInt512
        {
            public UInt256 loPart;
            public UInt256 hiPart;
        }
#endif

        // One generator for the whole run, so consecutive buffers are not filled from identically seeded
        // generators created within the same clock tick.
        static readonly Random s_random = new Random();

        /// <summary>
        /// Runs every combination of length 0-1023 and source/destination offset 0-15 through
        /// <see cref="Memmove"/> on separately allocated buffers, then checks overlapping moves.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            for (var byteCount = 0; byteCount < 1024; byteCount++)
            {
                for (var sourceOffset = 0; sourceOffset < 16; sourceOffset++)
                {
                    for (var destinationOffset = 0; destinationOffset < 16; destinationOffset++)
                    {
                        var source = IntPtr.Zero;
                        var destination = IntPtr.Zero;

                        try
                        {
                            // The offsets shift the start of each range inside its allocation so that
                            // different alignments of src and dst are exercised.
                            source = Marshal.AllocHGlobal(byteCount + sourceOffset);
                            RandomizeMemory(source, byteCount + sourceOffset);
                            var pSource = (byte*)(source.ToPointer()) + sourceOffset;

                            destination = Marshal.AllocHGlobal(byteCount + destinationOffset);
                            ZeroMemory(destination, byteCount + destinationOffset);
                            var pDestination = (byte*)(destination.ToPointer()) + destinationOffset;

                            Memmove(pDestination, pSource, (nuint)(byteCount));
                            ValidateMemory((IntPtr)pSource, (IntPtr)pDestination, byteCount);
                        }
                        finally
                        {
                            if (source != IntPtr.Zero)
                            {
                                Marshal.FreeHGlobal(source);
                            }

                            if (destination != IntPtr.Zero)
                            {
                                Marshal.FreeHGlobal(destination);
                            }
                        }
                    }
                }
            }

            ValidateOverlappingMoves();
        }

        /// <summary>
        /// Moves ranges within a single buffer, with the destination both below and above the source,
        /// and compares the result with a snapshot of the source bytes taken before the move.
        /// </summary>
        static void ValidateOverlappingMoves()
        {
            const int maxByteCount = 128;
            const int maxDistance = 32;
            const int bufferSize = maxByteCount + maxDistance;

            var buffer = IntPtr.Zero;
            var expected = IntPtr.Zero;

            try
            {
                buffer = Marshal.AllocHGlobal(bufferSize);
                expected = Marshal.AllocHGlobal(bufferSize);

                var pBuffer = (byte*)(buffer.ToPointer());
                var pExpected = (byte*)(expected.ToPointer());

                for (var byteCount = 1; byteCount <= maxByteCount; byteCount++)
                {
                    for (var distance = 1; distance <= maxDistance; distance++)
                    {
                        // Destination starts 'distance' bytes below the source.
                        RandomizeMemory(buffer, bufferSize);
                        Buffer.MemoryCopy(pBuffer + distance, pExpected, byteCount, byteCount);
                        Memmove(pBuffer, pBuffer + distance, (nuint)(byteCount));
                        ValidateMemory(expected, buffer, byteCount);

                        // Destination starts 'distance' bytes above the source.
                        RandomizeMemory(buffer, bufferSize);
                        Buffer.MemoryCopy(pBuffer, pExpected, byteCount, byteCount);
                        Memmove(pBuffer + distance, pBuffer, (nuint)(byteCount));
                        ValidateMemory(expected, (IntPtr)(pBuffer + distance), byteCount);
                    }
                }
            }
            finally
            {
                if (buffer != IntPtr.Zero)
                {
                    Marshal.FreeHGlobal(buffer);
                }

                if (expected != IntPtr.Zero)
                {
                    Marshal.FreeHGlobal(expected);
                }
            }
        }

        /// <summary>
        /// Copies <paramref name="len"/> bytes from <paramref name="src"/> to <paramref name="dst"/>.
        /// The two ranges are allowed to overlap in either direction.
        /// </summary>
        /// <param name="dst">Address of the first destination byte.</param>
        /// <param name="src">Address of the first source byte.</param>
        /// <param name="len">Number of bytes to copy.</param>
        [CLSCompliant(false)]
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        [SecurityCritical]
        internal unsafe static void Memmove(byte* dst, byte* src, nuint len)
        {
            // CopyUp covers odd lengths with 16-byte copies that overlap each other, so it reads source
            // bytes after it has already stored to nearby destination bytes. That is only correct when
            // the ranges do not overlap at all -- including the case where dst lies below src (e.g.
            // dst == src - 1, len == 17 would otherwise produce dst[1] == src[2]). Any overlap therefore
            // takes the Buffer.MemoryCopy path.
            var dstStartsInsideSrc = (src < dst) && ((src + len) > dst);
            var srcStartsInsideDst = (dst < src) && ((dst + len) > src);

            if (dstStartsInsideSrc || srcStartsInsideDst)
            {
                CopyDown(dst, src, len);
            }
            else
            {
                CopyUp(dst, src, len);
            }
        }

        /// <summary>
        /// Copies <paramref name="len"/> bytes from <paramref name="src"/> to <paramref name="dst"/>,
        /// working from the lowest address upwards. The two ranges must not overlap, because odd lengths
        /// are covered with 16-byte copies that overlap bytes already written.
        /// </summary>
        /// <param name="dst">Address of the first destination byte.</param>
        /// <param name="src">Address of the first source byte.</param>
        /// <param name="len">Number of bytes to copy.</param>
        private unsafe static void CopyUp(byte* dst, byte* src, nuint len)
        {
            // Small copies: one case per length, built from 8/4/2/1-byte stores.
            switch (len)
            {
                case 0:
                    return;
                case 1:
                    *dst = *src;
                    return;
                case 2:
                    *(ushort*)(dst) = *(ushort*)(src);
                    return;
                case 3:
                    *(ushort*)(dst) = *(ushort*)(src);
                    *(dst + sizeof(ushort)) = *(src + sizeof(ushort));
                    return;
                case 4:
                    *(uint*)(dst) = *(uint*)(src);
                    return;
                case 5:
                    *(uint*)(dst) = *(uint*)(src);
                    *(dst + sizeof(uint)) = *(src + sizeof(uint));
                    return;
                case 6:
                    *(uint*)(dst) = *(uint*)(src);
                    *(ushort*)(dst + sizeof(uint)) = *(ushort*)(src + sizeof(uint));
                    return;
                case 7:
                    *(uint*)(dst) = *(uint*)(src);
                    *(ushort*)(dst + sizeof(uint)) = *(ushort*)(src + sizeof(uint));
                    *(dst + sizeof(uint) + sizeof(ushort)) = *(src + sizeof(uint) + sizeof(ushort));
                    return;
                case 8:
                    *(ulong*)(dst) = *(ulong*)(src);
                    return;
                case 9:
                    *(ulong*)(dst) = *(ulong*)(src);
                    *(dst + sizeof(ulong)) = *(src + sizeof(ulong));
                    return;
                case 10:
                    *(ulong*)(dst) = *(ulong*)(src);
                    *(ushort*)(dst + sizeof(ulong)) = *(ushort*)(src + sizeof(ulong));
                    return;
                case 11:
                    *(ulong*)(dst) = *(ulong*)(src);
                    *(ushort*)(dst + sizeof(ulong)) = *(ushort*)(src + sizeof(ulong));
                    *(dst + sizeof(ulong) + sizeof(ushort)) = *(src + sizeof(ulong) + sizeof(ushort));
                    return;
                case 12:
                    *(ulong*)(dst) = *(ulong*)(src);
                    *(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
                    return;
                case 13:
                    *(ulong*)(dst) = *(ulong*)(src);
                    *(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
                    *(dst + sizeof(ulong) + sizeof(uint)) = *(src + sizeof(ulong) + sizeof(uint));
                    return;
                case 14:
                    *(ulong*)(dst) = *(ulong*)(src);
                    *(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
                    *(ushort*)(dst + sizeof(ulong) + sizeof(uint)) = *(ushort*)(src + sizeof(ulong) + sizeof(uint));
                    return;
                case 15:
                    *(ulong*)(dst) = *(ulong*)(src);
                    *(uint*)(dst + sizeof(ulong)) = *(uint*)(src + sizeof(ulong));
                    *(ushort*)(dst + sizeof(ulong) + sizeof(uint)) = *(ushort*)(src + sizeof(ulong) + sizeof(uint));
                    *(dst + sizeof(ulong) + sizeof(uint) + sizeof(ushort)) = *(src + sizeof(ulong) + sizeof(uint) + sizeof(ushort));
                    return;
                case 16:
                    *(UInt128*)(dst) = *(UInt128*)(src);
                    return;
            }

            if (len <= 32)
            {
                // 17-32 bytes: one 16-byte copy for the head and one that ends exactly on the last byte.
                // The two copies may overlap each other, and either may be misaligned.
                *(UInt128*)(dst) = *(UInt128*)(src);
                *(UInt128*)(dst + len - sizeof_UInt128) = *(UInt128*)(src + len - sizeof_UInt128);
                return;
            }

            var misalignment = ((nuint)(dst) % sizeof_UInt128);

            if (misalignment != 0)
            {
                // Copy one unaligned 16-byte block, then step forward only as far as the next 16-byte
                // boundary of dst. (Stepping by 16 + misalignment instead would leave dst at
                // 2 * misalignment mod 16, which is aligned only when misalignment is 8.) The bytes
                // between that boundary and the end of this block are written a second time by the
                // copies below. len is above 32 here and the step is at most 15, so at least 18 bytes
                // remain afterwards.
                *(UInt128*)(dst) = *(UInt128*)(src);

                var initialOffset = (sizeof_UInt128 - misalignment);
                len -= initialOffset;
                src += initialOffset;
                dst += initialOffset;
            }

#if BIT64
            const nuint blockSize = sizeof_UInt512;
#else
            const nuint blockSize = sizeof_UInt256;
#endif

            if (len > blockSize)
            {
                var iterations = (len / blockSize);

                // The counter has the same width as len; a ulong counter would force 64-bit compare/add
                // sequences in 32-bit builds.
                for (nuint iteration = 0; iteration < iterations; iteration++)
                {
#if BIT64
                    *(UInt512*)(dst) = *(UInt512*)(src);
#else
                    *(UInt256*)(dst) = *(UInt256*)(src);
#endif
                    src += blockSize;
                    dst += blockSize;
                }

                len -= (iterations * blockSize);
            }

            if (len == 0)
            {
                return;
            }

            // What is left is at most one block: some whole 16-byte units plus 0-15 trailing bytes.
            var remainingBlocks = (len / sizeof_UInt128);
            var remainingBytes = (len - (remainingBlocks * sizeof_UInt128));

            switch (remainingBlocks)
            {
                case 0:
                    // Fewer than 16 bytes remain: copy the 16 bytes that end on the final byte. This
                    // reaches back into bytes that were already copied, which is valid because a
                    // remainder below 16 can only occur after the block loop has advanced both pointers
                    // by at least one block.
                    *(UInt128*)(dst - sizeof_UInt128 + remainingBytes) = *(UInt128*)(src - sizeof_UInt128 + remainingBytes);
                    return;
                case 1:
                    // 16-31 bytes: one block at the start, one ending on the final byte.
                    *(UInt128*)(dst) = *(UInt128*)(src);
                    *(UInt128*)(dst + remainingBytes) = *(UInt128*)(src + remainingBytes);
                    return;
#if BIT64
                case 2:
                    // 32-47 bytes.
                    *(UInt256*)(dst) = *(UInt256*)(src);
                    *(UInt128*)(dst + sizeof_UInt128 + remainingBytes) = *(UInt128*)(src + sizeof_UInt128 + remainingBytes);
                    return;
                case 3:
                    // 48-63 bytes.
                    *(UInt256*)(dst) = *(UInt256*)(src);
                    *(UInt128*)(dst + sizeof_UInt256) = *(UInt128*)(src + sizeof_UInt256);
                    *(UInt128*)(dst + sizeof_UInt256 + remainingBytes) = *(UInt128*)(src + sizeof_UInt256 + remainingBytes);
                    return;
                case 4:
                    // Exactly 64 bytes (the loop above only runs for more than one block).
                    *(UInt512*)(dst) = *(UInt512*)(src);
                    return;
#else
                case 2:
                    // Exactly 32 bytes (the loop above only runs for more than one block).
                    *(UInt256*)(dst) = *(UInt256*)(src);
                    return;
#endif
            }
        }

        /// <summary>
        /// Copies <paramref name="len"/> bytes from <paramref name="src"/> to <paramref name="dst"/> by
        /// handing the whole request to Buffer.MemoryCopy, with the destination capacity stated as
        /// exactly <paramref name="len"/>.
        /// </summary>
        private unsafe static void CopyDown(byte* dst, byte* src, nuint len)
        {
            Buffer.MemoryCopy(src, dst, len, len);
        }

        /// <summary>
        /// Fills <paramref name="byteCount"/> bytes starting at <paramref name="destination"/> with
        /// random values covering the full byte range 0-255.
        /// </summary>
        static void RandomizeMemory(IntPtr destination, int byteCount)
        {
            var pDestination = (byte*)(destination.ToPointer());

            for (var index = 0; index < byteCount; index++)
            {
                // Random.Next(min, max) excludes max, so the bound must be 256 for 0xFF to be reachable.
                *(pDestination + index) = (byte)(s_random.Next(byte.MinValue, byte.MaxValue + 1));
            }
        }

        /// <summary>
        /// Sets <paramref name="byteCount"/> bytes starting at <paramref name="destination"/> to zero.
        /// </summary>
        static void ZeroMemory(IntPtr destination, int byteCount)
        {
            var pDestination = (byte*)(destination.ToPointer());

            for (var index = 0; index < byteCount; index++)
            {
                *(pDestination + index) = 0;
            }
        }

        /// <summary>
        /// Compares <paramref name="byteCount"/> bytes of <paramref name="destination"/> against
        /// <paramref name="source"/> and signals a debugger break for every byte that differs.
        /// </summary>
        static void ValidateMemory(IntPtr source, IntPtr destination, int byteCount)
        {
            var pSource = (byte*)(source.ToPointer());
            var pDestination = (byte*)(destination.ToPointer());

            for (var index = 0; index < byteCount; index++)
            {
                var areEqual = (*(pDestination + index) == *(pSource + index));

                if (!areEqual)
                {
                    System.Diagnostics.Debugger.Break();
                }
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment