Skip to content

Instantly share code, notes, and snippets.

@gfoidl
Last active March 31, 2018 10:02
Show Gist options
  • Save gfoidl/9138af7412b0adcab134a7cbc850d589 to your computer and use it in GitHub Desktop.
Save gfoidl/9138af7412b0adcab134a7cbc850d589 to your computer and use it in GitHub Desktop.
Pointer alignment
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace ConsoleApp1
{
class Program
{
    /// <summary>
    /// Demonstrates <c>PointerHelper.GetElementsToAlign</c>: takes a slice of an array that
    /// starts one element in, asks how many elements must be skipped, skips them, and asks
    /// again. In DEBUG builds both answers are written to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static unsafe void Main(string[] args)
    {
        // Premise of the demo: the first element of the array sits on a vector boundary.
        double[] values = new double[1_000];

        // Dropping the first element moves the start sizeof(double) bytes past that boundary.
        Span<double> sliced = values.AsSpan(1);

        // Pin the data so the raw address stays valid while it is inspected.
        fixed (double* pStart = &MemoryMarshal.GetReference(sliced))
        {
            int remaining = PointerHelper.GetElementsToAlign<double>(pStart);
#if DEBUG
            Console.WriteLine(remaining);
#endif
            // Walk forward one element at a time until the reported count is used up.
            double* cursor = pStart;
            for (; remaining > 0; remaining--)
            {
                cursor++;
            }

            // Query again from the advanced position.
            remaining = PointerHelper.GetElementsToAlign<double>(cursor);
#if DEBUG
            Console.WriteLine(remaining);
#endif
        }
    }
}
//-------------------------------------------------------------------------
internal static unsafe class PointerHelper
{
    /// <summary>
    /// Returns how many elements of type <typeparamref name="T"/> have to be skipped,
    /// starting at <paramref name="ptr"/>, to reach the next address that is a multiple
    /// of the size of <see cref="Vector{T}"/>.
    /// </summary>
    /// <typeparam name="T">Element type the pointer is stepped in.</typeparam>
    /// <param name="ptr">Address to inspect. It is only read as a number, never dereferenced.</param>
    /// <returns>
    /// A value in the range <c>[0, Vector&lt;T&gt;.Count - 1]</c>; <c>0</c> when
    /// <paramref name="ptr"/> is already on a vector boundary.
    /// </returns>
    /// <remarks>
    /// <para>
    /// Precondition: <paramref name="ptr"/> should be a multiple of the element size.
    /// If it is not, the byte offset is rounded down to whole elements, so the result
    /// does not lead to a vector boundary (stepping by whole elements can never reach one).
    /// </para>
    /// <para>
    /// Both reductions use the bit trick <c>a % b == a &amp; (b - 1)</c>, which is valid
    /// only for power-of-two <c>b</c>; this relies on the vector size in bytes and
    /// <c>Vector&lt;T&gt;.Count</c> being powers of two.
    /// See https://graphics.stanford.edu/~seander/bithacks.html#ModulusDivisionEasy and
    /// https://github.com/ahsonkhan/coreclr/blob/46b075fc1877e7087da53579c13c8c9069058b42/src/mscorlib/shared/System/SpanHelpers.Char.cs#L95-L97
    /// </para>
    /// </remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int GetElementsToAlign<T>(void* ptr) where T : struct
    {
        int bytesPerElement = Unsafe.SizeOf<T>();
        int sizeOfVector = Unsafe.SizeOf<Vector<T>>();
        int vectorElements = Vector<T>.Count;

        // Only the low bits of the address matter. Mask them off while the value is still
        // 64 bits wide so that narrowing can neither lose information nor overflow when
        // the project is compiled with arithmetic overflow checks enabled.
        ulong address = (ulong)ptr;
        uint unalignedBytes = (uint)(address & (ulong)(sizeOfVector - 1));

        // Unsigned division: the operands are never negative, and this avoids the
        // sign-correction instructions a signed division by a constant needs.
        uint unalignedElements = unalignedBytes / (uint)bytesPerElement;

        // (vectorElements - unalignedElements) alone would be right whenever
        // unalignedElements > 0; the final mask maps the already-aligned case
        // (vectorElements - 0) back to 0.
        return (vectorElements - (int)unalignedElements) & (vectorElements - 1);
    }
}
}
@gfoidl
Copy link
Author

gfoidl commented Mar 31, 2018

int sizeOfVector = Unsafe.SizeOf<Vector<T>>();

The JIT will treat this as constant, but it can also be written as

int sizeOfVector = Vector<byte>.Count;

because Vector<byte>.Count is always the size of the SIMD register in bytes (one byte per element).

Type Count in bits
SSE 128
AVX (2) 256
AVX 512 512

@gfoidl
Copy link
Author

gfoidl commented Mar 31, 2018

JIT will produce

; Assembly listing for method Program:Do(long):int
; Emitting BLENDED_CODE for X64 CPU with AVX
; optimized code
; rbp based frame
; fully interruptible
; Final local variable assignments
;
;  V00 arg0         [V00,T01] (  6, 12   )    long  ->  rdi        
;  V01 loc0         [V01,T00] (  7, 16   )     int  ->  rax        
;* V02 tmp0         [V02    ] (  0,  0   )     int  ->  zero-ref   
;* V03 tmp1         [V03    ] (  0,  0   )     int  ->  zero-ref   
;* V04 tmp2         [V04    ] (  0,  0   )     int  ->  zero-ref   
;  V05 tmp3         [V05,T04] (  2,  2   )     int  ->  rax        
;* V06 tmp4         [V06    ] (  0,  0   )     int  ->  zero-ref   
;* V07 tmp5         [V07    ] (  0,  0   )     int  ->  zero-ref   
;* V08 tmp6         [V08    ] (  0,  0   )     int  ->  zero-ref   
;  V09 tmp7         [V09,T05] (  2,  2   )     int  ->  rax        
;# V10 OutArgs      [V10    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]  
;  V11 rat0         [V11,T02] (  3,  6   )     int  ->  rax        
;  V12 rat1         [V12,T03] (  3,  6   )     int  ->  rax        
;
; Lcl frame size = 0

G_M56189_IG01:
       55                   push     rbp
       488BEC               mov      rbp, rsp

G_M56189_IG02:
       8BC7                 mov      eax, edi
       83E01F               and      eax, 31
       8BF0                 mov      esi, eax
       C1FE1F               sar      esi, 31
       83E607               and      esi, 7
       03F0                 add      esi, eax
       8BC6                 mov      eax, esi
       C1F803               sar      eax, 3
       F7D8                 neg      eax
       83C004               add      eax, 4
       83E003               and      eax, 3
       85C0                 test     eax, eax
       7E0A                 jle      SHORT G_M56189_IG04

G_M56189_IG03:
       FFC8                 dec      eax
       4883C708             add      rdi, 8
       85C0                 test     eax, eax
       7FF6                 jg       SHORT G_M56189_IG03

G_M56189_IG04:
       8BC7                 mov      eax, edi
       83E01F               and      eax, 31
       8BF8                 mov      edi, eax
       C1FF1F               sar      edi, 31
       83E707               and      edi, 7
       03F8                 add      edi, eax
       8BC7                 mov      eax, edi
       C1F803               sar      eax, 3
       F7D8                 neg      eax
       83C004               add      eax, 4
       83E003               and      eax, 3

G_M56189_IG05:
       5D                   pop      rbp
       C3                   ret      

; Total bytes of code 76, prolog size 4 for method Program:Do(long):int
; ============================================================

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment