Last active
October 30, 2020 15:21
-
-
Save gfoidl/346ea7472f06e2024b8bdc9c598e547e to your computer and use it in GitHub Desktop.
.NET Array cache line alignment / length -- false sharing caution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Linq; | |
namespace ConsoleApp3 | |
{ | |
static unsafe class Program | |
{ | |
static void Main() | |
{ | |
int[] arr = Enumerable.Range(2, 122).ToArray(); | |
fixed (int* pArr = arr) | |
{ | |
PrintCacheInfo(pArr - 2 , "(IntPtr)ptr - 1 = Length"); | |
PrintCacheInfo(pArr , "arr[0]"); | |
PrintCacheInfo(pArr + 64, "arr[64]"); | |
} | |
} | |
private static void PrintCacheInfo(int* ptr, string caption) | |
{ | |
const int cacheBlockBits = 6; | |
const int cacheBlockSize = 1 << cacheBlockBits; // 64 | |
const int cacheBlockMask = cacheBlockSize - 1; // 63 | |
long addr = (long)ptr; | |
long cacheBlockOffset = addr & cacheBlockMask; // % 64 | |
long cacheBlockAlignedAddr = addr & ~cacheBlockMask; | |
Console.WriteLine(caption); | |
Console.WriteLine($"Address: 0x{addr:X}"); | |
Console.WriteLine($"Cache line addr: 0x{cacheBlockAlignedAddr:X}"); | |
Console.WriteLine($"Offset in cl: {cacheBlockOffset} | 0x{cacheBlockOffset:X}"); | |
Console.WriteLine($"Value: {*ptr}"); | |
Console.WriteLine(); | |
} | |
} | |
} |
Demo for false sharing
Result
C# code
using System;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Tasks;
namespace ConsoleApplication2
{
class Program
{
#if DEBUG
private const int ITERS = 2;
#else
private const int ITERS = 100_000_000;
#endif
//---------------------------------------------------------------------
private static int[] s_counter = new int[1 << 20];
private static CountdownEvent s_cde = new CountdownEvent(4);
private static ManualResetEventSlim s_mre = new ManualResetEventSlim(false);
private static Stopwatch s_watch = new Stopwatch();
//---------------------------------------------------------------------
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
private static void UpdateCounter(int position)
{
s_cde.Signal();
s_mre.Wait();
for (int j = 0; j < ITERS; j++)
{
s_counter[position] += 3; // any number, just to do a read and a write
}
}
//---------------------------------------------------------------------
static void Main()
{
// Parallel.Invoke will run one task in the main-thread (inlineing of tasks),
// so be aware of potential deadlocks. Thus "free" the main-thread.
// JIT
s_watch.Start();
Task.Run(() =>
Parallel.Invoke(
() => UpdateCounter(64 * 0 / sizeof(int)),
() => UpdateCounter(64 * 1 / sizeof(int)),
() => UpdateCounter(64 * 2 / sizeof(int)),
() => UpdateCounter(64 * 3 / sizeof(int))
)
);
s_cde .Wait();
s_mre .Set();
s_watch.Stop();
Run(padFirstElement: false, "results-no-padfirst.csv");
Run(padFirstElement: true , "results-padfirst.csv");
}
//---------------------------------------------------------------------
private static void Run(bool padFirstElement, string resultsFileName)
{
int pad = padFirstElement ? 1 : 0;
using (StreamWriter sw = File.CreateText(resultsFileName))
{
sw.WriteLine("#spacing [bytes];time [ms]"); // # for GnuPlot
for (int i = 0; i < 15; ++i)
{
int spacing = 1 << i;
int spacingInBytes = spacing * sizeof(int);
Console.Write($"spacing: {spacingInBytes,5} bytes");
s_cde.Reset();
s_mre.Reset();
var task = Task.Run(() =>
Parallel.Invoke(
() => UpdateCounter(spacingInBytes * (pad + 0) / sizeof(int)),
() => UpdateCounter(spacingInBytes * (pad + 1) / sizeof(int)),
() => UpdateCounter(spacingInBytes * (pad + 2) / sizeof(int)),
() => UpdateCounter(spacingInBytes * (pad + 3) / sizeof(int))
)
);
s_cde .Wait();
s_watch.Restart();
s_mre .Set();
task .Wait();
s_watch.Stop();
Console.WriteLine($", time: {s_watch.ElapsedMilliseconds}");
sw.WriteLine($"{spacingInBytes};{s_watch.ElapsedMilliseconds}");
}
}
}
}
}
GnuPlot
reset
set datafile separator ';'
set term pngcairo size 1200,600 enhanced
set output 'false-sharing.png'
set grid
set title 'False sharing on updating counter-array'
set title font ",12"
set xlabel 'spacing [bytes]'
set ylabel 'time [ms]'
set logscale x;
set xtics (4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536)
set arrow from 64, graph 0 to 64, graph 1 nohead lc rgb 'red' lw 1
set label at 60,1000 'Cache line size' rotate by 90 front textcolor rgb 'red'
plot'results-no-padfirst.csv' with linespoints lw 2 title 'no padding for first', \
'results-padfirst.csv' with linespoints lw 2 title 'padding for first'
False sharing in Intel VTune Profiler
Microarchitecture Exploration
w/o padding
with padding
Memory Access
w/o padding
with padding
Note: It's L1-bound because of the RAW-hazard from the increment now.
BenchmarkDotNet
Shows this effect with the use of hardware counters also clearly:
Method | Mean | Error | StdDev | Ratio | RatioSD | CacheMisses/Op | LLCReference/Op | LLCMisses/Op |
---|---|---|---|---|---|---|---|---|
NoPadding | 2.325 s | 0.0409 s | 0.0363 s | 1.00 | 0.00 | 1,047,666 | 146,520,838 | 1,048,121 |
Padding | 1.658 s | 0.0239 s | 0.0223 s | 0.71 | 0.02 | 622,924 | 2,056,746 | 622,592 |
demo program
#define PAD
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
Console.WriteLine($"Size of struct: {Unsafe.SizeOf<Data>()}");
Worker worker = new();
Parallel.Invoke(
() => Console.WriteLine($"x: {worker.IncX()}"),
() => Console.WriteLine($"y: {worker.IncY()}")
);
public class Worker
{
private const uint N = 3_000_000_000;
private Data _data;
public uint IncX()
{
for (uint i = 0; i < N; ++i)
{
_data.X++;
}
return _data.X;
}
public uint IncY()
{
for (uint i = 0; i < N; ++i)
{
_data.Y++;
}
return _data.Y;
}
}
[StructLayout(LayoutKind.Explicit)]
public struct Data
{
[FieldOffset(0)]
public uint X;
#if PAD
[FieldOffset(64)]
#else
[FieldOffset(4)]
#endif
public uint Y;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output is something like this (win-x64, .NET Core 2.1)
So the
Length
andarr[0]
are in the same cache line. Hence The layout of arrays on the CLR is such that an array’s length resides before the first element. So unless you pad the first element of the array, all accesses will perform bounds checking that touches the first element’s line. Because this line is being mutated by the thread incrementing the first counter, terrible false sharing results. has to be taken into account when trying to avoid false sharing.