Skip to content

Instantly share code, notes, and snippets.

@Zhentar
Last active March 1, 2024 04:35
Show Gist options
  • Save Zhentar/07b92a52c619641ab61aab50b1e5ec91 to your computer and use it in GitHub Desktop.
Save Zhentar/07b92a52c619641ab61aab50b1e5ec91 to your computer and use it in GitHub Desktop.
I knew the Span<T> stuff was supposed to be fast, but this is ridiculous!

I have a program that parses data from both delimited files and Excel spreadsheets. I was trying out Span to speed up parsing the delimited files, but the ref struct restrictions mean I can't just hide the two different file formats behind an interface (without the small added overhead of repeatedly pulling Spans from Memory).

But what if I just wrote the ASCII strings from the Excel spreadsheets into a byte buffer, so that the same Span based parser could be used with both file formats? Seems like the overhead cost could be fairly low, and the Excel parsing is already intrinsically slower because of the decompression & XML parsing costs, so I'd be willing to take a small performance hit there for a big gain on the delimited files.

BenchmarkDotNet=v0.10.14, OS=Windows 10.0.17134
Intel Core i7-6600U CPU 2.60GHz (Skylake), 1 CPU, 4 logical and 2 physical cores
.NET Core SDK=2.1.301
[Host] : .NET Core 2.1.1 (CoreCLR 4.6.26606.02, CoreFX 4.6.26606.05), 64bit RyuJIT
Clr    : .NET Framework 4.7.1 (CLR 4.0.30319.42000), 64bit RyuJIT-v4.7.3110.0
Core   : .NET Core 2.1.1 (CoreCLR 4.6.26606.02, CoreFX 4.6.26606.05), 64bit RyuJIT

Method Job Runtime Mean Error StdDev Median
OldWay Clr Clr 21.021 ms 0.4189 ms 0.9541 ms 21.014 ms
WithSpan Clr Clr 11.714 ms 0.2317 ms 0.4680 ms 11.565 ms
OldWay Core Core 18.547 ms 0.3694 ms 0.8108 ms 18.140 ms
WithSpan Core Core 7.676 ms 0.1505 ms 0.2714 ms 7.605 ms

Overhead? Nope, massive savings, apparently!

using System;
using System.Buffers;
using System.Buffers.Text;
using System.Collections.Generic;
using System.Linq;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Attributes.Jobs;
using BenchmarkDotNet.Running;
namespace SpanDemo
{
/// <summary>
/// Console entry point for the benchmark executable.
/// </summary>
static class Program
{
    /// <summary>
    /// Hands control to BenchmarkDotNet, which runs the benchmarks declared on
    /// <see cref="SumIntStrings"/>. The value returned by the runner is not used.
    /// </summary>
    static void Main() => BenchmarkRunner.Run<SumIntStrings>();
}
/// <summary>
/// Benchmarks summing a grid of integer strings two ways: through the string-based
/// <see cref="BaselineParser"/> and through the span-based <see cref="Depiecifier"/>.
/// </summary>
[CoreJob]
[ClrJob]
public class SumIntStrings
{
    /// <summary>Number of columns generated per row and summed per row.</summary>
    private const int ColumnCount = 14;

    /// <summary>Number of rows generated by <see cref="Setup"/>.</summary>
    private const int RowCount = 10000;

    /// <summary>
    /// Fixed seed for the test data so repeated runs sum the same values instead of a
    /// fresh random data set each time.
    /// </summary>
    private const int DataSeed = 20180701;

    private List<string[]> _testData;

    /// <summary>
    /// Builds <see cref="RowCount"/> rows of <see cref="ColumnCount"/> non-negative
    /// integers rendered as strings.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        var rng = new Random(DataSeed);
        _testData = new List<string[]>(RowCount);
        for (int row = 0; row < RowCount; row++)
        {
            var columns = new string[ColumnCount];
            for (int col = 0; col < ColumnCount; col++)
            {
                columns[col] = rng.Next().ToString();
            }
            _testData.Add(columns);
        }
    }

    /// <summary>
    /// Sums every column of every row using <see cref="BaselineParser"/>.
    /// </summary>
    /// <returns>The wrapped 32-bit sum; unparsable or missing columns count as 0.</returns>
    [Benchmark]
    public int OldWay()
    {
        int result = 0;
        foreach (var line in BaselineParser.GetLineParsers(_testData))
        {
            for (int i = 0; i < ColumnCount; i++)
            {
                // The total far exceeds int.MaxValue; wrap explicitly so the benchmark
                // does not throw if the project is compiled with overflow checking.
                result = unchecked(result + (line.TryParseInt(i) ?? 0));
            }
        }
        return result;
    }

    /// <summary>
    /// Sums every column of every row using the span-based <see cref="Depiecifier"/>.
    /// </summary>
    /// <returns>The wrapped 32-bit sum; unparsable or missing columns count as 0.</returns>
    [Benchmark]
    public int WithSpan()
    {
        int result = 0;
        using (var lineSource = new Depiecifier(_testData))
        {
            foreach (var line in lineSource)
            {
                for (int i = 0; i < ColumnCount; i++)
                {
                    // Same explicit wrap-around as OldWay so both benchmarks agree.
                    result = unchecked(result + (line.TryParseInt(i) ?? 0));
                }
            }
        }
        return result;
    }
}
/// <summary>
/// String-based reference parser: each row is an array of column strings that is
/// parsed on demand with <see cref="int.TryParse(string, out int)"/>.
/// </summary>
static class BaselineParser
{
    /// <summary>
    /// Lazily wraps every row of <paramref name="lines"/> in a <see cref="BaslineLineParser"/>.
    /// </summary>
    /// <param name="lines">Rows of column strings.</param>
    /// <returns>A deferred sequence with one parser per row.</returns>
    public static IEnumerable<BaslineLineParser> GetLineParsers(IEnumerable<string[]> lines) =>
        from row in lines
        select new BaslineLineParser(row);

    /// <summary>
    /// Parses individual columns of a single row.
    /// </summary>
    public class BaslineLineParser
    {
        private readonly string[] _pieces;

        /// <summary>Creates a parser over the given row.</summary>
        /// <param name="line">The column strings of the row.</param>
        public BaslineLineParser(string[] line)
        {
            _pieces = line;
        }

        /// <summary>
        /// Tries to parse column <paramref name="col"/> as a 32-bit integer.
        /// </summary>
        /// <param name="col">Zero-based column index.</param>
        /// <returns>
        /// The parsed value, or <c>null</c> when the index is past the last column or the
        /// text is not a valid integer.
        /// </returns>
        public int? TryParseInt(int col)
        {
            // Guard first: indexes beyond the row simply have no value.
            if (col >= _pieces.Length)
            {
                return null;
            }

            return int.TryParse(_pieces[col], out int parsed) ? parsed : (int?)null;
        }
    }
}
/// <summary>
/// Re-encodes rows of column strings into a pooled byte buffer, one row at a time, so
/// that a span-based parser can read them. Each column is written as one byte per
/// character followed by a tab terminator, and the start offset of each column is
/// recorded in a pooled index buffer.
/// </summary>
/// <remarks>
/// Every enumerator obtained from one instance shares the same buffers, so only one
/// enumeration may be active at a time, and a <see cref="DepieceEnumerator.SpanLineParser"/>
/// is only valid until the next call to <see cref="DepieceEnumerator.MoveNext"/>.
/// Input is expected to be ASCII; any other character is stored as '?' so that it can
/// never be mistaken for a digit.
/// </remarks>
public class Depiecifier : IDisposable
{
    private const int InitialByteCapacity = 65536;
    private const int InitialColumnCapacity = 512;
    private const byte ColumnTerminator = (byte)'\t';
    private const byte NonAsciiPlaceholder = (byte)'?';

    private byte[] _byteBuffer;
    private int[] _indexBuffer;
    private readonly IEnumerable<string[]> _lineCollection;

    /// <summary>Rents the working buffers and remembers the rows to enumerate.</summary>
    /// <param name="lineCollection">Rows of column strings.</param>
    /// <exception cref="ArgumentNullException"><paramref name="lineCollection"/> is null.</exception>
    public Depiecifier(IEnumerable<string[]> lineCollection)
    {
        _lineCollection = lineCollection ?? throw new ArgumentNullException(nameof(lineCollection));
        _byteBuffer = ArrayPool<byte>.Shared.Rent(InitialByteCapacity);
        _indexBuffer = ArrayPool<int>.Shared.Rent(InitialColumnCapacity);
    }

    /// <summary>Starts a new pass over the rows (pattern-based <c>foreach</c> support).</summary>
    /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
    public DepieceEnumerator GetEnumerator()
    {
        if (_byteBuffer == null || _indexBuffer == null)
        {
            throw new ObjectDisposedException(nameof(Depiecifier));
        }

        return new DepieceEnumerator(this, _byteBuffer, _indexBuffer, _lineCollection.GetEnumerator());
    }

    /// <summary>Swaps the byte buffer for a pooled one of at least <paramref name="minimumLength"/> bytes.</summary>
    private byte[] GrowByteBuffer(int minimumLength)
    {
        if (_byteBuffer == null)
        {
            throw new ObjectDisposedException(nameof(Depiecifier));
        }

        // No copy needed: every row is written from scratch.
        byte[] larger = ArrayPool<byte>.Shared.Rent(minimumLength);
        ArrayPool<byte>.Shared.Return(_byteBuffer);
        _byteBuffer = larger;
        return larger;
    }

    /// <summary>Swaps the index buffer for a pooled one of at least <paramref name="minimumLength"/> entries.</summary>
    private int[] GrowIndexBuffer(int minimumLength)
    {
        if (_indexBuffer == null)
        {
            throw new ObjectDisposedException(nameof(Depiecifier));
        }

        int[] larger = ArrayPool<int>.Shared.Rent(minimumLength);
        ArrayPool<int>.Shared.Return(_indexBuffer);
        _indexBuffer = larger;
        return larger;
    }

    /// <summary>
    /// Enumerator that encodes one row per <see cref="MoveNext"/> into the shared buffers.
    /// </summary>
    public ref struct DepieceEnumerator
    {
        // Null when the enumerator was built from caller-supplied buffers; in that
        // case the buffers cannot be grown.
        private readonly Depiecifier _owner;
        private readonly IEnumerator<string[]> _lineEnumerator;
        private byte[] _byteBuffer;
        private int[] _indexBuffer;
        private ReadOnlySpan<byte> _lineBytes;
        private ReadOnlySpan<int> _lineIndices;

        /// <summary>Creates an enumerator over caller-supplied, fixed-size buffers.</summary>
        /// <param name="byteBuffer">Destination for the encoded row bytes.</param>
        /// <param name="indexBuffer">Destination for the column start offsets.</param>
        /// <param name="lineEnumerator">Source of rows.</param>
        public DepieceEnumerator(byte[] byteBuffer, int[] indexBuffer, IEnumerator<string[]> lineEnumerator)
            : this(null, byteBuffer, indexBuffer, lineEnumerator)
        {
        }

        internal DepieceEnumerator(Depiecifier owner, byte[] byteBuffer, int[] indexBuffer, IEnumerator<string[]> lineEnumerator)
        {
            _owner = owner;
            _byteBuffer = byteBuffer;
            _indexBuffer = indexBuffer;
            _lineEnumerator = lineEnumerator;
            _lineBytes = default;
            _lineIndices = default;
        }

        /// <summary>Parser over the row encoded by the most recent <see cref="MoveNext"/>.</summary>
        public SpanLineParser Current => new SpanLineParser(in _lineBytes, in _lineIndices);

        /// <summary>Encodes the next row into the buffers.</summary>
        /// <returns><c>true</c> if a row was encoded; <c>false</c> at the end of the input.</returns>
        /// <exception cref="InvalidOperationException">
        /// The row does not fit and the buffers were supplied by the caller.
        /// </exception>
        public bool MoveNext()
        {
            if (!_lineEnumerator.MoveNext())
            {
                return false;
            }

            string[] pieces = _lineEnumerator.Current;

            // One terminator per column plus one byte per character.
            int requiredBytes = pieces.Length;
            for (int i = 0; i < pieces.Length; i++)
            {
                requiredBytes += pieces[i]?.Length ?? 0;
            }
            EnsureCapacity(requiredBytes, pieces.Length);

            Span<byte> bytes = _byteBuffer;
            int written = 0;
            for (int i = 0; i < pieces.Length; i++)
            {
                _indexBuffer[i] = written;
                // AsSpan() yields an empty span for a null string, so null columns encode as empty.
                foreach (char c in pieces[i].AsSpan())
                {
                    // A plain (byte) cast would fold e.g. U+0131 onto '1'; substitute instead.
                    bytes[written++] = c <= 0x7F ? (byte)c : NonAsciiPlaceholder;
                }
                bytes[written++] = ColumnTerminator;
            }

            _lineBytes = bytes.Slice(0, written);
            _lineIndices = _indexBuffer.AsSpan(0, pieces.Length);
            return true;
        }

        /// <summary>
        /// Releases the underlying row enumerator. Compilers that support pattern-based
        /// disposal of ref struct enumerators call this at the end of a <c>foreach</c>.
        /// </summary>
        public void Dispose() => _lineEnumerator?.Dispose();

        /// <summary>Makes sure both buffers can hold a row of the given size.</summary>
        private void EnsureCapacity(int requiredBytes, int requiredColumns)
        {
            if (requiredBytes > _byteBuffer.Length)
            {
                if (_owner == null)
                {
                    throw new InvalidOperationException("The row does not fit in the supplied byte buffer.");
                }
                _byteBuffer = _owner.GrowByteBuffer(requiredBytes);
            }

            if (requiredColumns > _indexBuffer.Length)
            {
                if (_owner == null)
                {
                    throw new InvalidOperationException("The row has more columns than the supplied index buffer can hold.");
                }
                _indexBuffer = _owner.GrowIndexBuffer(requiredColumns);
            }
        }

        /// <summary>
        /// Read-only view over one encoded row that parses individual columns.
        /// </summary>
        public readonly ref struct SpanLineParser
        {
            private readonly ReadOnlySpan<byte> _span;
            private readonly ReadOnlySpan<int> _colStarts;

            /// <summary>Creates a parser over an encoded row.</summary>
            /// <param name="memory">Encoded row bytes; every column ends with a tab.</param>
            /// <param name="colStarts">Start offset of each column within <paramref name="memory"/>.</param>
            public SpanLineParser(in ReadOnlySpan<byte> memory, in ReadOnlySpan<int> colStarts)
            {
                _span = memory;
                _colStarts = colStarts;
            }

            /// <summary>
            /// Tries to parse column <paramref name="col"/> as a 32-bit integer.
            /// </summary>
            /// <param name="col">Zero-based column index.</param>
            /// <returns>
            /// The parsed value, or <c>null</c> when the index is out of range or the column
            /// is not entirely a valid integer.
            /// </returns>
            public int? TryParseInt(int col)
            {
                // The unsigned compare rejects negative and too-large indexes in one test.
                if ((uint)col >= (uint)_colStarts.Length)
                {
                    return null;
                }

                int start = _colStarts[col];
                int nextStart = col + 1 < _colStarts.Length ? _colStarts[col + 1] : _span.Length;
                int length = nextStart - start - 1; // drop the trailing terminator
                ReadOnlySpan<byte> column = _span.Slice(start, length);

                // Require the whole column to be consumed so "12x" is rejected rather than read as 12.
                if (Utf8Parser.TryParse(column, out int value, out int consumed) && consumed == length)
                {
                    return value;
                }

                return null;
            }
        }
    }

    /// <summary>Returns the rented buffers to the shared pools. Safe to call more than once.</summary>
    public void Dispose()
    {
        byte[] bytes = _byteBuffer;
        if (bytes != null)
        {
            _byteBuffer = null;
            ArrayPool<byte>.Shared.Return(bytes);
        }

        int[] indices = _indexBuffer;
        if (indices != null)
        {
            _indexBuffer = null;
            ArrayPool<int>.Shared.Return(indices);
        }
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment