Skip to content

Instantly share code, notes, and snippets.

@Sergio0694
Last active March 11, 2021 16:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Sergio0694/c51cb027e6815d7b592484eebe9e3685 to your computer and use it in GitHub Desktop.
Save Sergio0694/c51cb027e6815d7b592484eebe9e3685 to your computer and use it in GitHub Desktop.
A BenchmarkDotNet benchmark comparing StringPool-based string parsing against plain Encoding.UTF8.GetString when parsing a large UTF-8 .csv file
using System;
using System.Buffers.Text;
using System.IO;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.Unicode;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Running;
using Microsoft.Toolkit.HighPerformance;
using Microsoft.Toolkit.HighPerformance.Buffers;
using Microsoft.Toolkit.HighPerformance.Enumerables;
// Applies SkipLocalsInit to the whole module, so locals (including stackalloc buffers)
// are not guaranteed to be zero-initialized by the runtime.
[module: SkipLocalsInit]
// Program entry point (top-level statement): hands ParsingBenchmark to BenchmarkDotNet.
BenchmarkRunner.Run<ParsingBenchmark>();
/// <summary>
/// Benchmarks three ways of materializing the string columns of a UTF-8 encoded .csv file
/// while parsing every row into a <see cref="Data"/> value: plain decoding, a
/// <see cref="StringPool"/> fed from a temporary UTF-16 buffer, and a
/// <see cref="StringPool"/> that decodes the UTF-8 bytes itself.
/// </summary>
[MemoryDiagnoser]
[SimpleJob(RunStrategy.Monitoring)]
public class ParsingBenchmark
{
    // Raw UTF-8 bytes of the whole .csv file.
    private MemoryOwner<byte> sourceMemory;

    // Destination rows, reused by every benchmark invocation.
    private MemoryOwner<Data> dataMemory;

    // One pool per pooled benchmark, so they do not warm each other's cache.
    private readonly StringPool stringPool1 = new();
    private readonly StringPool stringPool2 = new();

    /// <summary>
    /// Loads the whole .csv file into memory and allocates the destination rows.
    /// </summary>
    /// <exception cref="EndOfStreamException">
    /// Thrown if the file ends before the number of bytes reported by its length was read.
    /// </exception>
    [GlobalSetup]
    public void Setup()
    {
        // Source: https://github.com/dotnet/machinelearning/blob/master/test/data/taxi-fare-train.csv
        // Saved with UTF8 encoding
        using Stream stream = File.OpenRead("taxi-fare-train-utf8.csv");

        this.sourceMemory = MemoryOwner<byte>.Allocate((int)stream.Length);

        // Stream.Read may return fewer bytes than requested, so keep reading until the
        // buffer is full; a single call could leave the tail of the buffer unfilled.
        Span<byte> remaining = this.sourceMemory.Span;

        while (!remaining.IsEmpty)
        {
            int bytesRead = stream.Read(remaining);

            if (bytesRead == 0)
            {
                throw new EndOfStreamException("The source file ended before all of its bytes could be read.");
            }

            remaining = remaining.Slice(bytesRead);
        }

        // One slot per '\n' in the source: Parse skips the header token, so this is
        // expected to cover every remaining line.
        this.dataMemory = MemoryOwner<Data>.Allocate(this.sourceMemory.Span.Count((byte)'\n'), AllocationMode.Clear);
    }

    /// <summary>
    /// Releases both buffers, clearing the parsed rows first so the string references
    /// they hold are dropped before the buffer is disposed.
    /// </summary>
    [GlobalCleanup]
    public void Cleanup()
    {
        this.sourceMemory.Dispose();
        this.dataMemory.Span.Clear();
        this.dataMemory.Dispose();
    }

    /// <summary>
    /// Resets both string pools before each iteration.
    /// </summary>
    [IterationSetup]
    public void IterationSetup()
    {
        this.stringPool1.Reset();
        this.stringPool2.Reset();
    }

    /// <summary>
    /// Baseline: string columns are produced by <see cref="DefaultParser"/>.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void Default()
    {
        var parser = new DefaultParser();
        Parse(ref parser);
    }

    /// <summary>
    /// String columns are produced by <see cref="StringPoolCustomParser"/> backed by the first pool.
    /// </summary>
    [Benchmark]
    public void StackallocGetOrAdd()
    {
        var parser = new StringPoolCustomParser(this.stringPool1);
        Parse(ref parser);
    }

    /// <summary>
    /// String columns are produced by <see cref="StringPoolEmbeddedParser"/> backed by the second pool.
    /// </summary>
    [Benchmark]
    public void EmbeddedGetOrAdd()
    {
        var parser = new StringPoolEmbeddedParser(this.stringPool2);
        Parse(ref parser);
    }

    /// <summary>
    /// Splits the source on '\n', skips the first line (the header) and parses every
    /// following line into the next slot of the destination rows.
    /// </summary>
    /// <typeparam name="T">The string parsing strategy used for the text columns.</typeparam>
    /// <param name="parser">The parser instance used for columns 0 and 5.</param>
    /// <remarks>
    /// Columns are, in order: vendor id, rate code, passenger count, trip time,
    /// trip distance, payment type, fare amount. Any further column is ignored, and a
    /// numeric column that fails to parse leaves the slot's current value untouched.
    /// </remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private void Parse<T>(ref T parser)
        where T : struct, IStringParser
    {
        var header = true;
        var i = 0;
        var dataSpan = this.dataMemory.Span;

        foreach (var line in new ReadOnlySpanTokenizer<byte>(this.sourceMemory.Span, (byte)'\n'))
        {
            if (header)
            {
                header = false;
            }
            else
            {
                ref var data = ref dataSpan[i++];
                var index = 0;

                foreach (var item in new ReadOnlySpanTokenizer<byte>(line, (byte)','))
                {
                    switch (index++)
                    {
                        case 0:
                            data.VendorId = parser.ParseString(item);
                            break;
                        case 1:
                            if (Utf8Parser.TryParse(item, out byte rateCode, out _))
                            {
                                data.RateCode = rateCode;
                            }
                            break;
                        case 2:
                            if (Utf8Parser.TryParse(item, out byte passengerCount, out _))
                            {
                                data.PassengerCount = passengerCount;
                            }
                            break;
                        case 3:
                            if (Utf8Parser.TryParse(item, out short tripTimeInSecs, out _))
                            {
                                data.TripTimeInSecs = tripTimeInSecs;
                            }
                            break;
                        case 4:
                            if (Utf8Parser.TryParse(item, out float tripDistance, out _))
                            {
                                data.TripDistance = tripDistance;
                            }
                            break;
                        case 5:
                            data.PaymentType = parser.ParseString(item);
                            break;
                        case 6:
                            if (Utf8Parser.TryParse(item, out float fareAmount, out _))
                            {
                                data.FareAmount = fareAmount;
                            }
                            break;
                    }
                }
            }
        }
    }
}
/// <summary>
/// Strategy for turning a span of bytes into a <see cref="string"/>.
/// </summary>
public interface IStringParser
{
/// <summary>
/// Produces a string from the given bytes.
/// </summary>
/// <param name="span">
/// The bytes to convert. NOTE(review): the implementations in this file all treat
/// them as UTF-8 text; the interface itself does not enforce an encoding.
/// </param>
/// <returns>The resulting string.</returns>
string ParseString(ReadOnlySpan<byte> span);
}
/// <summary>
/// An <see cref="IStringParser"/> that decodes the input with <see cref="Encoding.UTF8"/>
/// on every call, without any pooling.
/// </summary>
public readonly struct DefaultParser : IStringParser
{
    /// <summary>
    /// Decodes <paramref name="span"/> as UTF-8 text.
    /// </summary>
    /// <param name="span">The UTF-8 bytes to decode.</param>
    /// <returns>The decoded string.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public string ParseString(ReadOnlySpan<byte> span) => Encoding.UTF8.GetString(span);
}
/// <summary>
/// An <see cref="IStringParser"/> that transcodes the UTF-8 input into a temporary
/// UTF-16 buffer and then asks a <see cref="StringPool"/> for the matching string.
/// </summary>
public readonly struct StringPoolCustomParser : IStringParser
{
    // Largest input (in bytes) transcoded through a stack buffer. Longer inputs use a
    // pooled buffer instead, so an unexpectedly long field cannot exhaust the stack.
    private const int MaxStackallocLength = 256;

    private readonly StringPool pool;

    /// <summary>
    /// Creates a parser backed by the given pool.
    /// </summary>
    /// <param name="pool">The pool used to look up or store the parsed strings.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public StringPoolCustomParser(StringPool pool)
    {
        this.pool = pool;
    }

    /// <summary>
    /// Transcodes <paramref name="span"/> from UTF-8 to UTF-16 and returns the pooled string for it.
    /// </summary>
    /// <param name="span">The UTF-8 bytes to convert.</param>
    /// <returns>The string returned by the pool for the transcoded characters.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public string ParseString(ReadOnlySpan<byte> span)
    {
        if (span.Length > MaxStackallocLength)
        {
            return ParseLargeString(span);
        }

        // The buffer is sized to the byte count: one char per input byte is requested,
        // and only the first "length" chars written by the transcoder are used.
        Span<char> buffer = stackalloc char[span.Length];
        Utf8.ToUtf16(span, buffer, out _, out int length);

        return this.pool.GetOrAdd(buffer.Slice(0, length));
    }

    // Same conversion as ParseString, but with a buffer rented through SpanOwner<char>
    // instead of the stack. Kept out of line so the common path stays small.
    [MethodImpl(MethodImplOptions.NoInlining)]
    private string ParseLargeString(ReadOnlySpan<byte> span)
    {
        using SpanOwner<char> owner = SpanOwner<char>.Allocate(span.Length);

        Span<char> buffer = owner.Span;
        Utf8.ToUtf16(span, buffer, out _, out int length);

        return this.pool.GetOrAdd(buffer.Slice(0, length));
    }
}
/// <summary>
/// An <see cref="IStringParser"/> that passes the raw bytes together with
/// <see cref="Encoding.UTF8"/> straight to a <see cref="StringPool"/>.
/// </summary>
public readonly struct StringPoolEmbeddedParser : IStringParser
{
    private readonly StringPool pool;

    /// <summary>
    /// Creates a parser backed by the given pool.
    /// </summary>
    /// <param name="pool">The pool used to look up or store the parsed strings.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public StringPoolEmbeddedParser(StringPool pool) => this.pool = pool;

    /// <summary>
    /// Returns the pooled string for <paramref name="span"/>, interpreted as UTF-8.
    /// </summary>
    /// <param name="span">The UTF-8 bytes to convert.</param>
    /// <returns>The string returned by the pool.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public string ParseString(ReadOnlySpan<byte> span) => this.pool.GetOrAdd(span, Encoding.UTF8);
}
/// <summary>
/// One parsed row of the .csv file. Fields are public and mutable so the parser can
/// fill a row in place through a reference.
/// </summary>
public struct Data
{
    /// <summary>Text column 0.</summary>
    public string VendorId;

    /// <summary>Numeric column 1.</summary>
    public byte RateCode;

    /// <summary>Numeric column 2.</summary>
    public byte PassengerCount;

    /// <summary>Numeric column 3.</summary>
    public short TripTimeInSecs;

    /// <summary>Numeric column 4.</summary>
    public float TripDistance;

    /// <summary>Text column 5.</summary>
    public string PaymentType;

    /// <summary>Numeric column 6.</summary>
    public float FareAmount;

    /// <summary>
    /// Formats the row as a comma-separated line, in the same column order as the fields.
    /// </summary>
    /// <returns>The comma-separated representation of the row.</returns>
    public override string ToString()
    {
        // Invariant culture keeps '.' as the decimal separator: with the current culture,
        // a comma-decimal locale would make the comma-separated output ambiguous.
        return FormattableString.Invariant($"{VendorId},{RateCode},{PassengerCount},{TripTimeInSecs},{TripDistance},{PaymentType},{FareAmount}");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment