-
-
Save Sergio0694/c51cb027e6815d7b592484eebe9e3685 to your computer and use it in GitHub Desktop.
A benchmark for StringPool when parsing a large .csv file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Buffers.Text; | |
using System.IO; | |
using System.Runtime.CompilerServices; | |
using System.Text; | |
using System.Text.Unicode; | |
using BenchmarkDotNet.Attributes; | |
using BenchmarkDotNet.Engines; | |
using BenchmarkDotNet.Running; | |
using Microsoft.Toolkit.HighPerformance; | |
using Microsoft.Toolkit.HighPerformance.Buffers; | |
using Microsoft.Toolkit.HighPerformance.Enumerables; | |
// Applies SkipLocalsInit to the whole module, so locals (including stackalloc buffers)
// are not zero-initialized before use.
[module: SkipLocalsInit]

// Program entry point: run every benchmark declared on ParsingBenchmark.
// The returned summary is not used.
_ = BenchmarkRunner.Run<ParsingBenchmark>();
/// <summary>
/// Benchmarks three ways of producing the string columns of a UTF-8 encoded .csv file:
/// a plain <see cref="Encoding.UTF8"/> decode, a stack-buffer transcode followed by a
/// <see cref="StringPool"/> lookup, and the <see cref="StringPool"/> overload that takes
/// the UTF-8 bytes and an encoding directly.
/// </summary>
[MemoryDiagnoser]
[SimpleJob(RunStrategy.Monitoring)]
public class ParsingBenchmark
{
    // Raw bytes of the whole .csv file.
    private MemoryOwner<byte> sourceMemory;

    // One slot per data row; sized from the number of '\n' bytes in the file.
    private MemoryOwner<Data> dataMemory;

    // Each pooled benchmark gets its own pool so they do not share cached strings.
    private readonly StringPool stringPool1 = new();
    private readonly StringPool stringPool2 = new();

    /// <summary>
    /// Loads the whole input file into memory and allocates the destination rows.
    /// </summary>
    /// <exception cref="OverflowException">The file length does not fit in an <see cref="int"/>.</exception>
    /// <exception cref="EndOfStreamException">The stream ended before the reported length was read.</exception>
    [GlobalSetup]
    public void Setup()
    {
        // Source: https://github.com/dotnet/machinelearning/blob/master/test/data/taxi-fare-train.csv
        // Saved with UTF8 encoding
        using Stream stream = File.OpenRead("taxi-fare-train-utf8.csv");

        // checked: fail loudly instead of wrapping to a bogus size for files over int.MaxValue bytes.
        this.sourceMemory = MemoryOwner<byte>.Allocate(checked((int)stream.Length));

        // Stream.Read may return fewer bytes than requested, so keep reading until the
        // buffer is full rather than trusting a single call.
        Span<byte> remaining = this.sourceMemory.Span;

        while (!remaining.IsEmpty)
        {
            int bytesRead = stream.Read(remaining);

            if (bytesRead == 0)
            {
                throw new EndOfStreamException("The input file ended before its reported length was read.");
            }

            remaining = remaining.Slice(bytesRead);
        }

        // The tokenizer yields (newlines + 1) lines and the first one is the header,
        // so the number of '\n' bytes is the number of data rows.
        int rowCount = this.sourceMemory.Span.Count((byte)'\n');

        this.dataMemory = MemoryOwner<Data>.Allocate(rowCount, AllocationMode.Clear);
    }

    /// <summary>
    /// Releases the buffers allocated by <see cref="Setup"/>.
    /// </summary>
    [GlobalCleanup]
    public void Cleanup()
    {
        this.sourceMemory.Dispose();

        // Rows are cleared before disposal, presumably so the released buffer does not
        // keep the parsed strings referenced.
        this.dataMemory.Span.Clear();
        this.dataMemory.Dispose();
    }

    /// <summary>
    /// Resets both string pools before every iteration.
    /// </summary>
    [IterationSetup]
    public void IterationSetup()
    {
        this.stringPool1.Reset();
        this.stringPool2.Reset();
    }

    /// <summary>
    /// Baseline: every string column is decoded with <see cref="DefaultParser"/>.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void Default()
    {
        var parser = new DefaultParser();

        Parse(ref parser);
    }

    /// <summary>
    /// String columns are parsed with <see cref="StringPoolCustomParser"/>.
    /// </summary>
    [Benchmark]
    public void StackallocGetOrAdd()
    {
        var parser = new StringPoolCustomParser(this.stringPool1);

        Parse(ref parser);
    }

    /// <summary>
    /// String columns are parsed with <see cref="StringPoolEmbeddedParser"/>.
    /// </summary>
    [Benchmark]
    public void EmbeddedGetOrAdd()
    {
        var parser = new StringPoolEmbeddedParser(this.stringPool2);

        Parse(ref parser);
    }

    /// <summary>
    /// Splits the source buffer into lines and comma-separated fields, and stores the
    /// parsed values of each data line into the next row of the destination buffer.
    /// </summary>
    /// <typeparam name="T">The parser used for the two string columns.</typeparam>
    /// <param name="parser">The parser instance.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private void Parse<T>(ref T parser)
        where T : struct, IStringParser
    {
        bool isHeader = true;
        int rowIndex = 0;
        Span<Data> rows = this.dataMemory.Span;

        foreach (var line in new ReadOnlySpanTokenizer<byte>(this.sourceMemory.Span, (byte)'\n'))
        {
            // The first line holds the column names, not data.
            if (isHeader)
            {
                isHeader = false;

                continue;
            }

            ref Data data = ref rows[rowIndex++];
            int column = 0;

            foreach (var item in new ReadOnlySpanTokenizer<byte>(line, (byte)','))
            {
                // Numeric fields that fail to parse leave the row's current value untouched.
                // Columns past the seventh are ignored.
                switch (column++)
                {
                    case 0:
                        data.VendorId = parser.ParseString(item);
                        break;
                    case 1:
                        if (Utf8Parser.TryParse(item, out byte rateCode, out _))
                        {
                            data.RateCode = rateCode;
                        }
                        break;
                    case 2:
                        if (Utf8Parser.TryParse(item, out byte passengerCount, out _))
                        {
                            data.PassengerCount = passengerCount;
                        }
                        break;
                    case 3:
                        if (Utf8Parser.TryParse(item, out short tripTimeInSecs, out _))
                        {
                            data.TripTimeInSecs = tripTimeInSecs;
                        }
                        break;
                    case 4:
                        if (Utf8Parser.TryParse(item, out float tripDistance, out _))
                        {
                            data.TripDistance = tripDistance;
                        }
                        break;
                    case 5:
                        data.PaymentType = parser.ParseString(item);
                        break;
                    case 6:
                        if (Utf8Parser.TryParse(item, out float fareAmount, out _))
                        {
                            data.FareAmount = fareAmount;
                        }
                        break;
                }
            }
        }
    }
}
/// <summary>
/// Converts a field of a .csv line, given as raw bytes, into a <see cref="string"/>.
/// </summary>
public interface IStringParser
{
    /// <summary>
    /// Produces the string for the given field.
    /// </summary>
    /// <param name="span">The bytes of a single field.</param>
    /// <returns>The resulting <see cref="string"/>.</returns>
    string ParseString(ReadOnlySpan<byte> span);
}
/// <summary>
/// An <see cref="IStringParser"/> that decodes every field with <see cref="Encoding.UTF8"/>.
/// </summary>
public readonly struct DefaultParser : IStringParser
{
    /// <summary>
    /// Decodes the given UTF-8 bytes into a <see cref="string"/>.
    /// </summary>
    /// <param name="span">The UTF-8 bytes of a single field.</param>
    /// <returns>The decoded text.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public string ParseString(ReadOnlySpan<byte> span) => Encoding.UTF8.GetString(span);
}
/// <summary>
/// An <see cref="IStringParser"/> that transcodes the UTF-8 field into a temporary UTF-16
/// buffer and then looks the characters up in a <see cref="StringPool"/>.
/// </summary>
public readonly struct StringPoolCustomParser : IStringParser
{
    // Largest field (in bytes) transcoded on the stack; longer fields use a rented array
    // so that an unexpectedly long field cannot overflow the stack.
    private const int MaxStackallocLength = 256;

    private readonly StringPool pool;

    /// <summary>
    /// Creates a parser that resolves strings through the given pool.
    /// </summary>
    /// <param name="pool">The pool used by <see cref="ParseString"/>.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public StringPoolCustomParser(StringPool pool)
    {
        this.pool = pool;
    }

    /// <summary>
    /// Transcodes the UTF-8 bytes to UTF-16 and returns the matching pooled string.
    /// </summary>
    /// <param name="span">The UTF-8 bytes of a single field.</param>
    /// <returns>The string returned by the pool for the transcoded characters.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public string ParseString(ReadOnlySpan<byte> span)
    {
        if (span.Length > MaxStackallocLength)
        {
            return ParseLongString(span);
        }

        // UTF-16 never needs more code units than the UTF-8 input has bytes,
        // so a buffer of span.Length chars is always large enough.
        Span<char> buffer = stackalloc char[span.Length];

        Utf8.ToUtf16(span, buffer, out _, out int length);

        return this.pool.GetOrAdd(buffer.Slice(0, length));
    }

    // Slow path for fields too long for the stack: same transcode, into a rented array.
    [MethodImpl(MethodImplOptions.NoInlining)]
    private string ParseLongString(ReadOnlySpan<byte> span)
    {
        char[] rented = System.Buffers.ArrayPool<char>.Shared.Rent(span.Length);

        try
        {
            Utf8.ToUtf16(span, rented, out _, out int length);

            return this.pool.GetOrAdd(new ReadOnlySpan<char>(rented, 0, length));
        }
        finally
        {
            System.Buffers.ArrayPool<char>.Shared.Return(rented);
        }
    }
}
/// <summary>
/// An <see cref="IStringParser"/> that hands the UTF-8 bytes and the encoding straight to
/// <see cref="StringPool"/>, without an intermediate buffer of its own.
/// </summary>
public readonly struct StringPoolEmbeddedParser : IStringParser
{
    private readonly StringPool pool;

    /// <summary>
    /// Creates a parser that resolves strings through the given pool.
    /// </summary>
    /// <param name="pool">The pool used by <see cref="ParseString"/>.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public StringPoolEmbeddedParser(StringPool pool) => this.pool = pool;

    /// <summary>
    /// Returns the pooled string for the given UTF-8 bytes.
    /// </summary>
    /// <param name="span">The UTF-8 bytes of a single field.</param>
    /// <returns>The string returned by the pool.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public string ParseString(ReadOnlySpan<byte> span) => this.pool.GetOrAdd(span, Encoding.UTF8);
}
/// <summary>
/// One parsed row of the input .csv file. Fields are declared in column order.
/// </summary>
public struct Data
{
    /// <summary>Column 0.</summary>
    public string VendorId;

    /// <summary>Column 1.</summary>
    public byte RateCode;

    /// <summary>Column 2.</summary>
    public byte PassengerCount;

    /// <summary>Column 3.</summary>
    public short TripTimeInSecs;

    /// <summary>Column 4.</summary>
    public float TripDistance;

    /// <summary>Column 5.</summary>
    public string PaymentType;

    /// <summary>Column 6.</summary>
    public float FareAmount;

    /// <summary>
    /// Formats the row as a comma-separated line, in column order.
    /// </summary>
    /// <returns>The comma-separated representation of this row.</returns>
    public override string ToString()
    {
        // Invariant culture: the current culture could format the floats with a ','
        // decimal separator, which would be ambiguous in comma-separated output.
        return FormattableString.Invariant(
            $"{VendorId},{RateCode},{PassengerCount},{TripTimeInSecs},{TripDistance},{PaymentType},{FareAmount}");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment