Last active
March 30, 2017 00:28
-
-
Save Flash3001/d50a6b43bba7bc61e3d85734e40dbed9 to your computer and use it in GitHub Desktop.
Deduplicate a list of strings, ignoring diacritics and case
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using BenchmarkDotNet.Attributes; | |
using BenchmarkDotNet.Running; | |
using System; | |
using System.Collections.Generic; | |
using System.Globalization; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
namespace UnicodeDistinct | |
{ | |
public class Program | |
{ | |
#region Comparers | |
/// <summary>
/// Equality comparer that treats two strings as equal when, after trimming
/// surrounding white space, they have the same length and compare equal under
/// <c>Culture</c> while ignoring case and non-spacing (diacritic) marks.
/// The hash code is just the trimmed length, so all strings of the same
/// trimmed length land in the same hash bucket.
/// </summary>
class LengthHash : IEqualityComparer<string>
{
    // Options shared by every comparison made by this comparer.
    const CompareOptions Options = CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreCase;

    /// <summary>Determines whether two strings are equal under this comparer's rules.</summary>
    /// <param name="x">First string; may be null.</param>
    /// <param name="y">Second string; may be null.</param>
    /// <returns>
    /// True when both are null, or when both are non-null, have the same trimmed
    /// length, and compare as equal ignoring case and non-spacing marks.
    /// </returns>
    public bool Equals(string x, string y)
    {
        if (x == null || y == null)
        {
            // Null only equals null.
            return x == y;
        }

        var left = x.Trim();
        var right = y.Trim();

        // The length test keeps Equals consistent with GetHashCode below:
        // strings reported as equal must produce the same hash code.
        if (left.Length != right.Length)
        {
            return false;
        }

        // Compare(...) == 0 is a symmetric whole-string test. A substring search
        // (IndexOf(...) >= 0) is not: per the .NET documentation it can succeed for
        // unequal strings, e.g. when the searched value consists only of ignorable characters.
        return Culture.CompareInfo.Compare(left, right, Options) == 0;
    }

    /// <summary>Returns the trimmed length of <paramref name="obj"/>, or 1 when it is null.</summary>
    /// <param name="obj">String to hash; may be null.</param>
    public int GetHashCode(string obj) => obj?.Trim().Length ?? 1;
}
/// <summary>
/// Equality comparer that treats two strings as equal when, after trimming
/// surrounding white space, they have the same length and compare equal under
/// <c>Culture</c> while ignoring case and non-spacing (diacritic) marks.
/// Every string gets the same hash code, so hash-based collections must fall
/// back to calling <see cref="Equals(string, string)"/> for each candidate.
/// </summary>
class IgnoreHash : IEqualityComparer<string>
{
    // Options shared by every comparison made by this comparer.
    const CompareOptions Options = CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreCase;

    /// <summary>Determines whether two strings are equal under this comparer's rules.</summary>
    /// <param name="x">First string; may be null.</param>
    /// <param name="y">Second string; may be null.</param>
    /// <returns>
    /// True when both are null, or when both are non-null, have the same trimmed
    /// length, and compare as equal ignoring case and non-spacing marks.
    /// </returns>
    public bool Equals(string x, string y)
    {
        if (x == null || y == null)
        {
            // Null only equals null.
            return x == y;
        }

        var left = x.Trim();
        var right = y.Trim();

        // Cheap pre-filter: strings of different trimmed length are never equal here.
        if (left.Length != right.Length)
        {
            return false;
        }

        // Compare(...) == 0 is a symmetric whole-string test. A substring search
        // (IndexOf(...) >= 0) is not: per the .NET documentation it can succeed for
        // unequal strings, e.g. when the searched value consists only of ignorable characters.
        return Culture.CompareInfo.Compare(left, right, Options) == 0;
    }

    /// <summary>Returns the constant 1 for every input, including null.</summary>
    /// <param name="obj">Ignored.</param>
    public int GetHashCode(string obj) => 1;
}
/// <summary>
/// Equality comparer that treats two strings as equal when, after trimming
/// surrounding white space, they have the same length and compare equal under
/// <c>Culture</c> while ignoring case and non-spacing (diacritic) marks.
/// The hash code is computed with the same culture and comparison options, so
/// strings that are equal under this comparer also hash alike.
/// </summary>
class NormalizedHash : IEqualityComparer<string>
{
    // Options shared by Equals and GetHashCode; they must match for the two to agree.
    const CompareOptions Options = CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreCase;

    /// <summary>Determines whether two strings are equal under this comparer's rules.</summary>
    /// <param name="x">First string; may be null.</param>
    /// <param name="y">Second string; may be null.</param>
    /// <returns>
    /// True when both are null, or when both are non-null, have the same trimmed
    /// length, and compare as equal ignoring case and non-spacing marks.
    /// </returns>
    public bool Equals(string x, string y)
    {
        if (x == null || y == null)
        {
            // Null only equals null.
            return x == y;
        }

        var left = x.Trim();
        var right = y.Trim();

        // Cheap pre-filter: strings of different trimmed length are never equal here.
        if (left.Length != right.Length)
        {
            return false;
        }

        // Compare(...) == 0 is a symmetric whole-string test. A substring search
        // (IndexOf(...) >= 0) is not: per the .NET documentation it can succeed for
        // unequal strings, e.g. when the searched value consists only of ignorable characters.
        return Culture.CompareInfo.Compare(left, right, Options) == 0;
    }

    /// <summary>
    /// Returns a hash of the trimmed string that ignores case and non-spacing marks,
    /// or 1 when <paramref name="obj"/> is null.
    /// </summary>
    /// <param name="obj">String to hash; may be null.</param>
    public int GetHashCode(string obj)
    {
        if (obj == null)
        {
            return 1;
        }

        // Hashing the upper-cased text would keep the diacritics, so "hi" and "hí"
        // would hash differently even though Equals reports them as equal.
        // CompareInfo.GetHashCode with the same options keeps both methods in agreement.
        return Culture.CompareInfo.GetHashCode(obj.Trim(), Options);
    }
}
#endregion | |
#region Benchmarks | |
/// <summary>Benchmark: de-duplicates <c>Itens</c> using the default string equality comparer.</summary>
/// <returns>A new list holding the distinct items.</returns>
[Benchmark]
public List<string> RunDefault()
{
    var distinct = Itens.Distinct();
    return distinct.ToList();
}
/// <summary>Benchmark: de-duplicates <c>Itens</c> using a new <c>LengthHash</c> comparer.</summary>
/// <returns>A new list holding the distinct items.</returns>
[Benchmark]
public List<string> RunHashAsLength()
{
    var comparer = new LengthHash();
    return Itens.Distinct(comparer).ToList();
}
/// <summary>Benchmark: de-duplicates <c>Itens</c> using a new <c>IgnoreHash</c> comparer.</summary>
/// <returns>A new list holding the distinct items.</returns>
[Benchmark]
public List<string> RunIgnoreHash()
{
    var comparer = new IgnoreHash();
    return Itens.Distinct(comparer).ToList();
}
/// <summary>Benchmark: de-duplicates <c>Itens</c> using a new <c>NormalizedHash</c> comparer.</summary>
/// <returns>A new list holding the distinct items.</returns>
[Benchmark]
public List<string> RunNormalizedHash()
{
    var comparer = new NormalizedHash();
    return Itens.Distinct(comparer).ToList();
}
/// <summary>
/// Benchmark: trims every non-null item of <c>Itens</c> (null items stay null) and
/// de-duplicates the result with <see cref="StringComparer.InvariantCultureIgnoreCase"/>.
/// </summary>
/// <returns>A new list holding the distinct trimmed items.</returns>
[Benchmark]
public List<string> RunTrimAndCompareWithStringComparer()
{
    var trimmed = Itens.Select(item => item?.Trim());
    var distinct = trimmed.Distinct(StringComparer.InvariantCultureIgnoreCase);
    return distinct.ToList();
}
#endregion | |
// Culture whose CompareInfo the custom comparers use; captured from the current culture at type initialization.
private static CultureInfo Culture = CultureInfo.CurrentCulture;

// Sample input shared by all benchmarks: variants differing in case, diacritics,
// and surrounding white space, plus one empty string.
private static readonly string[] Itens =
{
    "hi",
    " hi ",
    "HI",
    "hí",
    " Hî",
    "hi hi",
    " hí hí ",
    "olá",
    "OLÁ",
    " olá ",
    "",
    "ola",
    "hola",
    " holà ",
    "aaaa",
    "áâàa",
    " aâàa ",
    "áaàa",
    "áâaa ",
    "aaaa ",
    "áâaa",
    "áâaa",
};
/// <summary>
/// Entry point. Runs the BenchmarkDotNet benchmarks declared on <see cref="Program"/>,
/// then prints one line per strategy with the number of distinct items it produced and
/// the items themselves, and finally blocks until a line is read from standard input.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    BenchmarkRunner.Run<Program>();

    var p = new Program();

    // Each strategy is evaluated once and its result reused for both the count and the listing.
    PrintResult("Default", p.RunDefault());
    PrintResult("HashAsLength", p.RunHashAsLength());
    PrintResult("IgnoreHash", p.RunIgnoreHash());
    PrintResult("NormalizedHash", p.RunNormalizedHash());
    PrintResult("RunTrimAndCompareWithStringComparer", p.RunTrimAndCompareWithStringComparer());

    Console.ReadLine();
}

/// <summary>Writes one summary line: the item count, the label, and the comma-separated items.</summary>
/// <param name="label">Name of the strategy that produced <paramref name="items"/>.</param>
/// <param name="items">Distinct items returned by the strategy.</param>
static void PrintResult(string label, List<string> items)
{
    Console.WriteLine($"{items.Count} {label}: {string.Join(", ", items)}");
}
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment