Skip to content

Instantly share code, notes, and snippets.

@ChuckSavage
Last active January 27, 2022 19:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ChuckSavage/e1c61839300abd2d90873b01da9d7575 to your computer and use it in GitHub Desktop.
Save ChuckSavage/e1c61839300abd2d90873b01da9d7575 to your computer and use it in GitHub Desktop.
Natural string comparer for files too
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Numerics;
using System.Text;
using System.Text.RegularExpressions;
namespace SeaRisenLib2.Text;
public class NaturalStringComparer : IComparer<string>
{
// original: @"(?<=\D)(?=\d)|(?<=\d)(?=\D)"
/*
* Deciphering the regex pattern - (?<=_)(?=_)(?<=\W)|(?=\W)|(?<=\D)(?=\d)|(?<=\d)(?=\D)
* since _ is a part of D (letters), check for it first
* (?<=\W) - if the previous character is a non-word *-+=)
* (?=\W) - or if the next character is a non-word
* (?<=\D)(?=\d) - or if the previous character is a non-digit ABC, or the next character is a digit 123
* (?<=\d)(?=\D) - or if the previous character is a digit 123, or the next character is not a digit ABC
*/
private static readonly Regex _re = new Regex(@"(?<=_)|(?=_)|(?<=\W)|(?=\W)|(?<=\D)(?=\d)|(?<=\d)(?=\D)", RegexOptions.Compiled);
private static readonly Regex _reAlnum = new Regex(@"[^\w]|[_]", RegexOptions.Compiled);
private NaturalStringComparer() { }
public static NaturalStringComparer Instance { get; } = new();
public int Compare(string x, string y)
{
return Compare(x, y, false);
}
/// <summary>
/// Compare two strings, optionally alphanumeric compare only. Returns a value indicating
/// one is less than, greather or equal to the other.
/// </summary>
/// <param name="x"></param>
/// <param name="y"></param>
/// <param name="alphanumericOnly"></param>
/// <returns></returns>
public int Compare(string x, string y, bool alphanumericOnly)
{
if (string.IsNullOrWhiteSpace(x))
if (string.IsNullOrWhiteSpace(y))
return 0; // both null
else
return -1; // x is null but y contains a value
else if (string.IsNullOrWhiteSpace(y)) // if y is null, but x contains a value
return 1;
int r = string.Compare(x, y, StringComparison.CurrentCultureIgnoreCase);
if (r == 0) return 0; // they're equal, no need to fine tune the comparison
var fX = FileParts(x);
var fY = FileParts(y);
if (fX.extension == "" || fY.extension == "")
return Compare2(x, y, alphanumericOnly);
r = string.Compare(fX.name, fY.name, StringComparison.CurrentCultureIgnoreCase);
// if equal, compare extensions and be done with it
if (r != 0)
{
r = Compare2(fX.name, fY.name, alphanumericOnly);
if (r != 0) return r;
}
return string.Compare(fX.extension, fY.extension, StringComparison.CurrentCultureIgnoreCase);
// compare by enumerating the parts, instead of (the old vs of) getting the full part list for each string on each compare
static int Compare2(string x, string y, bool alphanumericOnly)
{
IEnumerable<string> px = Parse(x), py = Parse(y);
IEnumerator<string> ex = px.GetEnumerator(), ey = py.GetEnumerator();
bool bx = ex.MoveNext(), by = ey.MoveNext();
int r;
// first compare only alphanumeric values
try
{
char cx, cy;
while (bx && by)
{
redocx:
cx = ex.Current[0];
if (bx && !char.IsLetterOrDigit(cx))
{
bx = ex.MoveNext();
goto redocx;
}
redocy:
cy = ey.Current[0];
if (by && !char.IsLetterOrDigit(cy))
{
by = ey.MoveNext();
goto redocy;
}
if (!bx || !by) break;
if (char.IsLetter(cx) || char.IsLetter(cy))
{
r = string.Compare(ex.Current, ey.Current, StringComparison.CurrentCultureIgnoreCase);
if (r != 0) return r;
}
// both should be numbers?
else if (BigInteger.TryParse(ex.Current, out BigInteger ix) && BigInteger.TryParse(ey.Current, out BigInteger iy))
{
r = ix.CompareTo(iy);
if (r != 0) return r;
r = CompareLength(ex.Current, ey.Current);
if (r != 0) return r;
}
bx = ex.MoveNext();
by = ey.MoveNext();
}
}
finally
{
ex.Dispose();
ey.Dispose();
}
if (bx) return 1;
if (by) return -1;
if (alphanumericOnly) return 0; // both were equal
// if bx and by are both false, then the strings without the separators are equal, now
// compare the non-alphanumeric values too
ex = px.GetEnumerator(); // ex.Reset() doesn't work, so (dispose above and) get the enumerator again
ey = py.GetEnumerator();
bx = ex.MoveNext();
by = ey.MoveNext();
try
{
while (bx && by)
{
if (BigInteger.TryParse(ex.Current, out BigInteger ix) && BigInteger.TryParse(ey.Current, out BigInteger iy))
{
r = ix.CompareTo(iy);
if (r != 0) return r;
r = CompareLength(ex.Current, ey.Current);
}
else
r = string.Compare(ex.Current, ey.Current, StringComparison.CurrentCultureIgnoreCase);
if (r != 0) return r;
bx = ex.MoveNext();
by = ey.MoveNext();
}
}
finally
{
ex.Dispose();
ey.Dispose();
}
if (bx) return 1;
if (by) return -1;
return 0;
// enumerate the string, returning only what we need to get the comparison
IEnumerable<string> Parse(string IN)
{
int t = -1; // 0 word, 1 number, 2 whitespace or other non-alphanumeric value
int s = 0; // start of part
for (int i = 0; i < IN.Length; i++)
{
switch (t)
{
case 0:
if (char.IsLetter(IN[i]))
continue;
break;
case 1:
if (char.IsDigit(IN[i]))
continue;
break;
case 2:
if (!char.IsLetterOrDigit(IN[i])) // whitespace or other non-alphanumeric value
continue;
break;
default:
if (char.IsLetter(IN[i])) t = 0;
else if (char.IsDigit(IN[i])) t = 1;
else t = 2;
continue;
}
yield return IN.Substring(s, i - s);
s = i--; // i needs to point to previous char, after updating s
t = -1;
}
if (s < IN.Length)
yield return IN.Substring(s, IN.Length - s);
}
}
}
/// <summary>
/// Naturally compare two values, by default making it an alphanumeric compare.
/// </summary>
/// <param name="IN"></param>
/// <param name="pattern"></param>
/// <param name="alphanumericOnly"></param>
/// <returns></returns>
public static bool Contains(string IN, string pattern, bool alphanumericOnly = true)
{
if (IN.Length < pattern.Length)
return false;
int length = IN.Length - pattern.Length;
for (int i = 0; i <= length; i++)
{
if (0 == Instance.Compare(IN.Substring(i, pattern.Length), pattern, alphanumericOnly))
return true;
}
return false;
}
/// <summary>
/// Naturally compare two values, by default making it an alphanumeric compare.
/// </summary>
/// <param name="IN"></param>
/// <param name="pattern"></param>
/// <param name="alphanumericOnly"></param>
/// <returns></returns>
public static bool EndsWith(string IN, string pattern, bool alphanumericOnly = true)
{
if (IN.Length < pattern.Length)
return false;
return 0 == Instance.Compare(IN.SubstringSafe(IN.Length - pattern.Length, pattern.Length), pattern, alphanumericOnly);
}
/// <summary>
/// Naturally compare two values, by default making it an alphanumeric compare.
/// </summary>
/// <param name="IN"></param>
/// <param name="pattern"></param>
/// <param name="alphanumericOnly"></param>
/// <returns></returns>
public static bool StartsWith(string IN, string pattern, bool alphanumericOnly = true)
{
if (IN.Length < pattern.Length)
return false;
return 0 == Instance.Compare(IN.SubstringSafe(0, pattern.Length), pattern, alphanumericOnly);
}
// based off of stackoverflow version
public int Compare_OriginalSlowerVersion(string x, string y)
{
if (string.IsNullOrWhiteSpace(x))
if (string.IsNullOrWhiteSpace(y))
return 0; // both null
else
return -1; // x is null but y contains a value
if (string.IsNullOrWhiteSpace(y)) return 1; // y is null but x contains a value
var fX = FileParts(x);
var fY = FileParts(y);
if (fX.extension == "" || fY.extension == "")
return Compare2(x, y);
int r = Compare2(fX.name, fY.name);
if (r != 0) return r;
return Compare2(fX.extension, fY.extension);
static int Compare2(string x, string y, bool alphanum = true)
{
string xOrig = x, yOrig = y;
// compare first on only alphanumeric values, then if no differences
// compare the delimiters
//if (Debugger.IsAttached && new[] { x, y }.Any(s => s.Contains("2-4RedHead+286029")))
// Debugger.Break();
x = x.ToUpperInvariant();
y = y.ToUpperInvariant();
if (alphanum)
{
AlphaNumericOnly(ref x);
AlphaNumericOnly(ref y);
}
var cl = CompareSubstring(x, y);
if (cl.success && cl.result != 0) return cl.result;
// if substring values are equal, dig deeper
var a = SplitIntoParts(x, alphanum);
var b = SplitIntoParts(y, alphanum);
int i = 0;
while (i < a.Length && i < b.Length)
{
var (isInt, result) = CompareValue(a[i], b[i]);
if (isInt && result == 0)
{
// if is int, compare say 001 vs 1
result = CompareLength(a[i], b[i]);
}
if (result != 0) return result;
// if equal in value, dig deeper
++i;
}
i = 0;
while (i < a.Length && i < b.Length)
{
cl = CompareSubstring(a[i], b[i]);
if (cl.success && cl.result != 0) return cl.result;
// if parts are equal, check deeper parts
++i;
}
// re-run first compare, returning whatever result we get
var r = CompareSubstring(x, y).result;
if (!alphanum || r != 0) return r;
return Compare2(xOrig, yOrig, false);
}
static (bool success, int result) CompareSubstring(string x, string y)
{
int result = string.Compare(x, 0, y, 0, Math.Min(x.Length, y.Length));
bool isInt = false;
if (result != 0)
(isInt, result) = CompareValue(x, y);
if (isInt)
if (result == 0)
return (true, CompareLength(x, y));
else
return (true, result);
return (false, CompareLength(x, y));
}
static (bool isInt, int result) CompareValue(string x, string y)
{
if (BigInteger.TryParse(x, out BigInteger a) && BigInteger.TryParse(y, out BigInteger b))
return (true, a.CompareTo(b));
return (false, x.CompareTo(y));
}
static void AlphaNumericOnly(ref string IN)
{
IN = _reAlnum.Replace(IN, ",");
IN = Regex.Replace(IN, ",+", ",");
IN = IN.Trim(',');
}
static string[] SplitIntoParts(string IN, bool alphanum)
{
return _re.Split(IN)
// don't sort commas with other values (or themselves)
.Where(s => !alphanum || s != ",")
.ToArray();
}
}
static int CompareLength(string x, string y)
{
if (x.Length != y.Length)
return x.Length < y.Length ? -1 : 1;
return 0;
}
/// <summary>
/// Get the fullname, name and extension of the path as separate parts.
/// </summary>
/// <param name="path"></param>
/// <returns></returns>
static (string name, string extension) FileParts(string path)
{
string extension = Path.GetExtension(path);
if (string.IsNullOrEmpty(extension))
return (path, extension);
string name = Path.GetFileNameWithoutExtension(path);
string fullname = Path.Combine(Path.GetDirectoryName(path), name);
return (fullname, extension);
}
}
========== UNIT TESTS go in separate file within a test project ==============
using System;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Linq;
using SeaRisenLib2.Text;
namespace TestProject
{
[TestClass]
public class NaturalStringComparerTest
{
[TestMethod]
public void TestMethod1()
{
string[] list = { "01", "1", "2", "001", "002" };
string[] expected = { "1", "01", "001", "2", "002" };
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray();
CollectionAssert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod2()
{
string[] list = {
"01.jpg",
"1.png",
"2.png",
"001.png",
"002.jpg",
"01.png",
"2.jpg"
};
string[] expected = {
"1.png",
"01.jpg",
"01.png",
"001.png",
"2.jpg",
"2.png",
"002.jpg"
};
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray();
CollectionAssert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod3()
{
string[] list = {
"image 12.jpg",
"image-1.jpg",
"image_11.jpg",
"image+2.jpg",
"image,20.jpg",
};
string[] expected = {
"image-1.jpg",
"image+2.jpg",
"image_11.jpg",
"image 12.jpg",
"image,20.jpg",
};
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray();
CollectionAssert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod4()
{
string[] list = {
"0_c1c9c_d3dd2e76_orig 2.psd",
"0_c1c76_ebedbd57_orig.jpg",
"0_c1c77_d2c42e68_orig.jpg",
"0_c1c9c_d3dd2e76_orig.jpg",
"0_c1c85_1840d3b_orig.jpg",
"0_c1c92_a4060a6d_orig.jpg",
"0_c1c94_68939054_orig.jpg",
"0_c1c97_9a75dd3f_orig.jpg",
"0_c1ca1_c559fedc_orig.jpg",
"0_c79a4_90493d35_orig.jpg",
"0_9ddd7_bc745e8c_XXXL.JPG",
"0_9ddd8_7491ab8c_XXL.JPG",
"0_9ddd8_7491ab8c_XXL-lines-scale-6_00x-gigapixel (1).JPG",
"0_9ddd8_7491ab8c_XXL-lines-scale-6_00x-gigapixel.JPG",
"0_c1c8a_f69af695_orig.jpg",
};
string[] expected = {
"0_9ddd7_bc745e8c_XXXL.JPG",
"0_9ddd8_7491ab8c_XXL.JPG",
"0_9ddd8_7491ab8c_XXL-lines-scale-6_00x-gigapixel.JPG",
"0_9ddd8_7491ab8c_XXL-lines-scale-6_00x-gigapixel (1).JPG",
"0_c1c8a_f69af695_orig.jpg",
"0_c1c9c_d3dd2e76_orig.jpg",
"0_c1c9c_d3dd2e76_orig 2.psd",
"0_c1c76_ebedbd57_orig.jpg",
"0_c1c77_d2c42e68_orig.jpg",
"0_c1c85_1840d3b_orig.jpg",
"0_c1c92_a4060a6d_orig.jpg",
"0_c1c94_68939054_orig.jpg",
"0_c1c97_9a75dd3f_orig.jpg",
"0_c1ca1_c559fedc_orig.jpg",
"0_c79a4_90493d35_orig.jpg",
};
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray();
CollectionAssert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod5()
{
string[] list = {
"1+quite+artsy+288829.webp",
"1+quite+artsy+288829_result.jpg",
"1+paint+28229.webp",
"1-lion-01.webp",
"1-lion-11.webp",
"1-lion-36-1160x2606.webp",
"1-lion-41.webp",
"01.jpg",
"110208043338528059.jpg",
"100122859906.jpg",
"1303190918328716010989741.jpg",
"1303190918348716010989742.jpg",
"12154530903_f5cfcdab52_b.jpg",
"2-4RedHead+286029.webp",
"2+beach+fun+281529.webp",
};
string[] expected = {
"1-lion-01.webp",
"1-lion-11.webp",
"1-lion-36-1160x2606.webp",
"1-lion-41.webp",
"1+paint+28229.webp",
"1+quite+artsy+288829.webp",
"1+quite+artsy+288829_result.jpg",
"01.jpg",
"2-4RedHead+286029.webp",
"2+beach+fun+281529.webp",
"12154530903_f5cfcdab52_b.jpg",
"100122859906.jpg",
"110208043338528059.jpg",
"1303190918328716010989741.jpg",
"1303190918348716010989742.jpg",
};
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray();
CollectionAssert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod6()
{
string[] list = {
"1+paint_28228.webp",
"2-4RedHead+286029.webp",
"2+beach+fun+281529.webp",
"1-lion-01.webp",
"1-paint+28229.webp",
};
string[] expected = {
"1-lion-01.webp",
"1+paint_28228.webp",
"1-paint+28229.webp",
"2-4RedHead+286029.webp",
"2+beach+fun+281529.webp",
};
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray();
CollectionAssert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod7()
{
string[] list = {
"0003-2af55d6.jpg",
"3_67.jpg",
"03.jpg",
"3+ten.jpg",
};
string[] expected = {
"3_67.jpg",
"3+ten.jpg",
"03.jpg",
"0003-2af55d6.jpg",
};
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray();
CollectionAssert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod8()
{
string[] list = {
"7_",
"7",
"_7",
};
string[] expected = {
"_7",
"7",
"7_",
};
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray();
CollectionAssert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod9()
{
string[] list = {
@"c:\path\to\file\42.jpg",
"42.jpg",
};
int expected = 1;
int actual = NaturalStringComparer.Instance.Compare(list[0], list[1]);
Assert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod10()
{
string[] list = {
@"c:\path\to\file\42.jpg",
@"d:\path\to\file\42.jpg",
};
int expected = -1;
int actual = NaturalStringComparer.Instance.Compare(list[0], list[1]);
Assert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod11()
{
string[] list = {
@"contains",
@"contains",
};
bool expected = true;
bool actual = NaturalStringComparer.Contains(list[0], list[1]);
Assert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod12()
{
string[] list = {
@"startswith",
@"startswith",
};
bool expected = true;
bool actual = NaturalStringComparer.StartsWith(list[0], list[1]);
Assert.AreEqual(expected, actual);
}
[TestMethod]
public void TestMethod13()
{
string[] list = {
@"endswith",
@"endswith",
};
bool expected = true;
bool actual = NaturalStringComparer.EndsWith(list[0], list[1]);
Assert.AreEqual(expected, actual);
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment