Skip to content

Instantly share code, notes, and snippets.

@altbodhi
Created August 22, 2022 02:55
Show Gist options
  • Save altbodhi/690dfc2c8e70a579c15fe04f72b83bf5 to your computer and use it in GitHub Desktop.
Save altbodhi/690dfc2c8e70a579c15fe04f72b83bf5 to your computer and use it in GitHub Desktop.
Very simple code for sorting big text file in csharp
/*
2_000_000 => 16 sec
5_ => 32
10_ => 1:10
20_ => 2:58
40_ => 13:44 file size = 2.5 Gb
*/
using System.IO;
using System.Diagnostics;
using static System.Console;
int count = args.Length > 0 && int.TryParse(args[0], out var val) ? val : 2_000_000;
var sw = Stopwatch.StartNew();
var alf = get_alf();
if (Directory.Exists(Constants.TMP))
Directory.Delete(Constants.TMP, true);
Directory.CreateDirectory(Constants.TMP);
var comp = new SimpleComparer();
do_it(count);
sw.Stop();
WriteLine(sw.Elapsed);
char[] get_alf()
{
var list = new List<char>();
for (char x = 'A'; x <= 'Z'; x++)
list.Add(x);
for (char x = 'a'; x <= 'z'; x++)
list.Add(x);
list.Add(' ');
return list.ToArray();
}
string gen_str()
{
var range = Enumerable.Range(0, Random.Shared.Next(20, 100));
var chars = range.Select(x => alf[Random.Shared.Next(0, alf.Length)]).ToArray();
return new string(chars);
}
int gen_num() => Random.Shared.Next(0, 10_000);
void do_it(int count)
{
var source = $"L{count}.txt";
File.WriteAllLines(source, Enumerable.Range(0, count).Select(x => $"{gen_num()}.{gen_str()}"));
int num = 0;
foreach (var chunk in File.ReadAllLines(source).Chunk(200_000))
File.WriteAllLines(Path.Combine($".\\{Constants.TMP}\\", $"{(++num):D5}.txt"), chunk.OrderBy(x => x, comp));
SortFiles($"S{count}.txt");
}
void SortFiles(string outputFile)
{
var readers = Directory.GetFiles(Path.Combine($".\\{Constants.TMP}\\")).Select(x => new StreamReader(x));
try
{
var lines = readers.Select(x => new TxtReader
{
Line = x.ReadLine(),
Reader = x
}).OrderBy(x => x.Line, comparer: comp).ToList();
using var writer = new StreamWriter(outputFile);
Reorder();
void Reorder()
{
if (lines.Count == 1)
return;
int i = 0;
while (comp.Compare(lines[i].Line, lines[i + 1].Line) > 0)
{
var t = lines[i];
lines[i] = lines[i + 1];
lines[i + 1] = t;
i++;
if (i + 1 == lines.Count)
return;
}
}
while (lines.Count > 0)
{
var current = lines[0];
writer.WriteLine(current.Line);
if (current.Reader.EndOfStream)
{
lines.Remove(current);
WriteLine($"{DateTime.Now:s} : {lines.Count}");
continue;
}
current.Line = current.Reader.ReadLine();
Reorder();
}
}
finally
{
foreach (var r in readers)
r.Dispose();
}
}
public class TxtReader
{
public string Line;
public StreamReader Reader;
}
class SimpleComparer : IComparer<string>
{
public int Compare(string s1, string s2)
{
var p1 = s1.IndexOf('.');
var v1 = s1.AsSpan(p1 + 2);
var p2 = s2.IndexOf('.');
var v2 = s2.AsSpan(p2 + 2);
var vc =
v1.CompareTo(v2, StringComparison.Ordinal);
if (vc != 0)
return vc;
else
{
var n1 = Int32.Parse(s1.AsSpan(0, p1));
var n2 = Int32.Parse(s2.AsSpan(0, p2));
return n1.CompareTo(n2);
}
}
}
public static class Constants
{
public const string TMP = "tmp";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment