Skip to content

Instantly share code, notes, and snippets.

@AdibSurani
Created February 7, 2020 21:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AdibSurani/cab4aa7fb16a4924e6f174eb0ea021a7 to your computer and use it in GitHub Desktop.
Save AdibSurani/cab4aa7fb16a4924e6f174eb0ea021a7 to your computer and use it in GitHub Desktop.
var bytes = File.ReadAllBytes("vt1.file1.proper.bin");
$"Uncompressed size is {bytes.Length}".Dump();
var compressed = Compress(bytes);
$"Compressed size is {compressed.Length} ({100.0 * compressed.Length / bytes.Length:0.0}%)".Dump();
$"Round-trip decompression succeeded: {Decompress(compressed).SequenceEqual(bytes)}".Dump();
// run with LINQPad 6
///////////////////////
byte[] Compress(byte[] input)
{
// prelude
if (input.Length < 2) throw new ArgumentException("Input needs to be at least 2 bytes");
var thresholds = new[]{4, 7, 14, 21, 28}.Select(n => (1 << n) - 1).ToList();
var tail2 = new int[input.Length];
var tail3 = new int[input.Length];
tail2[input.Length - 1] = tail3[input.Length - 1] = tail3[input.Length - 2] = -1;
var nodes = new (int score, bool isMatchStage, int currentRun, int parent)[input.Length + 1];
nodes[0].isMatchStage = true;
// populate two-byte and three-byte history
var head2 = new int[1 << 16];
for (int i = 0; i < head2.Length; i++) head2[i] = -1;
var head3 = new int[1 << 24];
for (int i = 0; i < head3.Length; i++) head3[i] = -1;
for (int i = 0; i < input.Length - 1; i++)
{
var hash = input[i] | (input[i + 1] << 8);
(tail2[i], head2[hash]) = (head2[hash], i);
if (i < input.Length - 2)
{
hash |= input[i + 2] << 16;
(tail3[i], head3[hash]) = (head3[hash], i);
}
}
// find all possible matches
var matches = Enumerable.Range(0, input.Length).AsParallel().AsOrdered().Select(pos => FindMatches(pos).ToList()).ToList();
IEnumerable<(int length, int offset)> FindMatches(int pos)
{
if (tail2[pos] != tail3[pos]) yield return (2, pos - tail2[pos]);
for (int i = tail3[pos], bestLength = 2; i >= 0; i = tail3[i])
{
if (input[i + bestLength] != input[pos + bestLength]) continue;
int j = 2;
while (++j < input.Length - pos && input[i + j] == input[pos + j]) ;
if (j > bestLength)
{
bestLength = j;
yield return (j, pos - i);
if (pos + j >= input.Length) yield break;
}
}
}
// auxiliary helper functions for finding an optimal path
int BitLength(int n)
{
int c = 1;
while ((n >>= 1) != 0) c++;
return c;
}
int GetWeight(int length, int offset)
{
var lengthExtra = length <= 16 ? 0 : (BitLength(length - 1) + 6) / 7;
var offsetExtra = (BitLength(offset - 1) + 3) / 7;
return 1 + lengthExtra + offsetExtra;
}
// psuedo-optimal path finder, using a 4-variable heuristic
for (int i = 0; i < input.Length; i++)
{
var curr = nodes[i];
void Update(int length, int weight, bool matchStage)
{
var newNode = (curr.score + weight, matchStage, curr.isMatchStage == matchStage ? curr.currentRun + 1 : 1, i);
ref var node = ref nodes[i + length];
if (node.score == 0 || newNode.CompareTo(node) < 0) node = newNode;
}
var (arr, atThreshold) = (matches[i], thresholds.Contains(curr.currentRun));
Update(1, curr.isMatchStage || atThreshold ? 2 : 1, false);
if (arr.Any())
for (int j = arr.Last().length, k = arr.Count - 1; j >= 2; j--, j = j > 32 ? j / 2 : j)
{
while (k > 0 && arr[k - 1].length >= j) k--;
Update(j, GetWeight(j, arr[k].offset) + (curr.isMatchStage && atThreshold ? 1 : 0), true);
}
}
// more auxiliary helper functions, this time for constructing the byte sequence
byte[] VLCBytes(int n)
{
var tmp = new Stack<byte>();
tmp.Push((byte)(n << 1 | 1));
n >>= 7;
while (n != 0)
{
tmp.Push((byte)(n << 1));
n >>= 7;
}
return tmp.ToArray();
}
byte[] MakeLiteralCopies(int literal, int copies)
{
int litNibble = 0, copNibble = 0;
byte[] litExtra = new byte[0], copExtra = new byte[0];
if (literal < 16) litNibble = literal;
else litExtra = VLCBytes(literal);
if (copies < 16) copNibble = copies;
else copExtra = VLCBytes(copies);
if (copies == 0) copExtra = VLCBytes(copies); // special case where last byte is literal
return new[]{(byte)(litNibble | (copNibble << 4))}.Concat(litExtra).Concat(copExtra).ToArray();
}
byte[] MakeLengthOffset(int length, int offset)
{
(length, offset) = (length - 1, offset - 1);
int lenNibble = 0, offNibble = 0;
byte[] lenExtra = new byte[0], offExtra = new byte[0];
if (length < 16) lenNibble = length;
else lenExtra = VLCBytes(length);
if (offset < 8) offNibble = offset << 1 | 1;
else
{
var totalBits = BitLength(offset);
var extraSize = (totalBits + 3) / 7;
var bitsToShift = 7 * extraSize;
var high = offset >> bitsToShift;
var low = offset & ((1 << bitsToShift) - 1);
offNibble = high << 1;
offExtra = VLCBytes(low | 127 << bitsToShift).Skip(1).ToArray();
}
return new[]{(byte)(offNibble | (lenNibble << 4))}.Concat(offExtra).Concat(lenExtra).ToArray();
}
// actually constructing the byte sequence
var stack = new Stack<byte[]>();
for (int n = input.Length, matchRun = 0; n > 0;)
{
var node = nodes[n];
if (node.isMatchStage)
{
var length = n - node.parent;
n = node.parent;
stack.Push(MakeLengthOffset(length, matches[n].First(x => x.length >= length).offset));
matchRun++;
}
else
{
n -= node.currentRun;
stack.Push(input[n..(n + node.currentRun)]);
stack.Push(MakeLiteralCopies(node.currentRun, matchRun));
matchRun = 0;
}
}
return new[]{input.Length, 25, 0}.Select(VLCBytes).Concat(stack).SelectMany(x => x).ToArray();
}
byte[] Decompress(byte[] input)
{
int inputPtr = 0;
byte ReadByte() => input[inputPtr++];
int ReadVLC(int seed = 0)
{
var result = seed >> 1;
while (seed % 2 == 0)
result = (result << 7) | ((seed = ReadByte()) >> 1);
return result;
}
(int lo, int hi) GetNibbles(byte b) => (b & 0xF, b >> 4);
var (filesize, unk1, unk2) = (ReadVLC(), ReadVLC(), ReadVLC());
var buffer = new List<byte>();
while (buffer.Count < filesize)
{
// literals
var (size, copies) = GetNibbles(ReadByte());
if (size == 0) size = ReadVLC();
if (copies == 0) copies = ReadVLC();
buffer.AddRange(Enumerable.Range(0, size).Select(_ => ReadByte()));
// copies
while (copies-- > 0)
{
var (offset, length) = GetNibbles(ReadByte());
offset = ReadVLC(offset);
if (length == 0) length = ReadVLC();
buffer.AddRange(Enumerable.Range(0, length + 1).Select(_ => buffer[buffer.Count - offset - 1]));
}
}
return buffer.ToArray();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment