Created
February 7, 2020 21:51
-
-
Save AdibSurani/cab4aa7fb16a4924e6f174eb0ea021a7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var bytes = File.ReadAllBytes("vt1.file1.proper.bin"); | |
$"Uncompressed size is {bytes.Length}".Dump(); | |
var compressed = Compress(bytes); | |
$"Compressed size is {compressed.Length} ({100.0 * compressed.Length / bytes.Length:0.0}%)".Dump(); | |
$"Round-trip decompression succeeded: {Decompress(compressed).SequenceEqual(bytes)}".Dump(); | |
// run with LINQPad 6 | |
/////////////////////// | |
byte[] Compress(byte[] input) | |
{ | |
// prelude | |
if (input.Length < 2) throw new ArgumentException("Input needs to be at least 2 bytes"); | |
var thresholds = new[]{4, 7, 14, 21, 28}.Select(n => (1 << n) - 1).ToList(); | |
var tail2 = new int[input.Length]; | |
var tail3 = new int[input.Length]; | |
tail2[input.Length - 1] = tail3[input.Length - 1] = tail3[input.Length - 2] = -1; | |
var nodes = new (int score, bool isMatchStage, int currentRun, int parent)[input.Length + 1]; | |
nodes[0].isMatchStage = true; | |
// populate two-byte and three-byte history | |
var head2 = new int[1 << 16]; | |
for (int i = 0; i < head2.Length; i++) head2[i] = -1; | |
var head3 = new int[1 << 24]; | |
for (int i = 0; i < head3.Length; i++) head3[i] = -1; | |
for (int i = 0; i < input.Length - 1; i++) | |
{ | |
var hash = input[i] | (input[i + 1] << 8); | |
(tail2[i], head2[hash]) = (head2[hash], i); | |
if (i < input.Length - 2) | |
{ | |
hash |= input[i + 2] << 16; | |
(tail3[i], head3[hash]) = (head3[hash], i); | |
} | |
} | |
// find all possible matches | |
var matches = Enumerable.Range(0, input.Length).AsParallel().AsOrdered().Select(pos => FindMatches(pos).ToList()).ToList(); | |
IEnumerable<(int length, int offset)> FindMatches(int pos) | |
{ | |
if (tail2[pos] != tail3[pos]) yield return (2, pos - tail2[pos]); | |
for (int i = tail3[pos], bestLength = 2; i >= 0; i = tail3[i]) | |
{ | |
if (input[i + bestLength] != input[pos + bestLength]) continue; | |
int j = 2; | |
while (++j < input.Length - pos && input[i + j] == input[pos + j]) ; | |
if (j > bestLength) | |
{ | |
bestLength = j; | |
yield return (j, pos - i); | |
if (pos + j >= input.Length) yield break; | |
} | |
} | |
} | |
// auxiliary helper functions for finding an optimal path | |
int BitLength(int n) | |
{ | |
int c = 1; | |
while ((n >>= 1) != 0) c++; | |
return c; | |
} | |
int GetWeight(int length, int offset) | |
{ | |
var lengthExtra = length <= 16 ? 0 : (BitLength(length - 1) + 6) / 7; | |
var offsetExtra = (BitLength(offset - 1) + 3) / 7; | |
return 1 + lengthExtra + offsetExtra; | |
} | |
// psuedo-optimal path finder, using a 4-variable heuristic | |
for (int i = 0; i < input.Length; i++) | |
{ | |
var curr = nodes[i]; | |
void Update(int length, int weight, bool matchStage) | |
{ | |
var newNode = (curr.score + weight, matchStage, curr.isMatchStage == matchStage ? curr.currentRun + 1 : 1, i); | |
ref var node = ref nodes[i + length]; | |
if (node.score == 0 || newNode.CompareTo(node) < 0) node = newNode; | |
} | |
var (arr, atThreshold) = (matches[i], thresholds.Contains(curr.currentRun)); | |
Update(1, curr.isMatchStage || atThreshold ? 2 : 1, false); | |
if (arr.Any()) | |
for (int j = arr.Last().length, k = arr.Count - 1; j >= 2; j--, j = j > 32 ? j / 2 : j) | |
{ | |
while (k > 0 && arr[k - 1].length >= j) k--; | |
Update(j, GetWeight(j, arr[k].offset) + (curr.isMatchStage && atThreshold ? 1 : 0), true); | |
} | |
} | |
// more auxiliary helper functions, this time for constructing the byte sequence | |
byte[] VLCBytes(int n) | |
{ | |
var tmp = new Stack<byte>(); | |
tmp.Push((byte)(n << 1 | 1)); | |
n >>= 7; | |
while (n != 0) | |
{ | |
tmp.Push((byte)(n << 1)); | |
n >>= 7; | |
} | |
return tmp.ToArray(); | |
} | |
byte[] MakeLiteralCopies(int literal, int copies) | |
{ | |
int litNibble = 0, copNibble = 0; | |
byte[] litExtra = new byte[0], copExtra = new byte[0]; | |
if (literal < 16) litNibble = literal; | |
else litExtra = VLCBytes(literal); | |
if (copies < 16) copNibble = copies; | |
else copExtra = VLCBytes(copies); | |
if (copies == 0) copExtra = VLCBytes(copies); // special case where last byte is literal | |
return new[]{(byte)(litNibble | (copNibble << 4))}.Concat(litExtra).Concat(copExtra).ToArray(); | |
} | |
byte[] MakeLengthOffset(int length, int offset) | |
{ | |
(length, offset) = (length - 1, offset - 1); | |
int lenNibble = 0, offNibble = 0; | |
byte[] lenExtra = new byte[0], offExtra = new byte[0]; | |
if (length < 16) lenNibble = length; | |
else lenExtra = VLCBytes(length); | |
if (offset < 8) offNibble = offset << 1 | 1; | |
else | |
{ | |
var totalBits = BitLength(offset); | |
var extraSize = (totalBits + 3) / 7; | |
var bitsToShift = 7 * extraSize; | |
var high = offset >> bitsToShift; | |
var low = offset & ((1 << bitsToShift) - 1); | |
offNibble = high << 1; | |
offExtra = VLCBytes(low | 127 << bitsToShift).Skip(1).ToArray(); | |
} | |
return new[]{(byte)(offNibble | (lenNibble << 4))}.Concat(offExtra).Concat(lenExtra).ToArray(); | |
} | |
// actually constructing the byte sequence | |
var stack = new Stack<byte[]>(); | |
for (int n = input.Length, matchRun = 0; n > 0;) | |
{ | |
var node = nodes[n]; | |
if (node.isMatchStage) | |
{ | |
var length = n - node.parent; | |
n = node.parent; | |
stack.Push(MakeLengthOffset(length, matches[n].First(x => x.length >= length).offset)); | |
matchRun++; | |
} | |
else | |
{ | |
n -= node.currentRun; | |
stack.Push(input[n..(n + node.currentRun)]); | |
stack.Push(MakeLiteralCopies(node.currentRun, matchRun)); | |
matchRun = 0; | |
} | |
} | |
return new[]{input.Length, 25, 0}.Select(VLCBytes).Concat(stack).SelectMany(x => x).ToArray(); | |
} | |
byte[] Decompress(byte[] input) | |
{ | |
int inputPtr = 0; | |
byte ReadByte() => input[inputPtr++]; | |
int ReadVLC(int seed = 0) | |
{ | |
var result = seed >> 1; | |
while (seed % 2 == 0) | |
result = (result << 7) | ((seed = ReadByte()) >> 1); | |
return result; | |
} | |
(int lo, int hi) GetNibbles(byte b) => (b & 0xF, b >> 4); | |
var (filesize, unk1, unk2) = (ReadVLC(), ReadVLC(), ReadVLC()); | |
var buffer = new List<byte>(); | |
while (buffer.Count < filesize) | |
{ | |
// literals | |
var (size, copies) = GetNibbles(ReadByte()); | |
if (size == 0) size = ReadVLC(); | |
if (copies == 0) copies = ReadVLC(); | |
buffer.AddRange(Enumerable.Range(0, size).Select(_ => ReadByte())); | |
// copies | |
while (copies-- > 0) | |
{ | |
var (offset, length) = GetNibbles(ReadByte()); | |
offset = ReadVLC(offset); | |
if (length == 0) length = ReadVLC(); | |
buffer.AddRange(Enumerable.Range(0, length + 1).Select(_ => buffer[buffer.Count - offset - 1])); | |
} | |
} | |
return buffer.ToArray(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment