Skip to content

Instantly share code, notes, and snippets.

@joshiraez
Created January 23, 2022 21:41
Show Gist options
  • Save joshiraez/b8af2cb508d3d2d765b4fc222e6144d4 to your computer and use it in GitHub Desktop.
Save joshiraez/b8af2cb508d3d2d765b4fc222e6144d4 to your computer and use it in GitHub Desktop.
Toy compresser
package com.company;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
record SymbolPair(Short symbol1, Short symbol2) {}
record Compressed(List<Character> symbolToText, List<SymbolPair> symbolDictionary, List<Short> compressed) {}
//Ideally, you can save the compressed in separate binary files with a given word size to later deserialize every value,
//for maximum storage efficiency.
public class Main {
public static void main(String[] args) {
//Attempt to create an uncompresser without any prior knowledge on how compressers are done other than
//maybe some article in the past or so.
//The idea is to test myself how far can I get a working, naive skeleton of a problem on an unexplored space.
//As such, you can expect a really naive approach to the problem, leaving optimizations as other layers to
//further explore the problem space in the future. Also leaves a correct, more modern implementation
//as a future practice or an idea for another toy.
//This compressor doesn't have memory loss and right now only works for Strings. Is also very naive at
//symbol swapping so it might be able to be really optimized.
//Ideally it should work if you change what symbol you are using, as well as just put longs for the symbols
//and then trim to the closer power of 2 value.
//The compressed record can be stored and retrieved as a binary from files as the word length is known.
//Having a variable symbol size would need the compressed struct to also include size of the symbol word.
//Input
var toCompress = "These are short, famous texts in English from classic sources like the Bible or Shakespeare. Some texts have word definitions and explanations to help you. Some of these texts are written in an old style of English. Try to understand them, because the English that we speak today is based on what our great, great, great, great grandparents spoke before! Of course, not all these texts were originally written in English. The Bible, for example, is a translation. But they are all well known in English today, and many of them express beautiful thoughts. ";
var compressed = compress(toCompress);
System.out.println(toCompress);
System.out.println(toCompress.length());
System.out.println(toCompress.length()*4);
System.out.println("--Compressed--");
System.out.println(compressed.compressed());
System.out.println(compressed.compressed().size());
System.out.println(compressed.compressed().size()*2);
System.out.println(compressed.symbolToText());
System.out.println(compressed.symbolToText().size());
System.out.println(compressed.symbolToText().size()*4);
System.out.println(compressed.symbolDictionary());
System.out.println(compressed.symbolDictionary().size());
System.out.println(compressed.symbolDictionary().size()*2);
System.out.println("The compressed is "+calculateSave(toCompress, compressed)+"% of the original");
//Uncompressing
System.out.println("DECOMPRESSING");
System.out.println(uncompress(compressed));
System.out.println(toCompress + " - ORIGINAL");
}
private static float calculateSave(String toCompress, Compressed compressed) {
System.out.println();
return (float)(compressed.compressed().size() * 2 + compressed.symbolDictionary().size() * 2 + compressed.symbolToText().size() * 4) * 100 / (toCompress.length() * 4);
}
private static String uncompress(Compressed compressed) {
//For uncompressing, we only have to transform symbols back until all the symbols have less than the size
//of symbol to text
List<Short> uncompressingSymbolList = new LinkedList<>(compressed.compressed());
var iterator = uncompressingSymbolList.listIterator();
var i = 0;
while(i < uncompressingSymbolList.size()) {
var currSymbol = uncompressingSymbolList.get(i);
//Keep unrolling symbol why it corresponds to another symbol
while(currSymbol.compareTo((short)compressed.symbolToText().size()) >= 0) {
var associatedSymbolPair = compressed.symbolDictionary().get(currSymbol - compressed.symbolToText().size());
uncompressingSymbolList.set(i, associatedSymbolPair.symbol1());
uncompressingSymbolList.add(i+1, associatedSymbolPair.symbol2());
currSymbol = associatedSymbolPair.symbol1();
}
i++;
}
return uncompressingSymbolList.stream()
.map(
symbol -> compressed.symbolToText().get(symbol)
)
.map(String::valueOf)
.collect(Collectors.joining());
}
private static boolean canBeUncompressed(List<Short> uncompressingSymbolList, Compressed compressed) {
return uncompressingSymbolList.stream()
.anyMatch(symbol -> symbol.compareTo((short)compressed.symbolToText().size()) >= 0);
}
public static Compressed compress(String toCompress) {
//Output
final List<Character> symbolToText = new ArrayList<>();
final List<SymbolPair> symbolDictionary = new ArrayList<>();
//Convert to symbol List
final List<Short> baseCompression = toCompress.chars()
.mapToObj(
currChar -> (char)currChar
).peek(
currChar -> {
if (!symbolToText.contains(currChar)) symbolToText.add((currChar));
}
).map(
currChar -> (short)symbolToText.indexOf(currChar)
).collect(Collectors.toCollection(ArrayList::new));
//Convert to list of SymbolPair
List<SymbolPair> symbolPairingsList = IntStream.range(0, baseCompression.size()-1)
.mapToObj(i -> new SymbolPair(baseCompression.get(i), baseCompression.get(i+1)))
.collect(Collectors.toCollection(ArrayList::new));
//Get count to avoid recalculate
var symbolPairCount = symbolPairingsList
.stream()
.collect(Collectors.groupingBy(
pair -> pair,
Collectors.counting()
));
while(shouldCompressAgain(symbolPairCount)) {
var maxPair = findMaxPair(symbolPairCount);
//Add new entry to the dictionary
var newSymbol = (short)(symbolToText.size() + symbolDictionary.size());
symbolDictionary.add(maxPair);
// [0,7],[7,0],...
//Find the pair and mutate simblings to new pairs
//Do in reverse so deletions don't mess the iterator
for (int i = symbolPairingsList.size() - 1; i >= 0; i--) {
if(!symbolPairingsList.get(i).equals(maxPair)) continue;
if (i != 0) {
var pastSymbolPair = symbolPairingsList.get(i-1);
var newSymbolPair = new SymbolPair(pastSymbolPair.symbol1(), newSymbol);
symbolPairingsList.set(i-1, newSymbolPair);
}
if (i != symbolPairingsList.size() - 1) {
var pastSymbolPair = symbolPairingsList.get(i+1);
var newSymbolPair = new SymbolPair(newSymbol, pastSymbolPair.symbol2());
symbolPairingsList.set(i+1, newSymbolPair);
}
}
//Finally remove all pairings that has been replaced with the new symbol
symbolPairingsList = symbolPairingsList.stream().filter(pair -> !pair.equals(maxPair)).collect(Collectors.toCollection(ArrayList::new));
//Recalculate symbol pair count
symbolPairCount = symbolPairingsList
.stream()
.collect(Collectors.groupingBy(
pair -> pair,
Collectors.counting()
));
}
//Compress back to List of symbols.
var finalCompressed = symbolPairingsList.stream().map(SymbolPair::symbol1).collect(Collectors.toCollection(ArrayList::new));
finalCompressed.add(symbolPairingsList.get(symbolPairingsList.size()-1).symbol2());
return new Compressed(symbolToText, symbolDictionary, finalCompressed);
}
private static SymbolPair findMaxPair(Map<SymbolPair, Long> symbolPairCount) {
return symbolPairCount
.entrySet()
.stream().max(Comparator.comparingLong(Map.Entry::getValue))
.get()
.getKey();
}
private static boolean shouldCompressAgain(Map<SymbolPair, Long> symbolPairCount) {
return symbolPairCount
.entrySet()
.stream()
.anyMatch(entry -> entry.getValue() > 1);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment