Created
January 23, 2022 21:41
-
-
Save joshiraez/b8af2cb508d3d2d765b4fc222e6144d4 to your computer and use it in GitHub Desktop.
Toy compresser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.company; | |
import java.util.*; | |
import java.util.stream.Collectors; | |
import java.util.stream.IntStream; | |
record SymbolPair(Short symbol1, Short symbol2) {} | |
record Compressed(List<Character> symbolToText, List<SymbolPair> symbolDictionary, List<Short> compressed) {} | |
//Ideally, you can save the compressed in separate binary files with a given word size to later deserialize every value, | |
//for maximum storage efficiency. | |
public class Main { | |
public static void main(String[] args) { | |
//Attempt to create an uncompresser without any prior knowledge on how compressers are done other than | |
//maybe some article in the past or so. | |
//The idea is to test myself how far can I get a working, naive skeleton of a problem on an unexplored space. | |
//As such, you can expect a really naive approach to the problem, leaving optimizations as other layers to | |
//further explore the problem space in the future. Also leaves a correct, more modern implementation | |
//as a future practice or an idea for another toy. | |
//This compressor doesn't have memory loss and right now only works for Strings. Is also very naive at | |
//symbol swapping so it might be able to be really optimized. | |
//Ideally it should work if you change what symbol you are using, as well as just put longs for the symbols | |
//and then trim to the closer power of 2 value. | |
//The compressed record can be stored and retrieved as a binary from files as the word length is known. | |
//Having a variable symbol size would need the compressed struct to also include size of the symbol word. | |
//Input | |
var toCompress = "These are short, famous texts in English from classic sources like the Bible or Shakespeare. Some texts have word definitions and explanations to help you. Some of these texts are written in an old style of English. Try to understand them, because the English that we speak today is based on what our great, great, great, great grandparents spoke before! Of course, not all these texts were originally written in English. The Bible, for example, is a translation. But they are all well known in English today, and many of them express beautiful thoughts. "; | |
var compressed = compress(toCompress); | |
System.out.println(toCompress); | |
System.out.println(toCompress.length()); | |
System.out.println(toCompress.length()*4); | |
System.out.println("--Compressed--"); | |
System.out.println(compressed.compressed()); | |
System.out.println(compressed.compressed().size()); | |
System.out.println(compressed.compressed().size()*2); | |
System.out.println(compressed.symbolToText()); | |
System.out.println(compressed.symbolToText().size()); | |
System.out.println(compressed.symbolToText().size()*4); | |
System.out.println(compressed.symbolDictionary()); | |
System.out.println(compressed.symbolDictionary().size()); | |
System.out.println(compressed.symbolDictionary().size()*2); | |
System.out.println("The compressed is "+calculateSave(toCompress, compressed)+"% of the original"); | |
//Uncompressing | |
System.out.println("DECOMPRESSING"); | |
System.out.println(uncompress(compressed)); | |
System.out.println(toCompress + " - ORIGINAL"); | |
} | |
private static float calculateSave(String toCompress, Compressed compressed) { | |
System.out.println(); | |
return (float)(compressed.compressed().size() * 2 + compressed.symbolDictionary().size() * 2 + compressed.symbolToText().size() * 4) * 100 / (toCompress.length() * 4); | |
} | |
private static String uncompress(Compressed compressed) { | |
//For uncompressing, we only have to transform symbols back until all the symbols have less than the size | |
//of symbol to text | |
List<Short> uncompressingSymbolList = new LinkedList<>(compressed.compressed()); | |
var iterator = uncompressingSymbolList.listIterator(); | |
var i = 0; | |
while(i < uncompressingSymbolList.size()) { | |
var currSymbol = uncompressingSymbolList.get(i); | |
//Keep unrolling symbol why it corresponds to another symbol | |
while(currSymbol.compareTo((short)compressed.symbolToText().size()) >= 0) { | |
var associatedSymbolPair = compressed.symbolDictionary().get(currSymbol - compressed.symbolToText().size()); | |
uncompressingSymbolList.set(i, associatedSymbolPair.symbol1()); | |
uncompressingSymbolList.add(i+1, associatedSymbolPair.symbol2()); | |
currSymbol = associatedSymbolPair.symbol1(); | |
} | |
i++; | |
} | |
return uncompressingSymbolList.stream() | |
.map( | |
symbol -> compressed.symbolToText().get(symbol) | |
) | |
.map(String::valueOf) | |
.collect(Collectors.joining()); | |
} | |
private static boolean canBeUncompressed(List<Short> uncompressingSymbolList, Compressed compressed) { | |
return uncompressingSymbolList.stream() | |
.anyMatch(symbol -> symbol.compareTo((short)compressed.symbolToText().size()) >= 0); | |
} | |
public static Compressed compress(String toCompress) { | |
//Output | |
final List<Character> symbolToText = new ArrayList<>(); | |
final List<SymbolPair> symbolDictionary = new ArrayList<>(); | |
//Convert to symbol List | |
final List<Short> baseCompression = toCompress.chars() | |
.mapToObj( | |
currChar -> (char)currChar | |
).peek( | |
currChar -> { | |
if (!symbolToText.contains(currChar)) symbolToText.add((currChar)); | |
} | |
).map( | |
currChar -> (short)symbolToText.indexOf(currChar) | |
).collect(Collectors.toCollection(ArrayList::new)); | |
//Convert to list of SymbolPair | |
List<SymbolPair> symbolPairingsList = IntStream.range(0, baseCompression.size()-1) | |
.mapToObj(i -> new SymbolPair(baseCompression.get(i), baseCompression.get(i+1))) | |
.collect(Collectors.toCollection(ArrayList::new)); | |
//Get count to avoid recalculate | |
var symbolPairCount = symbolPairingsList | |
.stream() | |
.collect(Collectors.groupingBy( | |
pair -> pair, | |
Collectors.counting() | |
)); | |
while(shouldCompressAgain(symbolPairCount)) { | |
var maxPair = findMaxPair(symbolPairCount); | |
//Add new entry to the dictionary | |
var newSymbol = (short)(symbolToText.size() + symbolDictionary.size()); | |
symbolDictionary.add(maxPair); | |
// [0,7],[7,0],... | |
//Find the pair and mutate simblings to new pairs | |
//Do in reverse so deletions don't mess the iterator | |
for (int i = symbolPairingsList.size() - 1; i >= 0; i--) { | |
if(!symbolPairingsList.get(i).equals(maxPair)) continue; | |
if (i != 0) { | |
var pastSymbolPair = symbolPairingsList.get(i-1); | |
var newSymbolPair = new SymbolPair(pastSymbolPair.symbol1(), newSymbol); | |
symbolPairingsList.set(i-1, newSymbolPair); | |
} | |
if (i != symbolPairingsList.size() - 1) { | |
var pastSymbolPair = symbolPairingsList.get(i+1); | |
var newSymbolPair = new SymbolPair(newSymbol, pastSymbolPair.symbol2()); | |
symbolPairingsList.set(i+1, newSymbolPair); | |
} | |
} | |
//Finally remove all pairings that has been replaced with the new symbol | |
symbolPairingsList = symbolPairingsList.stream().filter(pair -> !pair.equals(maxPair)).collect(Collectors.toCollection(ArrayList::new)); | |
//Recalculate symbol pair count | |
symbolPairCount = symbolPairingsList | |
.stream() | |
.collect(Collectors.groupingBy( | |
pair -> pair, | |
Collectors.counting() | |
)); | |
} | |
//Compress back to List of symbols. | |
var finalCompressed = symbolPairingsList.stream().map(SymbolPair::symbol1).collect(Collectors.toCollection(ArrayList::new)); | |
finalCompressed.add(symbolPairingsList.get(symbolPairingsList.size()-1).symbol2()); | |
return new Compressed(symbolToText, symbolDictionary, finalCompressed); | |
} | |
private static SymbolPair findMaxPair(Map<SymbolPair, Long> symbolPairCount) { | |
return symbolPairCount | |
.entrySet() | |
.stream().max(Comparator.comparingLong(Map.Entry::getValue)) | |
.get() | |
.getKey(); | |
} | |
private static boolean shouldCompressAgain(Map<SymbolPair, Long> symbolPairCount) { | |
return symbolPairCount | |
.entrySet() | |
.stream() | |
.anyMatch(entry -> entry.getValue() > 1); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment