Last active
October 28, 2025 12:25
-
-
Save coderodde/90905a33923ecb112afe0f75b36e7f33 to your computer and use it in GitHub Desktop.
Data Compression Techniques 2025 - Week 1 Task 6 (Shannon-Fano coding)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package io.github.coderodde.compression; | |
| import java.util.BitSet; | |
| /** | |
| * This class implements a <b>binary</b> code word in data compression | |
| * scenarios. | |
| * | |
| * @author Rodion "rodde" Efremov | |
| * @version 1.0.0 (Oct 28, 2025) | |
| * @since 1.0.0 (Oct 28, 2025) | |
| */ | |
| public class CodeWord { | |
| private final int length; | |
| private final BitSet bits; | |
| public CodeWord(final int length) { | |
| checkLength(length); | |
| this.length = length; | |
| this.bits = new BitSet(length); | |
| } | |
| public int length() { | |
| return length; | |
| } | |
| public boolean get(final int index) { | |
| checkIndex(index); | |
| return bits.get(index); | |
| } | |
| public void set(final int index) { | |
| checkIndex(index); | |
| bits.set(index); | |
| } | |
| public void unset(final int index) { | |
| checkIndex(index); | |
| bits.set(index, false); | |
| } | |
| @Override | |
| public String toString() { | |
| final StringBuilder sb = new StringBuilder(length); | |
| for (int i = 0; i < length; ++i) { | |
| sb.append(get(i) ? "1" : "0"); | |
| } | |
| return sb.toString(); | |
| } | |
| private void checkIndex(final int index) { | |
| if (index < 0) { | |
| throw new IndexOutOfBoundsException( | |
| String.format("index(%d) < 0", index)); | |
| } | |
| if (index >= this.length) { | |
| throw new IndexOutOfBoundsException( | |
| String.format("index(%d) >= length(%d)", | |
| index, | |
| length)); | |
| } | |
| } | |
| private static void checkLength(final int length) { | |
| if (length < 1) { | |
| throw new IllegalArgumentException( | |
| String.format("length(%d) < 1", length)); | |
| } | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package io.github.coderodde.compression.demo; | |
| import io.github.coderodde.compression.CodeWord; | |
| import io.github.coderodde.compression.ShannonFanoEncoder; | |
| import java.util.HashMap; | |
| import java.util.Map; | |
| /** | |
| * This class runs some demonstration on Shannon-Fano coding. | |
| * | |
| * @author Rodion "rodde" Efremov | |
| * @version 1.0.0 (Oct 27, 2025) | |
| * @since 1.0.0 (Oct 27, 2025) | |
| */ | |
| final class Demo { | |
| public static void main(String[] args) { | |
| geeksForGeeksDemo(); | |
| week1Task6(); | |
| } | |
| private static void geeksForGeeksDemo() { | |
| final Map<Character, Double> probabilityDistribution = new HashMap<>(); | |
| // This probability distribution is from | |
| // https://www.geeksforgeeks.org/dsa/shannon-fano-algorithm-for-data-compression/ | |
| probabilityDistribution.put('a', 0.22); | |
| probabilityDistribution.put('b', 0.28); | |
| probabilityDistribution.put('c', 0.15); | |
| probabilityDistribution.put('d', 0.30); | |
| probabilityDistribution.put('e', 0.05); | |
| final Map<Character, CodeWord> coding = | |
| ShannonFanoEncoder.encode(probabilityDistribution); | |
| System.out.println(coding); | |
| } | |
| private static void week1Task6() { | |
| final Map<Integer, Double> probabilitDistributionA = | |
| getTask6SectionAProbabilityDistribution(); | |
| System.out.println(ShannonFanoEncoder.encode(probabilitDistributionA)); | |
| final Map<Integer, Double> probabilitDistributionB = | |
| getTask6SectionBProbabilityDistribution(); | |
| System.out.println(ShannonFanoEncoder.encode(probabilitDistributionB)); | |
| } | |
| private static Map<Integer, Double> | |
| getTask6SectionAProbabilityDistribution() { | |
| final Map<Integer, Double> probabilityDistribution = new HashMap<>(); | |
| for (int i = 0; i < 10; ++i) { | |
| probabilityDistribution.put(i, getAProbability(i)); | |
| } | |
| return probabilityDistribution; | |
| } | |
| private static Map<Integer, Double> | |
| getTask6SectionBProbabilityDistribution() { | |
| final Map<Integer, Double> probabilityDistribution = new HashMap<>(); | |
| for (int i = 0; i < 10; ++i) { | |
| probabilityDistribution.put(i, getBProbability(i)); | |
| } | |
| return probabilityDistribution; | |
| } | |
| private static double getAProbability(final int number) { | |
| final double p = Math.pow(2.0, -0.25); | |
| return (1.0 - p) * Math.pow(p, number); | |
| } | |
| private static double getBProbability(final int number) { | |
| final double zeta2 = 1.645; | |
| return Math.pow(number + 1, -2.0) / zeta2; | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package io.github.coderodde.compression; | |
| import java.util.ArrayDeque; | |
| import java.util.ArrayList; | |
| import java.util.Collections; | |
| import java.util.Deque; | |
| import java.util.HashMap; | |
| import java.util.List; | |
| import java.util.Map; | |
| import java.util.Objects; | |
| import java.util.TreeMap; | |
| /** | |
| * This class contains a static method for computing the Shannon-Fano coding. | |
| * | |
| * @author Rodion "rodde" Efremov | |
| * @version 1.0.0 (Oct 27, 2025) | |
| * @since 1.0.0 (Oct 27, 2025) | |
| */ | |
| public final class ShannonFanoEncoder { | |
| private static final double MINIMUM_EPSILON = 1E-6; | |
| private static double epsilon = MINIMUM_EPSILON; | |
| public static void setEpsilon(final double epsilon) { | |
| if (Double.isNaN(epsilon)) { | |
| throw new IllegalArgumentException("epsilon is NaN"); | |
| } | |
| ShannonFanoEncoder.epsilon = Math.max(MINIMUM_EPSILON, epsilon); | |
| } | |
| public static <S extends Comparable<S>> Map<S, CodeWord> | |
| encode(final Map<S, Double> probabilityDistribution) { | |
| final List<SymbolEntry<S>> letterDataList = | |
| new ArrayList<>(probabilityDistribution.size()); | |
| double totalProbability = 0.0; | |
| for (final Map.Entry<S, Double> entry | |
| : probabilityDistribution.entrySet()) { | |
| totalProbability += Objects.requireNonNull(entry.getValue()); | |
| letterDataList.add(new SymbolEntry<>(entry.getKey(), | |
| entry.getValue())); | |
| } | |
| if (Math.abs(totalProbability - 1.0) > epsilon) { | |
| throw new IllegalArgumentException( | |
| "The input probability distribution does not sum to 1.0"); | |
| } | |
| Collections.sort(letterDataList); | |
| final Map<List<SymbolEntry<S>>, Boolean> assignmentMap = new HashMap<>(); | |
| final Map<List<SymbolEntry<S>>, List<SymbolEntry<S>>> parentsMap = | |
| new HashMap<>(); | |
| assignmentMap.put(letterDataList, null); | |
| parentsMap.put(letterDataList, null); | |
| final Deque<List<SymbolEntry<S>>> queue = new ArrayDeque<>(); | |
| queue.addLast(letterDataList); | |
| final List<List<SymbolEntry<S>>> lonelySymbols = | |
| new ArrayList<>(probabilityDistribution.size()); | |
| while (!queue.isEmpty()) { | |
| final List<SymbolEntry<S>> subList = queue.removeFirst(); | |
| final Pair<List<SymbolEntry<S>>> pair = splitEvenly(subList); | |
| final List<SymbolEntry<S>> loSublist = pair.first; | |
| final List<SymbolEntry<S>> hiSublist = pair.second; | |
| parentsMap.put(loSublist, subList); | |
| parentsMap.put(hiSublist, subList); | |
| assignmentMap.put(loSublist, Boolean.FALSE); | |
| assignmentMap.put(hiSublist, Boolean.TRUE); | |
| if (loSublist.size() > 1) { | |
| // Split later again: | |
| queue.addLast(loSublist); | |
| } else { | |
| // A singleton; cannot split: | |
| lonelySymbols.add(loSublist); | |
| } | |
| if (hiSublist.size() > 1) { | |
| // Split later again: | |
| queue.addLast(hiSublist); | |
| } else { | |
| // A singleton; cannot split: | |
| lonelySymbols.add(hiSublist); | |
| } | |
| } | |
| return inferCode(parentsMap, | |
| assignmentMap, | |
| lonelySymbols); | |
| } | |
| static <S extends Comparable<S>> Map<S, CodeWord> | |
| inferCode(final Map<List<SymbolEntry<S>>, | |
| List<SymbolEntry<S>>> parentsMap, | |
| final Map<List<SymbolEntry<S>>, Boolean> assignmentMap, | |
| final List<List<SymbolEntry<S>>> lonelySymbols) { | |
| final Map<S, CodeWord> code = new TreeMap<>(); | |
| for (final List<SymbolEntry<S>> lonelySymbol : lonelySymbols) { | |
| final CodeWord codeWord = computeCodeWord(code, | |
| lonelySymbol, | |
| parentsMap, | |
| assignmentMap); | |
| code.put(lonelySymbol.get(0).symbol, codeWord); | |
| } | |
| return code; | |
| } | |
| static <S extends Comparable<S>> | |
| CodeWord computeCodeWord(final Map<S, CodeWord> code, | |
| final List<SymbolEntry<S>> symbol, | |
| final Map<List<SymbolEntry<S>>, | |
| List<SymbolEntry<S>>> parentsMap, | |
| final Map<List<SymbolEntry<S>>, | |
| Boolean> assignmentMap) { | |
| final List<List<SymbolEntry<S>>> path = inferPath(symbol, | |
| parentsMap); | |
| final int codeWordLength = path.size() - 1; | |
| final CodeWord codeWord = new CodeWord(codeWordLength); | |
| for (int bitIndex = 1; bitIndex < path.size(); bitIndex++) { | |
| final List<SymbolEntry<S>> symbols = path.get(bitIndex); | |
| final boolean bit = assignmentMap.get(symbols); | |
| if (bit) { | |
| codeWord.set(bitIndex - 1); | |
| } | |
| } | |
| return codeWord; | |
| } | |
| static <S extends Comparable<S>> List<List<SymbolEntry<S>>> | |
| inferPath(final List<SymbolEntry<S>> symbols, | |
| final Map<List<SymbolEntry<S>>, | |
| List<SymbolEntry<S>>> parentsMap) { | |
| final List<List<SymbolEntry<S>>> path = new ArrayList<>(); | |
| for (List<SymbolEntry<S>> current = symbols; | |
| current != null; | |
| current = parentsMap.get(current)) { | |
| path.add(current); | |
| } | |
| Collections.reverse(path); | |
| return path; | |
| } | |
| static <S extends Comparable<S>> Pair<List<SymbolEntry<S>>> | |
| splitEvenly(final List<SymbolEntry<S>> list) { | |
| final List<SymbolEntry<S>> loSublist = new ArrayList<>(); | |
| final List<SymbolEntry<S>> hiSublist = new ArrayList<>(); | |
| int loIndex = 0; | |
| int hiIndex = list.size() - 1; | |
| double loProbabilitySum = 0.0; | |
| double hiProbabilitySum = 0.0; | |
| do { | |
| if (loProbabilitySum < hiProbabilitySum) { | |
| loSublist.addLast(list.get(loIndex)); | |
| loProbabilitySum += list.get(loIndex).probability; | |
| loIndex++; | |
| } else if (loProbabilitySum > hiProbabilitySum) { | |
| hiSublist.addLast(list.get(hiIndex)); | |
| hiProbabilitySum += list.get(hiIndex).probability; | |
| hiIndex--; | |
| } else { | |
| loSublist.addLast(list.get(loIndex)); | |
| hiSublist.addLast(list.get(hiIndex)); | |
| loProbabilitySum += list.get(loIndex).probability; | |
| hiProbabilitySum += list.get(hiIndex).probability; | |
| loIndex++; | |
| hiIndex--; | |
| } | |
| } while (loIndex <= hiIndex); | |
| Collections.reverse(hiSublist); | |
| return new Pair<>(loSublist, | |
| hiSublist); | |
| } | |
| static final class Pair<E> { | |
| final E first; | |
| final E second; | |
| Pair(final E first, final E second) { | |
| this.first = first; | |
| this.second = second; | |
| } | |
| } | |
| static final class SymbolEntry<S extends Comparable<S>> | |
| implements Comparable<SymbolEntry<S>> { | |
| final S symbol; | |
| final double probability; | |
| SymbolEntry(final S symbol, final double probability) { | |
| this.symbol = Objects.requireNonNull(symbol); | |
| this.probability = validateProbability(probability); | |
| } | |
| private static double validateProbability(final double probability) { | |
| if (Double.isNaN(probability)) { | |
| throw new IllegalArgumentException( | |
| "The input probability is NaN"); | |
| } | |
| if (probability <= 0.0) { | |
| throw new IllegalArgumentException("probability <= 0.0"); | |
| } | |
| if (probability > 1.0) { | |
| throw new IllegalArgumentException("probability > 1.0"); | |
| } | |
| return probability; | |
| } | |
| @Override | |
| public String toString() { | |
| return String.format("[%s: %f]", symbol.toString(), probability); | |
| } | |
| @Override | |
| public int compareTo(final SymbolEntry<S> o) { | |
| final int cmp = Double.compare(o.probability, | |
| probability); | |
| if (cmp != 0) { | |
| return cmp; | |
| } | |
| return symbol.compareTo(o.symbol); | |
| } | |
| @Override | |
| public boolean equals(final Object object) { | |
| final SymbolEntry<S> other = (SymbolEntry<S>) object; | |
| return symbol.equals(other.symbol); | |
| } | |
| @Override | |
| public int hashCode() { | |
| return symbol.hashCode(); | |
| } | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package io.github.coderodde.compression; | |
| import io.github.coderodde.compression.ShannonFanoEncoder.Pair; | |
| import io.github.coderodde.compression.ShannonFanoEncoder.SymbolEntry; | |
| import java.util.ArrayList; | |
| import java.util.Collections; | |
| import java.util.HashMap; | |
| import java.util.List; | |
| import java.util.Map; | |
| import org.junit.Test; | |
| import static org.junit.Assert.*; | |
| public final class ShannonFanoEncoderTest { | |
| @Test | |
| public void splitEvenly1() { | |
| final List<SymbolEntry<Character>> list = new ArrayList<>(); | |
| final SymbolEntry<Character> a = new SymbolEntry<>('a', 0.4); | |
| final SymbolEntry<Character> b = new SymbolEntry<>('b', 0.3); | |
| final SymbolEntry<Character> c = new SymbolEntry<>('c', 0.2); | |
| final SymbolEntry<Character> d = new SymbolEntry<>('d', 0.1); | |
| list.add(c); | |
| list.add(b); | |
| list.add(d); | |
| list.add(a); | |
| Collections.sort(list); | |
| assertEquals(list.get(0), a); | |
| assertEquals(list.get(1), b); | |
| assertEquals(list.get(2), c); | |
| assertEquals(list.get(3), d); | |
| final Pair<List<SymbolEntry<Character>>> pair = | |
| ShannonFanoEncoder.splitEvenly(list); | |
| assertEquals(1, pair.first.size()); | |
| assertEquals(3, pair.second.size()); | |
| assertEquals(a, pair.first.get(0)); | |
| assertEquals(b, pair.second.get(0)); | |
| assertEquals(c, pair.second.get(1)); | |
| assertEquals(d, pair.second.get(2)); | |
| } | |
| @Test | |
| public void splitEvenly2() { | |
| final List<SymbolEntry<Character>> list = new ArrayList<>(); | |
| final SymbolEntry<Character> a = new SymbolEntry<>('a', 0.22); | |
| final SymbolEntry<Character> b = new SymbolEntry<>('b', 0.28); | |
| final SymbolEntry<Character> c = new SymbolEntry<>('c', 0.15); | |
| final SymbolEntry<Character> d = new SymbolEntry<>('d', 0.30); | |
| final SymbolEntry<Character> e = new SymbolEntry<>('e', 0.05); | |
| list.add(a); | |
| list.add(b); | |
| list.add(c); | |
| list.add(d); | |
| list.add(e); | |
| Collections.sort(list); | |
| assertEquals(list.get(0), d); | |
| assertEquals(list.get(1), b); | |
| assertEquals(list.get(2), a); | |
| assertEquals(list.get(3), c); | |
| assertEquals(list.get(4), e); | |
| final Pair<List<SymbolEntry<Character>>> pair = | |
| ShannonFanoEncoder.splitEvenly(list); | |
| assertEquals(2, pair.first.size()); | |
| assertEquals(3, pair.second.size()); | |
| assertEquals(d, pair.first.get(0)); | |
| assertEquals(b, pair.first.get(1)); | |
| assertEquals(a, pair.second.get(0)); | |
| assertEquals(c, pair.second.get(1)); | |
| assertEquals(e, pair.second.get(2)); | |
| } | |
| @Test | |
| public void splitEvenlyOnTwoList() { | |
| final List<SymbolEntry<Character>> list = new ArrayList<>(); | |
| final SymbolEntry<Character> a = new SymbolEntry<>('a', 0.4); | |
| final SymbolEntry<Character> b = new SymbolEntry<>('b', 0.6); | |
| list.add(a); | |
| list.add(b); | |
| Collections.sort(list); | |
| assertEquals(list.get(0), b); | |
| assertEquals(list.get(1), a); | |
| final Pair<List<SymbolEntry<Character>>> pair = | |
| ShannonFanoEncoder.splitEvenly(list); | |
| assertEquals(1, pair.first.size()); | |
| assertEquals(1, pair.second.size()); | |
| assertEquals(b, pair.first.get(0)); | |
| assertEquals(a, pair.second.get(0)); | |
| } | |
| @Test(expected = IllegalArgumentException.class) | |
| public void throwsOnInvalidProbabilityDistributionLessThanOne() { | |
| Map<Character, Double> distribution = new HashMap<>(); | |
| distribution.put('A', 0.6); | |
| distribution.put('B', 0.3); | |
| ShannonFanoEncoder.encode(distribution); | |
| } | |
| @Test(expected = IllegalArgumentException.class) | |
| public void throwsOnInvalidProbabilityDistributionGreaterThanOne() { | |
| Map<Character, Double> distribution = new HashMap<>(); | |
| distribution.put('A', 0.6); | |
| distribution.put('B', 0.5); | |
| ShannonFanoEncoder.encode(distribution); | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment