Skip to content

Instantly share code, notes, and snippets.

@Toparvion
Last active November 26, 2023 11:08
Show Gist options
  • Save Toparvion/8c79ef0553caf0f3108a4bdbad6bb6d0 to your computer and use it in GitHub Desktop.
Save Toparvion/8c79ef0553caf0f3108a4bdbad6bb6d0 to your computer and use it in GitHub Desktop.
A simple benchmark for comparing compression ratios of various compressing algorithms applied to a natural text
package pro.toparvion.stegotext.compress;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.compress.utils.IOUtils;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.TestInstance;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static java.nio.file.StandardOpenOption.*;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS;
/**
* A simple benchmark for comparing compression ratios of various compressing algorithms applied to a natural text
*
* @author Toparvion
*/
@TestInstance(PER_CLASS)
public class CompressTest {
  private static final Logger log = LoggerFactory.getLogger(CompressTest.class);

  /** Sample sizes, in bytes, cut from the source text and fed to each compressor. */
  private static final List<Integer> INPUT_SIZES =
      List.of(20, 50, 100, 200, 300, 500, 1000, 2000, 5000, 10_000, 50_000, 100_000);

  /**
   * Offset into the source text where samples start — presumably chosen to skip
   * front matter (title page, TOC) of the book file; TODO confirm for other inputs.
   */
  private static final int SAMPLE_OFFSET = 31_810;

  private static final CompressorStreamFactory COMPRESSOR_FACTORY = new CompressorStreamFactory();
  private static final Path SOURCE_TEXT_PATH = Path.of("sandbox/doyle-return-388.txt");
  // private static final Path SOURCE_TEXT_PATH = Path.of("sandbox/pepko.txt");

  /** Accumulated CSV rows: one header row plus one row per tested algorithm. */
  private final List<List<String>> csvOut = new ArrayList<>();
  private byte[] textSampleBytes;

  /**
   * Prepares the CSV header row and loads the whole source text into memory.
   *
   * @throws IOException if the source text file cannot be read
   * @throws IllegalStateException if the source text is too short to provide the largest sample
   */
  @BeforeAll
  void beforeAll() throws IOException {
    // Header row: "Algo" followed by every sample size, matching the per-algorithm rows below
    List<String> header = new ArrayList<>(INPUT_SIZES.size() + 1);
    header.add("Algo");
    INPUT_SIZES.stream()
        .map(String::valueOf)
        .forEach(header::add);
    csvOut.add(header);

    textSampleBytes = Files.readAllBytes(SOURCE_TEXT_PATH);
    // Fail fast: Arrays.copyOfRange would silently zero-pad past EOF and skew the results
    int requiredLength = SAMPLE_OFFSET + INPUT_SIZES.get(INPUT_SIZES.size() - 1);
    if (textSampleBytes.length < requiredLength) {
      throw new IllegalStateException("Source text '%s' is too short: %d < %d bytes"
          .formatted(SOURCE_TEXT_PATH, textSampleBytes.length, requiredLength));
    }
  }

  /**
   * Dumps all gathered rows to {@code compress.csv}, overwriting any previous run.
   *
   * @throws IOException if the CSV file cannot be written
   */
  @AfterAll
  void afterAll() throws IOException {
    List<String> csvLines = csvOut.stream()
        .map(line -> String.join(",", line))
        .toList();
    Path csvFilePath = Path.of("compress.csv");
    Files.write(csvFilePath, csvLines, CREATE, WRITE, TRUNCATE_EXISTING);
    log.info("Written {} lines to '{}'", csvLines.size(), csvFilePath);
  }

  /**
   * For a single algorithm: compresses every sample size, verifies lossless round-trip,
   * logs the compression delta and records the compressed sizes as one CSV row.
   *
   * @param algo compressor name as understood by {@link CompressorStreamFactory}
   */
  @ParameterizedTest(name = "Algorithm: {0}")
  @ValueSource(strings = {
      // CompressorStreamFactory.BROTLI, // read-only
      CompressorStreamFactory.BZIP2,
      CompressorStreamFactory.DEFLATE,
      CompressorStreamFactory.GZIP,
      CompressorStreamFactory.LZMA,
      CompressorStreamFactory.LZ4_BLOCK,
      CompressorStreamFactory.LZ4_FRAMED,
      CompressorStreamFactory.SNAPPY_FRAMED,
      // CompressorStreamFactory.SNAPPY_RAW, // Compressor: snappy-raw not found.
      CompressorStreamFactory.XZ,
      // CompressorStreamFactory.Z, // read-only
      CompressorStreamFactory.ZSTANDARD
  })
  @DisplayName("Compression test suite for Apache Commons Compress")
  void testCompression(String algo) throws CompressorException, IOException {
    List<String> csvLine = new ArrayList<>(INPUT_SIZES.size() + 1);
    csvLine.add(algo);
    for (int inputSize : INPUT_SIZES) {
      // given
      byte[] sourceBytes = Arrays.copyOfRange(textSampleBytes, SAMPLE_OFFSET, SAMPLE_OFFSET + inputSize);
      // when
      var compressedSourceBytes = compress(sourceBytes, algo);
      var decompressedSourceBytes = decompress(compressedSourceBytes, algo);
      // then
      int sourceLength = sourceBytes.length;
      int resultLength = compressedSourceBytes.length;
      // Positive delta = space saved; negative = the algorithm's overhead exceeded its gain
      double delta = ((sourceLength - resultLength) / (double) sourceLength) * 100.0;
      log.info("Algo: {}, source size: {}, compressed size: {}, delta: {}", algo, sourceLength, resultLength, delta);
      assertArrayEquals(sourceBytes, decompressedSourceBytes);
      csvLine.add(String.valueOf(resultLength));
    }
    csvOut.add(csvLine);
  }

  /**
   * Compresses the given bytes in memory with the given algorithm.
   * Closing the compressor stream is required to flush trailer/footer bytes.
   *
   * @param sourceBytes raw input
   * @param algo        compressor name for {@link CompressorStreamFactory}
   * @return the complete compressed representation
   */
  private byte[] compress(byte[] sourceBytes, String algo) throws CompressorException, IOException {
    // No buffering wrappers needed: both ends are in-memory byte-array streams
    var outStream = new ByteArrayOutputStream();
    try (var compressStream = COMPRESSOR_FACTORY.createCompressorOutputStream(algo, outStream)) {
      // InputStream.transferTo replaces commons-compress IOUtils.copy, deprecated since 1.25
      new ByteArrayInputStream(sourceBytes).transferTo(compressStream);
    }
    return outStream.toByteArray();
  }

  /**
   * Decompresses bytes previously produced by {@link #compress(byte[], String)}.
   *
   * @param compressed compressed input
   * @param algo       the same compressor name used for compression
   * @return the restored original bytes
   */
  private byte[] decompress(byte[] compressed, String algo) throws CompressorException, IOException {
    var inStream = new ByteArrayInputStream(compressed);
    var outStream = new ByteArrayOutputStream();
    try (var compressStream = COMPRESSOR_FACTORY.createCompressorInputStream(algo, inStream)) {
      compressStream.transferTo(outStream);
    }
    return outStream.toByteArray();
  }
}
@Toparvion
Copy link
Author

Dependencies

In Gradle Kotlin DSL format:

    testImplementation("org.apache.commons:commons-compress:1.25.0")
    testImplementation("org.tukaani:xz:1.9")
    testImplementation("org.xerial.snappy:snappy-java:1.1.10.5")
    testImplementation("com.github.luben:zstd-jni:1.5.5-10")

Input

  • A text file in the sandbox directory (see the SOURCE_TEXT_PATH constant)

Output

  • The compress.csv file with all the gathered data (the lengths of result byte arrays)

Sample output

The CSV output may look like this:

Algo,20,50,100,200,300,500,1000,2000,5000,10000,50000,100000
bzip2,57,82,116,179,241,344,586,1005,2351,4297,17743,32567
deflate,28,56,89,149,205,312,562,1014,2464,4660,20516,39205
gz,40,68,101,161,217,324,574,1026,2476,4672,20528,39217
lzma,44,71,112,187,255,377,651,1112,2526,4624,19281,35895
lz4-block,22,52,102,201,298,469,875,1564,3665,6665,26667,48509
lz4-framed,39,69,119,219,317,488,894,1583,3684,6684,26686,48522
snappy-framed,40,69,118,217,309,475,865,1529,3675,6812,28066,55491
xz,76,108,152,232,300,424,696,1160,2572,4668,19328,35940
zstd,29,59,89,147,215,322,580,1044,2509,4727,20981,39989

The raw compressed sizes can be converted to compression ratios in the same way the delta variable is computed in the test:

double delta = ((sourceLength - resultLength) / (double) sourceLength) * 100.00;

This allows the results to be presented in a more readable way, e.g. as a table:

image

or as a chart:

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment