Gzip compression heatmap

Generates a heatmap plot showing the compression ratio of successive blocks through a file.

This can reveal interesting patterns in FASTQ files and may be useful for diagnosing pathological or unusual data. I find the zscore transformation makes the plot more informative.

Run like:

./compression_heatmap.py -b 1048576 -c rainbow -t zscore SRR11794587_2.fastq.gz
#!/usr/bin/env python
import argparse
import gzip
import os

import numpy as np
import matplotlib.pyplot as plt


def compress_block(block: bytes) -> float:
    compressed = gzip.compress(block)
    ratio = (
        len(block) / len(compressed) if compressed else float("inf")
    )  # Prevent division by zero
    return ratio


def is_gzipped(file_path: str) -> bool:
    with open(file_path, "rb") as f:
        return f.read(2) == b"\x1f\x8b"


def calculate_compression_ratios(file_path: str, block_size: int) -> np.ndarray:
    compression_ratios = []
    open_func = gzip.open if is_gzipped(file_path) else open
    with open_func(file_path, "rb") as f:
        while True:
            block = f.read(block_size)
            if not block:
                break
            ratio = compress_block(block)
            compression_ratios.append(ratio)
    # Convert the list of ratios to a 1D array
    ratios_array = np.array(compression_ratios)
    return ratios_array


def log_transform_ratios(ratios_array: np.ndarray) -> np.ndarray:
    # Add a small constant to avoid log(0)
    small_constant = 1e-5
    adjusted_ratios = (
        ratios_array + small_constant - min(ratios_array) + 1
    )  # Ensure all values are positive
    log_ratios = np.log(adjusted_ratios)
    return log_ratios


def z_score_standardize_ratios(ratios_array: np.ndarray) -> np.ndarray:
    mean = np.mean(ratios_array)
    std = np.std(ratios_array)
    standardized_ratios = (ratios_array - mean) / std
    return standardized_ratios


def generate_heatmap(
    ratios_array: np.ndarray, output_image: str, cmap: str, transform: str = None
) -> None:
    plt.figure(figsize=(10, 5))
    plt.imshow(ratios_array.reshape(1, -1), cmap=cmap, aspect="auto")
    plt.colorbar()
    plt.title("Heatmap of Compression Ratios")
    plt.xlabel("Block Number")
    ylabel = "Compression Ratio"
    if transform == "log":
        ylabel = "Log of Compression Ratio"
    elif transform == "zscore":
        ylabel = "Z-Score of Compression Ratio"
    plt.ylabel(ylabel)
    plt.savefig(output_image)
    plt.close()


def save_ratios_tsv(
    ratios_array: np.ndarray, output_filename: str, block_size: int
) -> None:
    tsv_file_name = os.path.join(
        os.getcwd(), output_filename + "_compression_ratios.tsv.gz"
    )
    # Save the compression ratios as a gzip compressed TSV file
    with gzip.open(tsv_file_name, "wt") as f:
        f.write(f"#block_size={block_size}\n")
        for i, ratio in enumerate(ratios_array):
            f.write(f"{i}\t{ratio}\n")
    print(f"Compression ratios saved as: {tsv_file_name}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate a heatmap and TSV of compression ratios for a file."
    )
    parser.add_argument("file", type=str, help="File to compress and analyze")
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="Output image filename (PNG format)",
        default=None,
    )
    parser.add_argument(
        "-b",
        "--block_size",
        type=int,
        default=1024,
        help="Block size for compression (default: 1024)",
    )
    parser.add_argument(
        "-w",
        "--window_size",
        type=int,
        default=1,
        help="Window size to average compression ratios (default: 1)",
    )
    parser.add_argument(
        "-c",
        "--cmap",
        type=str,
        default="rainbow",
        help="Matplotlib colormap for the heatmap (default: rainbow)",
    )
    parser.add_argument(
        "-t",
        "--transform",
        type=str,
        default=None,
        help='Apply a transformation to the ratios (options: "log", "zscore")',
    )
    args = parser.parse_args()

    file_path = args.file
    block_size = args.block_size
    window_size = args.window_size
    cmap = args.cmap

    base_filename = os.path.splitext(os.path.basename(file_path))[0]
    if args.output:
        output_image = args.output
    else:
        output_image = (
            f"{os.path.splitext(base_filename)[0]}_compression_ratio_heatmap.png"
        )

    ratios_array = calculate_compression_ratios(file_path, block_size)

    output_tsv_filename = (
        base_filename
        if args.output is None
        else os.path.splitext(os.path.basename(args.output))[0]
    )
    save_ratios_tsv(ratios_array, output_tsv_filename, block_size)

    if args.transform == "log":
        ratios_array = log_transform_ratios(ratios_array)
    elif args.transform == "zscore":
        ratios_array = z_score_standardize_ratios(ratios_array)

    if window_size > 1:
        ratios_array = np.convolve(
            ratios_array, np.ones(window_size) / window_size, mode="valid"
        )

    generate_heatmap(ratios_array, output_image, cmap, args.transform)
    print(f"Heatmap saved as: {output_image}")
Example:

[Example compression ratio heatmap images for SRR11794587_1 and SRR11794587_2]

Consider also exploring the impact of bgzip (e.g. via https://github.com/DataBiosphere/bgzip).
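
A rough sketch of how per-block gzip vs bgzip ratios could be compared, assuming htslib's bgzip command-line tool is on the PATH (calling the CLI via subprocess rather than the linked Python package; the helper names here are my own, not part of the gist):

import gzip
import subprocess

def bgzip_compress(block: bytes) -> bytes:
    # Assumes the htslib `bgzip` binary is installed; with -c it reads the
    # data from stdin and writes the compressed stream to stdout.
    result = subprocess.run(
        ["bgzip", "-c"], input=block, stdout=subprocess.PIPE, check=True
    )
    return result.stdout

def compare_block(block: bytes) -> tuple:
    # Return (gzip ratio, bgzip ratio) for a single block.
    gz_ratio = len(block) / len(gzip.compress(block))
    bgz_ratio = len(block) / len(bgzip_compress(block))
    return gz_ratio, bgz_ratio

if __name__ == "__main__":
    block = b"ACGT" * 8192  # toy 32 KiB block of repetitive sequence
    print(compare_block(block))

Note that bgzip adds per-block headers and a trailing empty EOF block, so its ratios will come out slightly lower than plain gzip on the same data.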

It might make more sense for the default block size to be 32000, matching the default gzip block size. For gzip, it probably doesn't make sense for the block size to be larger than 32000 (since blocks are compressed independently, and the block size isn't configurable in the Python stdlib gzip module).
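
For example, matching that block size with the script's existing -b flag:

./compression_heatmap.py -b 32000 -c rainbow -t zscore SRR11794587_2.fastq.gz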
