Skip to content

Instantly share code, notes, and snippets.

@fedarko
Last active December 17, 2023 22:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fedarko/0af5d41193abfbec168ce5e643fdc9d5 to your computer and use it in GitHub Desktop.
Compute simple statistics (number of reads, total read length, average read length) for a set of (maybe gzipped) FASTA / FASTQ files
#! /usr/bin/env python3
#
# Computes the total number of reads, total read length, and average read
# length of a set of (maybe gzipped) FASTA / FASTQ files. Requires the pyfastx
# library (https://github.com/lmdu/pyfastx). I designed this in the context of
# computing read statistics, but if you have a set of other sequences (e.g.
# contigs) then I guess this would still work for that.
#
# USAGE:
# ./read_stats.py file1.fa [file2.fa ...]
#
# Each input file's suffix should match one of the suffixes in FASTA_SUFFIXES
# or FASTQ_SUFFIXES defined below, optionally with a ".gz" added to the end
# (e.g. "file1.fastq.gz" is fine). Also, you can use globbing (e.g. "*.fa") and
# that should work also.
import sys
import time
import glob
import pyfastx
FASTA_SUFFIXES = ["fa", "fasta", "fna"]
FASTQ_SUFFIXES = ["fq", "fastq"]
t0 = time.time()
def _log(msg):
print(f"{time.time() - t0:,.2f}s. {msg}")
_log("Starting...")
if len(sys.argv) > 1:
ir_paths = sys.argv[1:]
else:
raise ValueError("We need at least 1 arg (FASTA/FASTQ filepath)")
num_files_seen = 0
total_num_reads = 0
total_read_length = 0
def _parser_class_for(path):
    """Return the pyfastx parser class (Fasta or Fastq) to use for *path*.

    The decision is based on the file's extension, case-insensitively,
    optionally followed by ".gz" (e.g. "x.fastq.gz").

    Raises:
        ValueError: if the extension matches neither FASTA_SUFFIXES nor
            FASTQ_SUFFIXES.
    """
    lower = path.lower()
    for suffixes, parser in (
        (FASTA_SUFFIXES, pyfastx.Fasta),
        (FASTQ_SUFFIXES, pyfastx.Fastq),
    ):
        # Require a "." before the suffix so that e.g. a file named
        # "reads_fa" (no real extension) is not silently misclassified.
        # (The original endswith(s) check had no separator.)
        exts = tuple("." + s for s in suffixes)
        if lower.endswith(exts + tuple(e + ".gz" for e in exts)):
            return parser
    raise ValueError(f"Not a FASTA/FASTQ file?: {path}")


# Go through each general path -- this could be a single file, or a reference
# to multiple files (e.g. "myfiles/*.fastq.gz", which could correspond to
# arbitrarily many files)
for irp in ir_paths:
    # Use glob.glob() to convert these to lists of real file paths.
    # This is of course vulnerable to a race condition if some antagonist
    # deletes a file in between calling glob.glob() here and when we attempt
    # to parse it below; any such failure will surface as an error from
    # pyfastx, which is acceptable for this script.
    for irf in glob.glob(irp):
        tf0 = time.time()
        num_files_seen += 1
        print("-" * 79)
        _log(f"On file #{num_files_seen:,} ({irf})...")
        # pyfastx exposes the record count via len() and the total sequence
        # length via .size.
        f = _parser_class_for(irf)(irf)
        file_num_reads = len(f)
        file_read_length = f.size
        tf1 = time.time()
        _log(
            f"Done processing file #{num_files_seen}; took {tf1 - tf0:,.2f}s."
        )
        _log(
            f"File had {file_num_reads:,} reads of total length "
            f"{file_read_length:,}."
        )
        total_num_reads += file_num_reads
        total_read_length += file_read_length
# Guard against ZeroDivisionError when no files matched any input path
# (e.g. every glob pattern came up empty): report an average of 0 instead
# of crashing after the (empty) run completed.
avg_read_length = total_read_length / total_num_reads if total_num_reads else 0
t1 = time.time()
print("=" * 79)
_log(f"Done: processed {num_files_seen:,} files.")
_log(f"Total time taken: {t1 - t0:,.2f} sec.")
_log(f"Total number of reads: {total_num_reads:,}")
_log(f"Total read length: {total_read_length:,}")
_log(f"Average read length: {avg_read_length:,}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment