Skip to content

Instantly share code, notes, and snippets.

@fedarko
Last active December 17, 2023 22:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fedarko/0af5d41193abfbec168ce5e643fdc9d5 to your computer and use it in GitHub Desktop.
Compute simple statistics (number of reads, total read length, average read length) for a set of (maybe gzipped) FASTA / FASTQ files
#! /usr/bin/env python3
#
# Computes the total number of reads, total read length, and average read
# length of a set of (maybe gzipped) FASTA / FASTQ files. Requires the pyfastx
# library (https://github.com/lmdu/pyfastx). I designed this in the context of
# computing read statistics, but if you have a set of other sequences (e.g.
# contigs) then I guess this would still work for that.
#
# USAGE:
# ./read_stats.py file1.fa [file2.fa ...]
#
# Each input file's suffix should match one of the suffixes in FASTA_SUFFIXES
# or FASTQ_SUFFIXES defined below, optionally with a ".gz" added to the end
# (e.g. "file1.fastq.gz" is fine). Also, you can use globbing (e.g. "*.fa") and
# that should work also.
import sys
import time
import glob
import pyfastx
FASTA_SUFFIXES = ["fa", "fasta", "fna"]
FASTQ_SUFFIXES = ["fq", "fastq"]
t0 = time.time()
def _log(msg):
print(f"{time.time() - t0:,.2f}s. {msg}")
_log("Starting...")
if len(sys.argv) > 1:
ir_paths = sys.argv[1:]
else:
raise ValueError("We need at least 1 arg (FASTA/FASTQ filepath)")
num_files_seen = 0
total_num_reads = 0
total_read_length = 0
def _parser_class_for(path):
    """Return the pyfastx parser class (Fasta or Fastq) to use for *path*.

    The decision is based on the file's extension, case-insensitively,
    optionally followed by ".gz" (e.g. "x.fastq.gz").

    Raises:
        ValueError: if the extension matches neither FASTA_SUFFIXES nor
            FASTQ_SUFFIXES.
    """
    lower = path.lower()
    for suffixes, parser in (
        (FASTA_SUFFIXES, pyfastx.Fasta),
        (FASTQ_SUFFIXES, pyfastx.Fastq),
    ):
        # Require a "." before the suffix so that e.g. a file named
        # "reads_fa" (no real extension) is not silently misclassified.
        # (The original endswith(s) check had no separator.)
        exts = tuple("." + s for s in suffixes)
        if lower.endswith(exts + tuple(e + ".gz" for e in exts)):
            return parser
    raise ValueError(f"Not a FASTA/FASTQ file?: {path}")


# Go through each general path -- this could be a single file, or a reference
# to multiple files (e.g. "myfiles/*.fastq.gz", which could correspond to
# arbitrarily many files)
for irp in ir_paths:
    # Use glob.glob() to convert these to lists of real file paths.
    # This is of course vulnerable to a race condition if some antagonist
    # deletes a file in between calling glob.glob() here and when we attempt
    # to parse it below; any such failure will surface as an error from
    # pyfastx, which is acceptable for this script.
    for irf in glob.glob(irp):
        tf0 = time.time()
        num_files_seen += 1
        print("-" * 79)
        _log(f"On file #{num_files_seen:,} ({irf})...")
        # pyfastx exposes the record count via len() and the total sequence
        # length via .size.
        f = _parser_class_for(irf)(irf)
        file_num_reads = len(f)
        file_read_length = f.size
        tf1 = time.time()
        _log(
            f"Done processing file #{num_files_seen}; took {tf1 - tf0:,.2f}s."
        )
        _log(
            f"File had {file_num_reads:,} reads of total length "
            f"{file_read_length:,}."
        )
        total_num_reads += file_num_reads
        total_read_length += file_read_length
# Guard against ZeroDivisionError when no files matched any input path
# (e.g. every glob pattern came up empty): report an average of 0 instead
# of crashing after the (empty) run completed.
avg_read_length = total_read_length / total_num_reads if total_num_reads else 0
t1 = time.time()
print("=" * 79)
_log(f"Done: processed {num_files_seen:,} files.")
_log(f"Total time taken: {t1 - t0:,.2f} sec.")
_log(f"Total number of reads: {total_num_reads:,}")
_log(f"Total read length: {total_read_length:,}")
_log(f"Average read length: {avg_read_length:,}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment