ialbert/fastqc-style-deduplication.py

## fastqc-style-deduplication.py
#
# FastQC style de-duplication stats and plot
#
#
# The input file for this program needs to be generated via the command line with
# a command like so:
#
# cat data.fq | bioawk -c fastx '{ print substr($seq,1, 50) } ' | sort | uniq -c | sort -k1,1 -rn > data.uniq.txt
#
# see bioawk at: https://github.com/lh3/bioawk
#

import sys
import matplotlib.pyplot as plt

def get_count(line):
    "Function to extract the count from a line in the file"
    count, rest = line.strip().split(" ")
    return int(count)

fname = "data.uniq.txt"

# Get the counts into a list of number
counts = map(get_count, open(fname))

# These are the total number of reads.
total = sum(counts)

# These are the number of distinct sequences.
distinct = len(counts)

# This is the number of singletons
single = len(filter(lambda x: x == 1, counts))

print "Total:%s, Distinct:%s, Singleton:%s" % (total, distinct, single)

# Generate the breaks
lower = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500, 1000, 5000, 10000]
upper = lower[1:] + [sys.maxint]

y_sum, y_grp = [], []

for lo, hi in zip(lower, upper):

    # Condition for a value to fall into a break.
    cond = lambda x: lo <= x < hi

    # The data that passes the condition
    vals = filter(cond, counts)

    # Collect either the sum or reads or number of groups
    y_sum.append(sum(vals))
    y_grp.append(len(vals))

x_val = range(len(lower))
x_tick = map(str, lower)

y_sum = [100.0 * x / total for x in y_sum]
y_grp = [100.0 * x / distinct for x in y_grp]

p_sum, = plt.plot(y_sum, '-', lw=3, color='blue')
p_count, = plt.plot(y_grp, '-', lw=3, color='red')
plt.xticks(x_val, x_tick)
plt.yticks(range(0, 100, 10))
plt.grid(True)
plt.title("Percent of seq remaining if deduplicated %4.2f%%" % (100.0 * distinct / total))
plt.legend([p_sum, p_count], ["Total", "Distinct"])
plt.show()
	#
	# FastQC style de-duplication stats and plot
	#
	#
	# The input file for this program needs to be generated via the command line with
	# a command like so:
	#
	# cat data.fq \| bioawk -c fastx '{ print substr($seq,1, 50) } ' \| sort \| uniq -c \| sort -k1,1 -rn > data.uniq.txt
	#
	# see bioawk at: https://github.com/lh3/bioawk
	#

	import sys
	import matplotlib.pyplot as plt

	def get_count(line):
	"Function to extract the count from a line in the file"
	count, rest = line.strip().split(" ")
	return int(count)

	fname = "data.uniq.txt"

	# Get the counts into a list of number
	counts = map(get_count, open(fname))

	# These are the total number of reads.
	total = sum(counts)

	# These are the number of distinct sequences.
	distinct = len(counts)

	# This is the number of singletons
	single = len(filter(lambda x: x == 1, counts))

	print "Total:%s, Distinct:%s, Singleton:%s" % (total, distinct, single)

	# Generate the breaks
	lower = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500, 1000, 5000, 10000]
	upper = lower[1:] + [sys.maxint]

	y_sum, y_grp = [], []

	for lo, hi in zip(lower, upper):

	# Condition for a value to fall into a break.
	cond = lambda x: lo <= x < hi

	# The data that passes the condition
	vals = filter(cond, counts)

	# Collect either the sum or reads or number of groups
	y_sum.append(sum(vals))
	y_grp.append(len(vals))

	x_val = range(len(lower))
	x_tick = map(str, lower)

	y_sum = [100.0 * x / total for x in y_sum]
	y_grp = [100.0 * x / distinct for x in y_grp]

	p_sum, = plt.plot(y_sum, '-', lw=3, color='blue')
	p_count, = plt.plot(y_grp, '-', lw=3, color='red')
	plt.xticks(x_val, x_tick)
	plt.yticks(range(0, 100, 10))
	plt.grid(True)
	plt.title("Percent of seq remaining if deduplicated %4.2f%%" % (100.0 * distinct / total))
	plt.legend([p_sum, p_count], ["Total", "Distinct"])
	plt.show()