Skip to content

Instantly share code, notes, and snippets.

@ialbert
Created July 23, 2014 19:11
Show Gist options
  • Save ialbert/4b45cea31e79920607b8 to your computer and use it in GitHub Desktop.
Save ialbert/4b45cea31e79920607b8 to your computer and use it in GitHub Desktop.
FastQC deduplication plot
#
# FastQC style de-duplication stats and plot
#
#
# The input file for this program needs to be generated via the command line with
# a command like so:
#
# cat data.fq | bioawk -c fastx '{ print substr($seq,1, 50) } ' | sort | uniq -c | sort -k1,1 -rn > data.uniq.txt
#
# see bioawk at: https://github.com/lh3/bioawk
#
import sys
import matplotlib.pyplot as plt
def get_count(line):
"Function to extract the count from a line in the file"
count, rest = line.strip().split(" ")
return int(count)
fname = "data.uniq.txt"
# Get the counts into a list of number
counts = map(get_count, open(fname))
# These are the total number of reads.
total = sum(counts)
# These are the number of distinct sequences.
distinct = len(counts)
# This is the number of singletons
single = len(filter(lambda x: x == 1, counts))
print "Total:%s, Distinct:%s, Singleton:%s" % (total, distinct, single)
# Generate the breaks
lower = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500, 1000, 5000, 10000]
upper = lower[1:] + [sys.maxint]
y_sum, y_grp = [], []
for lo, hi in zip(lower, upper):
# Condition for a value to fall into a break.
cond = lambda x: lo <= x < hi
# The data that passes the condition
vals = filter(cond, counts)
# Collect either the sum or reads or number of groups
y_sum.append(sum(vals))
y_grp.append(len(vals))
x_val = range(len(lower))
x_tick = map(str, lower)
y_sum = [100.0 * x / total for x in y_sum]
y_grp = [100.0 * x / distinct for x in y_grp]
p_sum, = plt.plot(y_sum, '-', lw=3, color='blue')
p_count, = plt.plot(y_grp, '-', lw=3, color='red')
plt.xticks(x_val, x_tick)
plt.yticks(range(0, 100, 10))
plt.grid(True)
plt.title("Percent of seq remaining if deduplicated %4.2f%%" % (100.0 * distinct / total))
plt.legend([p_sum, p_count], ["Total", "Distinct"])
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment