Created
July 23, 2014 19:11
-
-
Save ialbert/4b45cea31e79920607b8 to your computer and use it in GitHub Desktop.
FastQC deduplication plot
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# FastQC style de-duplication stats and plot | |
# | |
# | |
# The input file for this program needs to be generated via the command line with | |
# a command like so: | |
# | |
# cat data.fq | bioawk -c fastx '{ print substr($seq,1, 50) } ' | sort | uniq -c | sort -k1,1 -rn > data.uniq.txt | |
# | |
# see bioawk at: https://github.com/lh3/bioawk | |
# | |
import sys | |
import matplotlib.pyplot as plt | |
def get_count(line): | |
"Function to extract the count from a line in the file" | |
count, rest = line.strip().split(" ") | |
return int(count) | |
fname = "data.uniq.txt" | |
# Get the counts into a list of number | |
counts = map(get_count, open(fname)) | |
# These are the total number of reads. | |
total = sum(counts) | |
# These are the number of distinct sequences. | |
distinct = len(counts) | |
# This is the number of singletons | |
single = len(filter(lambda x: x == 1, counts)) | |
print "Total:%s, Distinct:%s, Singleton:%s" % (total, distinct, single) | |
# Generate the breaks | |
lower = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500, 1000, 5000, 10000] | |
upper = lower[1:] + [sys.maxint] | |
y_sum, y_grp = [], [] | |
for lo, hi in zip(lower, upper): | |
# Condition for a value to fall into a break. | |
cond = lambda x: lo <= x < hi | |
# The data that passes the condition | |
vals = filter(cond, counts) | |
# Collect either the sum or reads or number of groups | |
y_sum.append(sum(vals)) | |
y_grp.append(len(vals)) | |
x_val = range(len(lower)) | |
x_tick = map(str, lower) | |
y_sum = [100.0 * x / total for x in y_sum] | |
y_grp = [100.0 * x / distinct for x in y_grp] | |
p_sum, = plt.plot(y_sum, '-', lw=3, color='blue') | |
p_count, = plt.plot(y_grp, '-', lw=3, color='red') | |
plt.xticks(x_val, x_tick) | |
plt.yticks(range(0, 100, 10)) | |
plt.grid(True) | |
plt.title("Percent of seq remaining if deduplicated %4.2f%%" % (100.0 * distinct / total)) | |
plt.legend([p_sum, p_count], ["Total", "Distinct"]) | |
plt.show() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment