Last active
August 29, 2015 14:01
-
-
Save danielecook/68deab00bc57a1f76ba6 to your computer and use it in GitHub Desktop.
This code will pull out the header information from the first 1000 lines of all the fastq's in the folder where it is executed. Then it takes the most commonly found index and outputs a summary for each fastq.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import re | |
from itertools import groupby as g | |
import subprocess | |
import sys | |
from collections import OrderedDict | |
def most_common(L): | |
return max(g(sorted(L)), key=lambda(x, v):(len(list(v)),-L.index(x)))[0] | |
# Set this variable to ensure no quality score lines get examined. | |
fq_at_start = "HWI" | |
r = subprocess.check_output(""" | |
for r in `ls *fastq.gz`; | |
do | |
echo "$r" | |
gunzip -cq $r | head -n 1000 | grep '^@%s' - | grep -v '^@@' | egrep '(:.+){4}' - | |
echo "|" | |
done; | |
""" % fq_at_start, shell=True) | |
f = open("fastq_summary.txt", "w") | |
orig_line = OrderedDict([("file", ""), | |
("instrument", ""), | |
("flowcell_id", ""), | |
("flowcell_lane", ""), | |
("x_coord", ""), | |
("y_coord", ""), | |
("index", ""), | |
("pair", ""), | |
("run_id", ""), | |
("filtered",""), | |
("control_bits","")]) | |
l_keys = orig_line.keys() | |
f.write('\t'.join(l_keys) + "\n") | |
for fq_group in [filter(len,x.split('\n')) for x in r.split("|")][:-1]: | |
index_set = [] | |
for heading in fq_group[1:]: | |
l = re.split(r'(\:|#| )',heading) | |
line = orig_line | |
line["file"] = fq_group[0] | |
if l[0].startswith("@SRR"): | |
line["run_id"] = l[0] | |
line["instrument"] = l[2] | |
line["flowcell_lane"] = l[4] | |
index_set.append("") | |
elif len(l) == 11: | |
line["instrument"] = l[0] | |
line["flowcell_lane"] = l[2] | |
line["flowcell_tile"] = l[4] | |
line["x_coord"] = l[6] | |
line["y_coord"] = l[8] | |
try: | |
line["pair"] = l[10].split("/")[1] | |
index_set.append(l[10].split("/")[0]) | |
except: | |
break | |
elif len(l) == 21: | |
line["instrument"] = l[0] | |
line["run_id"] = l[2] | |
line["flowcell_id"] = l[4] | |
line["flowcell_lane"] = l[6] | |
line["flowcell_tile"] = l[8] | |
line["x_coord"] = l[10] | |
line["y_coord"] = l[12] | |
line["pair"] = l[14] | |
line["filtered"] = l[16] | |
line["control_bits"] = l[16] | |
line["index"] = l[20] | |
index_set.append(l[20]) | |
else: | |
print "error", l | |
line["index"] = most_common(index_set) | |
f.write('\t'.join([line[x] for x in l_keys]+ ["\n"])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Updated to work with two variants of fastq lines.