Skip to content

Instantly share code, notes, and snippets.

@danielecook
Last active August 29, 2015 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danielecook/68deab00bc57a1f76ba6 to your computer and use it in GitHub Desktop.
Save danielecook/68deab00bc57a1f76ba6 to your computer and use it in GitHub Desktop.
This code will pull out the header information from the first 1000 lines of all the fastq's in the folder where it is executed. Then it takes the most commonly found index and outputs a summary for each fastq.
#!/usr/bin/python
import re
from itertools import groupby as g
import subprocess
import sys
from collections import OrderedDict
def most_common(L):
return max(g(sorted(L)), key=lambda(x, v):(len(list(v)),-L.index(x)))[0]
# Set this variable to ensure no quality score lines get examined.
fq_at_start = "HWI"
r = subprocess.check_output("""
for r in `ls *fastq.gz`;
do
echo "$r"
gunzip -cq $r | head -n 1000 | grep '^@%s' - | grep -v '^@@' | egrep '(:.+){4}' -
echo "|"
done;
""" % fq_at_start, shell=True)
f = open("fastq_summary.txt", "w")
orig_line = OrderedDict([("file", ""),
("instrument", ""),
("flowcell_id", ""),
("flowcell_lane", ""),
("x_coord", ""),
("y_coord", ""),
("index", ""),
("pair", ""),
("run_id", ""),
("filtered",""),
("control_bits","")])
l_keys = orig_line.keys()
f.write('\t'.join(l_keys) + "\n")
for fq_group in [filter(len,x.split('\n')) for x in r.split("|")][:-1]:
index_set = []
for heading in fq_group[1:]:
l = re.split(r'(\:|#| )',heading)
line = orig_line
line["file"] = fq_group[0]
if l[0].startswith("@SRR"):
line["run_id"] = l[0]
line["instrument"] = l[2]
line["flowcell_lane"] = l[4]
index_set.append("")
elif len(l) == 11:
line["instrument"] = l[0]
line["flowcell_lane"] = l[2]
line["flowcell_tile"] = l[4]
line["x_coord"] = l[6]
line["y_coord"] = l[8]
try:
line["pair"] = l[10].split("/")[1]
index_set.append(l[10].split("/")[0])
except:
break
elif len(l) == 21:
line["instrument"] = l[0]
line["run_id"] = l[2]
line["flowcell_id"] = l[4]
line["flowcell_lane"] = l[6]
line["flowcell_tile"] = l[8]
line["x_coord"] = l[10]
line["y_coord"] = l[12]
line["pair"] = l[14]
line["filtered"] = l[16]
line["control_bits"] = l[16]
line["index"] = l[20]
index_set.append(l[20])
else:
print "error", l
line["index"] = most_common(index_set)
f.write('\t'.join([line[x] for x in l_keys]+ ["\n"]))
@danielecook
Copy link
Author

Updated to work with two variants of fastq lines.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment