danielecook/Check_Fastqs.py

## Check_Fastqs.py
#!/usr/bin/python
import re
from itertools import groupby as g
import subprocess
import sys
from collections import OrderedDict

def most_common(L):
  return max(g(sorted(L)), key=lambda(x, v):(len(list(v)),-L.index(x)))[0]

# Set this variable to ensure no quality score lines get examined.
fq_at_start = "HWI"

r = subprocess.check_output("""
for r in `ls *fastq.gz`;
do
echo "$r"
gunzip -cq $r | head -n 1000 | grep '^@%s' - | grep -v '^@@' |  egrep '(:.+){4}' -
echo "|"
done;
""" % fq_at_start, shell=True)


f = open("fastq_summary.txt", "w")


orig_line =  OrderedDict([("file", ""),
			("instrument", ""),
			("flowcell_id", ""),
			("flowcell_lane", ""),
			("x_coord", ""),
			("y_coord", ""),
			("index", ""),
			("pair", ""),
			("run_id", ""),
			("filtered",""),
			("control_bits","")])
l_keys = orig_line.keys()

f.write('\t'.join(l_keys) + "\n")

for fq_group in [filter(len,x.split('\n')) for x in r.split("|")][:-1]:
	index_set = []
	for heading in fq_group[1:]:
		l = re.split(r'(\:|#| )',heading)
		line = orig_line
		line["file"] = fq_group[0]
                if l[0].startswith("@SRR"):
                    line["run_id"] = l[0]
                    line["instrument"] = l[2]
                    line["flowcell_lane"] = l[4]
                    index_set.append("")
		elif len(l) == 11:
			line["instrument"] = l[0]
			line["flowcell_lane"] = l[2]
			line["flowcell_tile"] = l[4]
			line["x_coord"] = l[6]
			line["y_coord"] = l[8]
			try:
				line["pair"] = l[10].split("/")[1]
				index_set.append(l[10].split("/")[0])
			except:
				break
		elif len(l) == 21:
			line["instrument"] = l[0]
			line["run_id"] = l[2]
			line["flowcell_id"] = l[4]
			line["flowcell_lane"] = l[6]
			line["flowcell_tile"] = l[8]
			line["x_coord"] = l[10]
			line["y_coord"] = l[12]
			line["pair"] = l[14]
			line["filtered"] = l[16]
			line["control_bits"] = l[16]
			line["index"] = l[20]
			index_set.append(l[20])
		else:
			print "error", l
	line["index"] = most_common(index_set)
	f.write('\t'.join([line[x] for x in l_keys]+ ["\n"]))
	#!/usr/bin/python
	import re
	from itertools import groupby as g
	import subprocess
	import sys
	from collections import OrderedDict

	def most_common(L):
	return max(g(sorted(L)), key=lambda(x, v):(len(list(v)),-L.index(x)))[0]

	# Set this variable to ensure no quality score lines get examined.
	fq_at_start = "HWI"

	r = subprocess.check_output("""
	for r in `ls *fastq.gz`;
	do
	echo "$r"
	gunzip -cq $r \| head -n 1000 \| grep '^@%s' - \| grep -v '^@@' \| egrep '(:.+){4}' -
	echo "\|"
	done;
	""" % fq_at_start, shell=True)


	f = open("fastq_summary.txt", "w")


	orig_line = OrderedDict([("file", ""),
	("instrument", ""),
	("flowcell_id", ""),
	("flowcell_lane", ""),
	("x_coord", ""),
	("y_coord", ""),
	("index", ""),
	("pair", ""),
	("run_id", ""),
	("filtered",""),
	("control_bits","")])
	l_keys = orig_line.keys()

	f.write('\t'.join(l_keys) + "\n")

	for fq_group in [filter(len,x.split('\n')) for x in r.split("\|")][:-1]:
	index_set = []
	for heading in fq_group[1:]:
	l = re.split(r'(\:\|#\| )',heading)
	line = orig_line
	line["file"] = fq_group[0]
	if l[0].startswith("@SRR"):
	line["run_id"] = l[0]
	line["instrument"] = l[2]
	line["flowcell_lane"] = l[4]
	index_set.append("")
	elif len(l) == 11:
	line["instrument"] = l[0]
	line["flowcell_lane"] = l[2]
	line["flowcell_tile"] = l[4]
	line["x_coord"] = l[6]
	line["y_coord"] = l[8]
	try:
	line["pair"] = l[10].split("/")[1]
	index_set.append(l[10].split("/")[0])
	except:
	break
	elif len(l) == 21:
	line["instrument"] = l[0]
	line["run_id"] = l[2]
	line["flowcell_id"] = l[4]
	line["flowcell_lane"] = l[6]
	line["flowcell_tile"] = l[8]
	line["x_coord"] = l[10]
	line["y_coord"] = l[12]
	line["pair"] = l[14]
	line["filtered"] = l[16]
	line["control_bits"] = l[16]
	line["index"] = l[20]
	index_set.append(l[20])
	else:
	print "error", l
	line["index"] = most_common(index_set)
	f.write('\t'.join([line[x] for x in l_keys]+ ["\n"]))