Skip to content

Instantly share code, notes, and snippets.

@sminot
Created July 14, 2017 17:32
Show Gist options
  • Save sminot/d5c794c22c48eda8a515cd16bed512f7 to your computer and use it in GitHub Desktop.
Save sminot/d5c794c22c48eda8a515cd16bed512f7 to your computer and use it in GitHub Desktop.
Split a FASTQ file by header
#!/usr/bin/python
"""Split up a FASTQ file based on the first field of the header."""
from collections import defaultdict
import gzip
import sys
import os
def _open_fastq(path):
    """Open the FASTQ file at *path* for reading as text.

    Files whose name ends in ``.gz`` are opened through ``gzip``. The
    explicit ``'rt'`` mode matters: ``gzip.open`` defaults to binary mode,
    which yields ``bytes`` lines on Python 3 that cannot be split with a
    ``str`` separator.
    """
    if path.endswith('.gz'):
        return gzip.open(path, 'rt')
    return open(path)


def _flush(folder, name, lines, written):
    """Write the buffered *lines* of group *name* to its file in *folder*.

    The first write for a group during this run truncates the file so that
    re-running the script does not duplicate records; later writes append.
    *written* is the set of group names already written in this run and is
    updated in place.
    """
    mode = 'a' if name in written else 'w'
    with open("{}/{}.fastq".format(folder, name), mode) as fo:
        fo.write(''.join(lines))
    written.add(name)


def split_fastq(fp, flush_threshold=100000):
    """Split the FASTQ file *fp* into one file per header prefix.

    Each record is taken to be four consecutive lines. The group name is the
    text between the first character of the header line and the first ``_``
    (surrounding whitespace removed). Records are written to
    ``<fp>.split/<name>.fastq``; the folder is created if needed.

    Args:
        fp: Path to a FASTQ file, optionally gzip-compressed (``.gz``).
        flush_threshold: Once a group has more than this many buffered
            lines, they are written to disk to bound memory use.

    Returns:
        The path of the output folder.

    Raises:
        Exception: If *fp* does not exist.
    """
    if not os.path.exists(fp):
        raise Exception("{} does not exist".format(fp))
    folder = "{}.split".format(fp)
    if not os.path.exists(folder):
        os.mkdir(folder)

    records = defaultdict(list)
    written = set()
    # The context manager closes the input even if a write fails midway.
    with _open_fastq(fp) as handle:
        # Pull every line from one iterator; mixing ``for line in f`` with
        # ``f.readline()`` is rejected by Python 2 file objects.
        lines = iter(handle)
        for header in lines:
            if not header.strip():
                # Skip blank lines (e.g. trailing newlines at end of file).
                continue
            # strip() keeps the newline out of the file name when the
            # header contains no '_'.
            name = header[1:].split('_')[0].strip()
            buffered = records[name]
            buffered.append(header)
            # A truncated final record yields '' for its missing lines.
            buffered.extend([next(lines, ''), next(lines, ''), next(lines, '')])
            if len(buffered) > flush_threshold:
                _flush(folder, name, buffered, written)
                records[name] = []

    # Write whatever is still buffered for each group.
    for name, buffered in records.items():
        if buffered:
            _flush(folder, name, buffered, written)
    return folder


def main(argv=None):
    """Command-line entry point: split the FASTQ file named by argv[1]."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("Usage: {} <reads.fastq[.gz]>".format(argv[0] if argv else "split_fastq"))
    split_fastq(argv[1])


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment