Skip to content

Instantly share code, notes, and snippets.

@brentp
Last active December 23, 2015 10:59
Show Gist options
  • Save brentp/6625544 to your computer and use it in GitHub Desktop.
Save brentp/6625544 to your computer and use it in GitHub Desktop.
split a fastq file into evenly sized random chunks.
"""
split a single fastq file into random, non-overlapping subsets
arguments:
+ fastq file
+ number of splits
+ number of reps
e.g.:
python fq.split.py input.fastq 3 4
will create 12 new files in 4 sets of 3. Each
set of 3 will contain all of the original records.
"""
import gzip
import random
import sys
from itertools import islice, izip
def xopen(fq):
    """Open ``fq`` for reading, decompressing it when the name ends in ``.gz``.

    Args:
        fq: path to a plain or gzip-compressed file.

    Returns:
        An open file object that yields ``str`` lines.
    """
    if fq.endswith('.gz'):
        # On Python 3 gzip.open() defaults to binary mode and would hand back
        # bytes; ask for text there so both branches yield str lines.  On
        # Python 2 (where str is bytes) the default mode is kept unchanged.
        mode = "rb" if str is bytes else "rt"
        return gzip.open(fq, mode)
    return open(fq)
def fqiter(fq, n=4):
    """Yield the records of a FASTQ file, one list of ``n`` lines at a time.

    Whitespace-only lines are skipped, and carriage-return / newline
    characters are stripped from both ends of every remaining line.

    Args:
        fq: path to the FASTQ file (opened through ``xopen``).
        n: number of lines that make up one record (4 for standard FASTQ).

    Yields:
        A list of ``n`` strings for each record, in file order.

    Raises:
        ValueError: if the file ends in the middle of a record.
    """
    with xopen(fq) as fh:
        fqclean = (x.strip("\r\n") for x in fh if x.strip())
        while True:
            rec = list(islice(fqclean, n))
            if not rec:
                # Plain return ends the generator; raising StopIteration from
                # inside a generator is turned into RuntimeError by PEP 479.
                return
            if len(rec) != n:
                # Compare against n (not a hard-coded 4) so the parameter is
                # honoured, and raise instead of assert so the check
                # survives python -O.
                raise ValueError(
                    "truncated record at end of %s: expected %i lines, got %i"
                    % (fq, n, len(rec)))
            yield rec
def fqsplit(fq, nchunks, nreps, prefix=None):
    """Split a FASTQ file into random, non-overlapping chunks, several times.

    For each of ``nreps`` repetitions the records of ``fq`` are randomly
    distributed over ``nchunks`` output files, so that every repetition
    contains each original record exactly once.  Output files are named
    ``prefix + "chunk-<c>.rep-<r>.fq"`` with 1-based chunk and rep numbers.

    Args:
        fq: path to the input FASTQ file (plain or ``.gz``).
        nchunks: number of output files per repetition; must be >= 1.
        nreps: number of independent random splits to produce.
        prefix: leading part of every output file name; defaults to
            ``fq + ".split"``.

    Raises:
        ValueError: if ``nchunks`` is smaller than 1 or the number of
            non-blank lines in ``fq`` is not a multiple of 4.
    """
    if nchunks < 1:
        raise ValueError("nchunks must be at least 1, got %r" % (nchunks,))
    if prefix is None:
        prefix = fq + ".split"
    # The template is formatted on its own and then appended to the prefix,
    # so a literal '%' in the path cannot corrupt the format string.
    name_template = "chunk-%i.rep-%i.fq"

    # Count lines the same way fqiter() reads them (blank lines ignored) and
    # close the handle afterwards.
    with xopen(fq) as fh:
        nlines = sum(1 for line in fh if line.strip())
    if nlines % 4 != 0:
        raise ValueError(
            "%s has %i non-blank lines, which is not a multiple of 4"
            % (fq, nlines))
    nrecords = nlines // 4  # floor division keeps this an int on Python 3

    # Size of the largest chunk (ceiling division); reported for information.
    chunk_size = -(-nrecords // nchunks)
    sys.stderr.write("chunk_size: %i\n" % chunk_size)

    for rep in range(1, nreps + 1):
        files = []
        try:
            for c in range(1, nchunks + 1):
                files.append(open(prefix + name_template % (c, rep), 'w'))
            # Give each record a unique random slot; slot % nchunks then
            # deals the records out so chunk sizes differ by at most one and
            # no chunk is left empty while another holds extra records.
            slots = list(range(nrecords))
            random.shuffle(slots)
            for idx, fqr in enumerate(fqiter(fq)):
                out = files[slots[idx] % nchunks]
                out.write("\n".join(fqr) + "\n")
        finally:
            # Close whatever was opened, even if writing failed part-way.
            for f in files:
                f.close()
if __name__ == "__main__":
    # Command-line entry point: <fastq> <number of splits> <number of reps>.
    if len(sys.argv) < 4:
        # Exit with a usage message instead of an IndexError traceback when
        # arguments are missing.
        sys.exit("usage: python %s <fastq> <nchunks> <nreps>" % sys.argv[0])
    fq = sys.argv[1]
    nchunks = int(sys.argv[2])
    nreps = int(sys.argv[3])
    fqsplit(fq, nchunks, nreps)
@Hanghangfels
Copy link

Hi, I tried to use this script, but I got an error. The command I used is python fq.split.py. FTI.fastq 10 3.

I hope you can help. Thanks.

Hanghang

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment