Skip to content

Instantly share code, notes, and snippets.

@walterst
Created December 17, 2018 16:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save walterst/22c9fb9d1f817eae55c14a84b1b106d9 to your computer and use it in GitHub Desktop.
Save walterst/22c9fb9d1f817eae55c14a84b1b106d9 to your computer and use it in GitHub Desktop.
Randomly subsamples a directory of fastq.gz files, writes out subsampled fastq files to output directory
#!/usr/bin/env
from sys import argv
from random import random
#from gzip import open as gz_open
from glob import glob
import gzip
import os
from os.path import abspath, basename, exists, dirname, join, splitext, isfile
from skbio.util import remove_files, create_dir
from cogent.parse.fastq import MinimalFastqParser
threshold = 0.03 # random value between 0 and 1 to retain read, using 3% now
input_dir = argv[1]
output_dir = argv[2]
create_dir(output_dir)
fastq_files = []
fastq_files = glob(input_dir + "/*.fastq.gz")
for curr_file in fastq_files:
if curr_file.endswith('.gz'):
query_reads = gzip.open(curr_file, "rb")
else:
query_reads = open(curr_file, "U")
curr_outfile = join(output_dir, basename(curr_file).replace(".fastq.gz", ".fastq"))
out_f = open(curr_outfile, "w")
for read_data in MinimalFastqParser(query_reads, strict=False):
if random() < threshold:
curr_read = "@%s\n" % read_data[0]
curr_read += "%s\n" % read_data[1]
curr_read += "+\n"
curr_read += "%s\n" % read_data[2]
out_f.write(curr_read)
query_reads.close()
out_f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment