# walterst/random_subsample_fastq.py

## random_subsample_fastq.py
#!/usr/bin/env python

from sys import argv
from random import random

#from gzip import open as gz_open
from glob import glob

import gzip
import os

from os.path import abspath, basename, exists, dirname, join, splitext, isfile


from skbio.util import remove_files, create_dir
from cogent.parse.fastq import MinimalFastqParser

# Randomly subsample every gzipped fastq file found directly inside an input
# directory, writing the retained reads as uncompressed fastq files into an
# output directory.
#
# Usage: random_subsample_fastq.py <input_dir> <output_dir>

# Fraction of reads to keep: a read is retained when random() falls below
# this value, so 0.03 keeps roughly 3% of the reads.
threshold = 0.03

if len(argv) < 3:
    # Fail with a readable message instead of a bare IndexError.
    raise SystemExit(
        "Usage: random_subsample_fastq.py <input_dir> <output_dir>")

input_dir = argv[1]
output_dir = argv[2]

create_dir(output_dir)

# Only files named *.fastq.gz directly inside input_dir are processed.
fastq_files = glob(join(input_dir, "*.fastq.gz"))

for curr_file in fastq_files:

    # The glob above only yields .gz paths; the plain-text branch is kept as
    # a fallback in case the pattern is ever widened.
    if curr_file.endswith('.gz'):
        query_reads = gzip.open(curr_file, "rb")
    else:
        query_reads = open(curr_file, "U")

    # The output keeps the input's base name, minus the .gz extension.
    curr_outfile = join(output_dir,
                        basename(curr_file).replace(".fastq.gz", ".fastq"))

    # The with-statement closes both handles even if parsing or writing
    # fails part-way through a file.
    with query_reads, open(curr_outfile, "w") as out_f:
        for read_data in MinimalFastqParser(query_reads, strict=False):
            if random() < threshold:
                # read_data[0], [1] and [2] are written as the header,
                # sequence and quality lines of a four-line fastq record.
                out_f.write("@%s\n%s\n+\n%s\n" %
                            (read_data[0], read_data[1], read_data[2]))