Skip to content

Instantly share code, notes, and snippets.

@ivan-krukov
Created November 21, 2012 20:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ivan-krukov/4127306 to your computer and use it in GitHub Desktop.
Save ivan-krukov/4127306 to your computer and use it in GitHub Desktop.
Random sample of a FASTQ file
#Take a fraction of random sequence reads from a fastq file
from sh import wc
import argparse
import random
def first_word(string):
return string.strip().split()[0]
#read a file in chunks of deflines
def read_segments(filename,deflines):
data = []
with open(filename) as handle:
line = handle.readline()
while line != "":
chunk = []
for i in range(deflines):
chunk.append(line)
line = handle.readline()
data.append("".join(chunk))
return data
parser = argparse.ArgumentParser()
parser.add_argument("input_file",type=str,help="input fastq file")
parser.add_argument("output_signature",type=str,help="output fastq file signature")
parser.add_argument("fractions",type=int,nargs="+",help="fraction of the file to be used")
args = parser.parse_args()
#number of lines that contain a complete fastq record
deflines = 4
length = int(first_word(wc("-l",args.input_file)))
seq_reads = read_segments(args.input_file,deflines)
for n in args.fractions:
nlines = int(length/n/deflines)
sample = random.sample(seq_reads,nlines)
output_file = args.output_signature+"_"+str(n)
with open(output_file,"w") as handle:
for seq in sample:
handle.write(seq)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment