Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Created January 2, 2013 15:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gregcaporaso/4435371 to your computer and use it in GitHub Desktop.
Save gregcaporaso/4435371 to your computer and use it in GitHub Desktop.
Given an input sequence file, splits sequences randomly into n different files. This is useful for generating files that can be used to test computationally expensive analysis processes: the analysis can be run iteratively on each of the resulting sequence sets, providing preliminary results based on random subsets of the data as the analysis progresses.
#!/usr/bin/env python
# File created on 02 Jan 2013
"""Randomly split the sequences of an input sequence file into n output files.

Each record read from the input file is written to exactly one of the n
output files, and the destination is picked at random per record. Output
files are therefore only roughly equal in size.
"""
from __future__ import division

# Module metadata; __version__ is also exposed to the command-line interface
# through script_info['version'].
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.6.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Release"
from os.path import split, splitext, join
from random import choice
from cogent.parse.fasta import MinimalFastaParser
from qiime.util import (parse_command_line_parameters,
make_option,
qiime_open,
create_dir)
# Command-line interface description; main() passes this dict as keyword
# arguments to parse_command_line_parameters.
script_info = {
    'brief_description': "Given an input sequence file, splits sequences randomly into n different files. This is useful for generating files that can be used to test computationally expensive analysis processes as analyses can be run iteratively on each input sequence set as the process can then be run iteratively, but also provide preliminary results based on random subsets of the data as the analysis progresses.",
    'script_description': "",
    # One (title, description, example command) tuple per usage example.
    'script_usage': [
        ("",
         "Split seqs.fna into 20 different files and write each to a new file in partitioned_seqs/.",
         "%prog -i seqs.fna -o partitioned_seqs/ -n 20"),
    ],
    'output_description': "",
    'required_options': [
        make_option('-i', '--input_fp', type="existing_filepath",
                    help='the input sequence file'),
        make_option('-o', '--output_dir', type="new_dirpath",
                    help='the output sequence directory'),
        make_option('-n', '--n', type="int",
                    help='the number of roughly-equal-sized output files to generate'),
    ],
    'optional_options': [],
    'version': __version__,
}
def partition_seqs(input_seqs, output_fs):
    """Write every record of input_seqs to one randomly chosen output file.

    input_seqs: iterable of records; each record is interpolated into a
     two-placeholder format string, so it must be a 2-tuple (presumably
     (sequence id, sequence) pairs as produced by the fasta parser used in
     main() -- confirm against the caller).
    output_fs: non-empty sequence of objects exposing write(); the
     destination of each record is drawn from it with random.choice.

    Records are written in fasta layout: a '>'-prefixed header line followed
    by a line holding the second element of the record.
    """
    for record in input_seqs:
        # An independent random draw per record: file sizes come out roughly,
        # not exactly, equal.
        destination = choice(output_fs)
        destination.write('>%s\n%s\n' % record)
def main():
    """Parse the command line and split the input file into n output files.

    Output files are created in opts.output_dir and named
    <input basename>_<i><input extension> for i in 1..n. Every sequence
    record of the input file is then handed to partition_seqs, which writes
    it to one randomly chosen output file.

    Exits through option_parser.error if -n is smaller than 1.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # With no output files, partition_seqs would fail on the first record
    # with an obscure IndexError from random.choice; reject it up front.
    if opts.n < 1:
        option_parser.error(
            'The number of output files (-n) must be at least 1, got %s.'
            % opts.n)

    input_basename, input_ext = splitext(split(opts.input_fp)[1])
    create_dir(opts.output_dir)

    output_fs = []
    input_f = None
    try:
        for i in range(opts.n):
            # Format the file name in a single step so that a '%' in the
            # input file name is never interpreted as a format directive.
            output_fn = '%s_%d%s' % (input_basename, i + 1, input_ext)
            output_fs.append(open(join(opts.output_dir, output_fn), 'w'))
        input_f = qiime_open(opts.input_fp)
        partition_seqs(MinimalFastaParser(input_f), output_fs)
    finally:
        # Release every handle opened so far, even when opening a later
        # file or the partitioning itself raised.
        if input_f is not None:
            input_f.close()
        for output_f in output_fs:
            output_f.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment