Skip to content

Instantly share code, notes, and snippets.

@bluegenes
Created September 30, 2021 21:37
Show Gist options
  • Save bluegenes/0ddd24affdd380b4e6f8f9d4b5d59c14 to your computer and use it in GitHub Desktop.
Save bluegenes/0ddd24affdd380b4e6f8f9d4b5d59c14 to your computer and use it in GitHub Desktop.
split a fastq file into one entry per file
#! /usr/bin/env python
import os
import sys
import argparse
import screed
def make_outdir(output_dirname):
    """Ensure the directory ``output_dirname`` exists, creating it if needed.

    Missing parent directories are created as well. An empty
    ``output_dirname`` is treated as "the current directory" and nothing is
    created, because ``os.makedirs("")`` would raise.

    Args:
        output_dirname: path of the directory to create.

    Raises:
        OSError: if the directory cannot be created, or if the path already
            exists but is not a directory.
    """
    if not output_dirname:
        return
    # exist_ok=True makes the call idempotent and avoids the check-then-create
    # race of testing os.path.exists() first.
    os.makedirs(output_dirname, exist_ok=True)
def main(args):
    """Split a sequence file into one output file per record.

    Every record read from ``args.fasta`` is written to its own file inside
    ``args.output_dir``, and one ``name,length,filename`` row per record is
    written to ``args.output_csv``.

    Args:
        args: parsed command-line namespace providing ``fasta`` (input path),
            ``output_dir`` (destination directory; empty means the current
            directory), ``output_csv`` (summary CSV path) and ``prefix``
            (optional string placed, followed by ".", in front of each
            output file name).

    Returns:
        None.
    """
    # Local import so this function does not depend on the file's import block.
    import csv

    print(f"Splitting {args.fasta} by entry. Writing files to {args.output_dir} \n")
    # An empty output dir must mean "here"; joining "" with "/" would
    # otherwise produce paths in the filesystem root.
    output_dir = args.output_dir or "."
    prefix = f"{args.prefix}." if args.prefix else ""
    make_outdir(output_dir)

    n_written = 0  # stays 0 if the input contains no records
    with open(args.output_csv, 'w', newline="") as info_out:
        # csv.writer quotes fields when needed, so a record name containing
        # a comma cannot corrupt the row.
        writer = csv.writer(info_out, lineterminator="\n")
        writer.writerow(["name", "length", "filename"])
        for n, record in enumerate(screed.open(args.fasta)):
            if n > 0 and n % 1000 == 0:
                print(f"working on {str(n+1)}th entry\n")
            # Keep only the first whitespace-delimited token of the header,
            # and replace "/" so the name is usable as a file name.
            name = record.name.split(" ")[0]
            name = name.replace("/", ".")
            length = len(record.sequence)
            this_filename = os.path.join(output_dir, f"{prefix}{name}.fq")
            # NOTE(review): records are written in FASTA layout (">" header,
            # no quality line) although the extension is ".fq" -- confirm
            # which format downstream consumers expect.
            with open(this_filename, 'w') as out_fq:
                out_fq.write(f">{name}\n{record.sequence}\n")
            writer.writerow([name, length, this_filename])
            n_written = n + 1
    print(f"{n_written} entries written to individual fasta files\n")
def cmdline(sys_args):
    """Command line entry point w/argparse action.

    Args:
        sys_args: list of command-line argument strings, without the program
            name (e.g. ``sys.argv[1:]``). ``None`` makes argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        Whatever ``main`` returns for the parsed arguments.
    """
    p = argparse.ArgumentParser()
    p.add_argument("fasta")
    p.add_argument("--output-dir", default="")
    p.add_argument("--output-csv", required=True)
    p.add_argument("--prefix")
    # Parse the arguments that were passed in rather than implicitly
    # re-reading sys.argv, so the function honours its parameter.
    args = p.parse_args(sys_args)
    return main(args)
# Script entry point: pass the command-line arguments (without the program
# name) to cmdline() and use its result as the process exit status.
if __name__ == '__main__':
    sys.exit(cmdline(sys.argv[1:]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment