bananabenana/slice_multi_genbank.py

## slice_multi_genbank.py
#!/usr/bin/env python
# Slices/subsections genbank files by either gene names or base pair range. This handles multi-genbank files which have multiple contigs

# Usage:
#   slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -o gene_sliced_output.gbk # This will extract the region between two genes including these genes
#   slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -n 2 -o gene_sliced_output.gbk # This will extract 2 genes up- and down-stream of your specified two genes
#   slice_multi_genbank.py -i infile.gbk -g gene1: -o gene_extract_output.gbk # This will extract a single gene
#   slice_multi_genbank.py -i infile.gbk -g gene1: -n 3 -o gene_extract_output.gbk # This will extract a single gene and 3 genes up- and down-stream of the gene
#   slice_multi_genbank.py -i infile.gbk -r start:end:contig -o range_sliced_output.gbk # This will slice via base pair range and locus/contig. This is required to handle multi-genbank files

# Inputs:
#   1) Genbank file [.gbk, .gbff]
#   2) One of:
#       - Two gene names
#       - Two base pair range and locus

# Requirements:
#   - biopython (any)

# Author: Ben Vezina
#   - Scholar: https://scholar.google.com/citations?user=Rf9oh94AAAAJ&hl=en&oi=ao
#   - ORCID: https://orcid.org/0000-0003-4224-2537

# Citation: https://gist.github.com/bananabenana/20ff257f237d5a6e6f449fd7066577a1
# Adapted from https://gist.github.com/jrjhealey/2df3c65c7a70cbca4862e94620e4d7b2

from Bio import SeqIO
import sys, argparse
from argparse import RawTextHelpFormatter


def get_args():
    """Parse command line arguments"""

    try:
        parser = argparse.ArgumentParser(
            description='''
description:
  Subset genbanks between 1 gene, 2 genes or base pair ranges.\n
usage:

  To slice/extract by gene names:
  \tslice_multi_genbank.py -i infile.gbk -g gene1:gene2 -o gene_sliced_output.gbk

  To slice/extract by gene name +- a number of genes up- and down-stream:
  slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -n [integer] -o gene_sliced_output.gbk

  To extract a gene by its name:
  \tslice_multi_genbank.py -i infile.gbk -g gene1: -o gene_extract_output.gbk

  To extract a gene by its name +- a number of genes up- and down-stream:
  \tslice_multi_genbank.py -i infile.gbk -g gene1: -n [integer] -o gene_extract_output.gbk

  To slice by base pair range coordinates:
  \tslice_multi_genbank.py -i infile.gbk -r start:end:contig -o range_sliced_output.gbk''', formatter_class=RawTextHelpFormatter)
        parser.add_argument('-i', '--infile', action='store',
                            help='Input genbank [.gbk, .gbff] file to slice or subsection. Can be a multi-genbank file with multiple contigs')
        parser.add_argument('-g', '--genes', action='store',
                            help='''The two gene names to slice between. Example: -g hemF:maeB
A single gene can be extracted using one gene name. Example: -g hemF:''')
        parser.add_argument('-n', '--num_genes', help="Number of genes to slice before and after the selected genes. Can be used with -g only. Optional. Example: -n 2", type=int, default=0)
        parser.add_argument('-r', '--range', action='store',
                            help='The two base pair coordinates (range) to slice between. Contig name or locus must be provided. Example: 612899:630276:contig_1')
        parser.add_argument("-o", "--outfile", required=True, help="Output GenBank filename")
        if len(sys.argv) == 1:
            parser.print_help(sys.stderr)
            exit(1)
    except:
        sys.stderr.write("An exception occurred with argument parsing. Check your provided options.")

    return parser.parse_args()


def slice_gene(record, genes, num_genes):
    loci = [feat for feat in record.features if feat.type == "CDS"]
    try:
        gene_start = min([int(l.location.start) for l in loci if 'gene' in l.qualifiers and l.qualifiers['gene'][0] in genes])
        gene_end = max([int(l.location.end) for l in loci if 'gene' in l.qualifiers and l.qualifiers['gene'][0] in genes])

        # Calculate slicing range using index from loci
        start_locus = loci[min([idx for idx, feat in enumerate(loci) if 'gene' in feat.qualifiers and feat.qualifiers['gene'][0] in genes]) - num_genes]
        end_locus = loci[max([idx for idx, feat in enumerate(loci) if 'gene' in feat.qualifiers and feat.qualifiers['gene'][0] in genes]) + num_genes]

        start = max(0, int(start_locus.location.start))
        end = min(len(record), int(end_locus.location.end))

        subrecord = record[start:end]
        print(f"Extracted {genes[0]}:{genes[1]} with {num_genes} genes up/down from {record.id}")
        return subrecord
    except ValueError:
        return None

def slice_range(record, start, end):
    subrecord = record[int(start):int(end)]
    print(f"Extracted {start}:{end} from {record.id}")
    return subrecord

def main():
    args = get_args()

    # Check that both -g and -r options are not used simultaneously
    if args.genes and args.range:
        sys.stderr.write("Error: Both -g and -r options cannot be used simultaneously.\n")
        exit(1)

    modified_records = []

    records = SeqIO.parse(args.infile, 'genbank')

    for record in records:
        sys.stderr.write('Searching {}.\n'.format(record.id))
        if args.genes:
            subrecord = slice_gene(record, args.genes.split(':'), args.num_genes)
            if subrecord:
                modified_records.append(subrecord)
        elif args.range:
            range_parts = args.range.split(':')
            if len(range_parts) != 3:
                sys.stderr.write("Error: When using -r option, provide start:end:contig.\n")
                exit(1)
            start, end, locus = range_parts[0], range_parts[1], range_parts[2]
            if record.name == locus:
                subrecord = slice_range(record, start, end)
                if subrecord:
                    modified_records.append(subrecord)

    # Write modified records to the specified output GenBank file
    with open(args.outfile, "w") as output_handle:
        SeqIO.write(modified_records, output_handle, "genbank")

if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	# Slices/subsections genbank files by either gene names or base pair range. This handles multi-genbank files which have multiple contigs

	# Usage:
	# slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -o gene_sliced_output.gbk # This will extract the region between two genes including these genes
	# slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -n 2 -o gene_sliced_output.gbk # This will extract 2 genes up- and down-stream of your specified two genes
	# slice_multi_genbank.py -i infile.gbk -g gene1: -o gene_extract_output.gbk # This will extract a single gene
	# slice_multi_genbank.py -i infile.gbk -g gene1: -n 3 -o gene_extract_output.gbk # This will extract a single gene and 3 genes up- and down-stream of the gene
	# slice_multi_genbank.py -i infile.gbk -r start:end:contig -o range_sliced_output.gbk # This will slice via base pair range and locus/contig. This is required to handle multi-genbank files

	# Inputs:
	# 1) Genbank file [.gbk, .gbff]
	# 2) One of:
	# - Two gene names
	# - Two base pair range and locus

	# Requirements:
	# - biopython (any)

	# Author: Ben Vezina
	# - Scholar: https://scholar.google.com/citations?user=Rf9oh94AAAAJ&hl=en&oi=ao
	# - ORCID: https://orcid.org/0000-0003-4224-2537

	# Citation: https://gist.github.com/bananabenana/20ff257f237d5a6e6f449fd7066577a1
	# Adapted from https://gist.github.com/jrjhealey/2df3c65c7a70cbca4862e94620e4d7b2

	from Bio import SeqIO
	import sys, argparse
	from argparse import RawTextHelpFormatter


	def get_args():
	"""Parse command line arguments"""

	try:
	parser = argparse.ArgumentParser(
	description='''
	description:
	Subset genbanks between 1 gene, 2 genes or base pair ranges.\n
	usage:

	To slice/extract by gene names:
	\tslice_multi_genbank.py -i infile.gbk -g gene1:gene2 -o gene_sliced_output.gbk

	To slice/extract by gene name +- a number of genes up- and down-stream:
	slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -n [integer] -o gene_sliced_output.gbk

	To extract a gene by its name:
	\tslice_multi_genbank.py -i infile.gbk -g gene1: -o gene_extract_output.gbk

	To extract a gene by its name +- a number of genes up- and down-stream:
	\tslice_multi_genbank.py -i infile.gbk -g gene1: -n [integer] -o gene_extract_output.gbk

	To slice by base pair range coordinates:
	\tslice_multi_genbank.py -i infile.gbk -r start:end:contig -o range_sliced_output.gbk''', formatter_class=RawTextHelpFormatter)
	parser.add_argument('-i', '--infile', action='store',
	help='Input genbank [.gbk, .gbff] file to slice or subsection. Can be a multi-genbank file with multiple contigs')
	parser.add_argument('-g', '--genes', action='store',
	help='''The two gene names to slice between. Example: -g hemF:maeB
	A single gene can be extracted using one gene name. Example: -g hemF:''')
	parser.add_argument('-n', '--num_genes', help="Number of genes to slice before and after the selected genes. Can be used with -g only. Optional. Example: -n 2", type=int, default=0)
	parser.add_argument('-r', '--range', action='store',
	help='The two base pair coordinates (range) to slice between. Contig name or locus must be provided. Example: 612899:630276:contig_1')
	parser.add_argument("-o", "--outfile", required=True, help="Output GenBank filename")
	if len(sys.argv) == 1:
	parser.print_help(sys.stderr)
	exit(1)
	except:
	sys.stderr.write("An exception occurred with argument parsing. Check your provided options.")

	return parser.parse_args()


	def slice_gene(record, genes, num_genes):
	loci = [feat for feat in record.features if feat.type == "CDS"]
	try:
	gene_start = min([int(l.location.start) for l in loci if 'gene' in l.qualifiers and l.qualifiers['gene'][0] in genes])
	gene_end = max([int(l.location.end) for l in loci if 'gene' in l.qualifiers and l.qualifiers['gene'][0] in genes])

	# Calculate slicing range using index from loci
	start_locus = loci[min([idx for idx, feat in enumerate(loci) if 'gene' in feat.qualifiers and feat.qualifiers['gene'][0] in genes]) - num_genes]
	end_locus = loci[max([idx for idx, feat in enumerate(loci) if 'gene' in feat.qualifiers and feat.qualifiers['gene'][0] in genes]) + num_genes]

	start = max(0, int(start_locus.location.start))
	end = min(len(record), int(end_locus.location.end))

	subrecord = record[start:end]
	print(f"Extracted {genes[0]}:{genes[1]} with {num_genes} genes up/down from {record.id}")
	return subrecord
	except ValueError:
	return None

	def slice_range(record, start, end):
	subrecord = record[int(start):int(end)]
	print(f"Extracted {start}:{end} from {record.id}")
	return subrecord

	def main():
	args = get_args()

	# Check that both -g and -r options are not used simultaneously
	if args.genes and args.range:
	sys.stderr.write("Error: Both -g and -r options cannot be used simultaneously.\n")
	exit(1)

	modified_records = []

	records = SeqIO.parse(args.infile, 'genbank')

	for record in records:
	sys.stderr.write('Searching {}.\n'.format(record.id))
	if args.genes:
	subrecord = slice_gene(record, args.genes.split(':'), args.num_genes)
	if subrecord:
	modified_records.append(subrecord)
	elif args.range:
	range_parts = args.range.split(':')
	if len(range_parts) != 3:
	sys.stderr.write("Error: When using -r option, provide start:end:contig.\n")
	exit(1)
	start, end, locus = range_parts[0], range_parts[1], range_parts[2]
	if record.name == locus:
	subrecord = slice_range(record, start, end)
	if subrecord:
	modified_records.append(subrecord)

	# Write modified records to the specified output GenBank file
	with open(args.outfile, "w") as output_handle:
	SeqIO.write(modified_records, output_handle, "genbank")

	if __name__ == '__main__':
	main()