Skip to content

Instantly share code, notes, and snippets.

@bananabenana
Forked from jrjhealey/slice_genes.py
Last active August 4, 2023 09:01
Show Gist options
  • Save bananabenana/20ff257f237d5a6e6f449fd7066577a1 to your computer and use it in GitHub Desktop.
Save bananabenana/20ff257f237d5a6e6f449fd7066577a1 to your computer and use it in GitHub Desktop.
Code to 'slice' or subset a multi genbank file using gene name or coordinates
#!/usr/bin/env python
# Slices/subsections genbank files by either gene names or base pair range. This handles multi-genbank files which have multiple contigs
# Usage:
# slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -o gene_sliced_output.gbk # This will extract the region between two genes including these genes
# slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -n 2 -o gene_sliced_output.gbk # This will extract 2 genes up- and down-stream of your specified two genes
# slice_multi_genbank.py -i infile.gbk -g gene1: -o gene_extract_output.gbk # This will extract a single gene
# slice_multi_genbank.py -i infile.gbk -g gene1: -n 3 -o gene_extract_output.gbk # This will extract a single gene and 3 genes up- and down-stream of the gene
# slice_multi_genbank.py -i infile.gbk -r start:end:contig -o range_sliced_output.gbk # This will slice via base pair range and locus/contig. This is required to handle multi-genbank files
# Inputs:
# 1) Genbank file [.gbk, .gbff]
# 2) One of:
# - Two gene names
# - Two base pair range and locus
# Requirements:
# - biopython (any)
# Author: Ben Vezina
# - Scholar: https://scholar.google.com/citations?user=Rf9oh94AAAAJ&hl=en&oi=ao
# - ORCID: https://orcid.org/0000-0003-4224-2537
# Citation: https://gist.github.com/bananabenana/20ff257f237d5a6e6f449fd7066577a1
# Adapted from https://gist.github.com/jrjhealey/2df3c65c7a70cbca4862e94620e4d7b2
from Bio import SeqIO
import sys, argparse
from argparse import RawTextHelpFormatter
def get_args():
"""Parse command line arguments"""
try:
parser = argparse.ArgumentParser(
description='''
description:
Subset genbanks between 1 gene, 2 genes or base pair ranges.\n
usage:
To slice/extract by gene names:
\tslice_multi_genbank.py -i infile.gbk -g gene1:gene2 -o gene_sliced_output.gbk
To slice/extract by gene name +- a number of genes up- and down-stream:
slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -n [integer] -o gene_sliced_output.gbk
To extract a gene by its name:
\tslice_multi_genbank.py -i infile.gbk -g gene1: -o gene_extract_output.gbk
To extract a gene by its name +- a number of genes up- and down-stream:
\tslice_multi_genbank.py -i infile.gbk -g gene1: -n [integer] -o gene_extract_output.gbk
To slice by base pair range coordinates:
\tslice_multi_genbank.py -i infile.gbk -r start:end:contig -o range_sliced_output.gbk''', formatter_class=RawTextHelpFormatter)
parser.add_argument('-i', '--infile', action='store',
help='Input genbank [.gbk, .gbff] file to slice or subsection. Can be a multi-genbank file with multiple contigs')
parser.add_argument('-g', '--genes', action='store',
help='''The two gene names to slice between. Example: -g hemF:maeB
A single gene can be extracted using one gene name. Example: -g hemF:''')
parser.add_argument('-n', '--num_genes', help="Number of genes to slice before and after the selected genes. Can be used with -g only. Optional. Example: -n 2", type=int, default=0)
parser.add_argument('-r', '--range', action='store',
help='The two base pair coordinates (range) to slice between. Contig name or locus must be provided. Example: 612899:630276:contig_1')
parser.add_argument("-o", "--outfile", required=True, help="Output GenBank filename")
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
exit(1)
except:
sys.stderr.write("An exception occurred with argument parsing. Check your provided options.")
return parser.parse_args()
def slice_gene(record, genes, num_genes):
loci = [feat for feat in record.features if feat.type == "CDS"]
try:
gene_start = min([int(l.location.start) for l in loci if 'gene' in l.qualifiers and l.qualifiers['gene'][0] in genes])
gene_end = max([int(l.location.end) for l in loci if 'gene' in l.qualifiers and l.qualifiers['gene'][0] in genes])
# Calculate slicing range using index from loci
start_locus = loci[min([idx for idx, feat in enumerate(loci) if 'gene' in feat.qualifiers and feat.qualifiers['gene'][0] in genes]) - num_genes]
end_locus = loci[max([idx for idx, feat in enumerate(loci) if 'gene' in feat.qualifiers and feat.qualifiers['gene'][0] in genes]) + num_genes]
start = max(0, int(start_locus.location.start))
end = min(len(record), int(end_locus.location.end))
subrecord = record[start:end]
print(f"Extracted {genes[0]}:{genes[1]} with {num_genes} genes up/down from {record.id}")
return subrecord
except ValueError:
return None
def slice_range(record, start, end):
subrecord = record[int(start):int(end)]
print(f"Extracted {start}:{end} from {record.id}")
return subrecord
def main():
args = get_args()
# Check that both -g and -r options are not used simultaneously
if args.genes and args.range:
sys.stderr.write("Error: Both -g and -r options cannot be used simultaneously.\n")
exit(1)
modified_records = []
records = SeqIO.parse(args.infile, 'genbank')
for record in records:
sys.stderr.write('Searching {}.\n'.format(record.id))
if args.genes:
subrecord = slice_gene(record, args.genes.split(':'), args.num_genes)
if subrecord:
modified_records.append(subrecord)
elif args.range:
range_parts = args.range.split(':')
if len(range_parts) != 3:
sys.stderr.write("Error: When using -r option, provide start:end:contig.\n")
exit(1)
start, end, locus = range_parts[0], range_parts[1], range_parts[2]
if record.name == locus:
subrecord = slice_range(record, start, end)
if subrecord:
modified_records.append(subrecord)
# Write modified records to the specified output GenBank file
with open(args.outfile, "w") as output_handle:
SeqIO.write(modified_records, output_handle, "genbank")
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment