-
-
Save bananabenana/20ff257f237d5a6e6f449fd7066577a1 to your computer and use it in GitHub Desktop.
Code to 'slice' or subset a multi genbank file using gene name or coordinates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Slices/subsections genbank files by either gene names or base pair range. This handles multi-genbank files which have multiple contigs | |
# Usage: | |
# slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -o gene_sliced_output.gbk # This will extract the region between two genes including these genes | |
# slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -n 2 -o gene_sliced_output.gbk # This will extract 2 genes up- and down-stream of your specified two genes | |
# slice_multi_genbank.py -i infile.gbk -g gene1: -o gene_extract_output.gbk # This will extract a single gene | |
# slice_multi_genbank.py -i infile.gbk -g gene1: -n 3 -o gene_extract_output.gbk # This will extract a single gene and 3 genes up- and down-stream of the gene | |
# slice_multi_genbank.py -i infile.gbk -r start:end:contig -o range_sliced_output.gbk # This will slice via base pair range and locus/contig. This is required to handle multi-genbank files | |
# Inputs: | |
# 1) Genbank file [.gbk, .gbff] | |
# 2) One of: | |
# - Two gene names | |
# - Two base pair range and locus | |
# Requirements: | |
# - biopython (any) | |
# Author: Ben Vezina | |
# - Scholar: https://scholar.google.com/citations?user=Rf9oh94AAAAJ&hl=en&oi=ao | |
# - ORCID: https://orcid.org/0000-0003-4224-2537 | |
# Citation: https://gist.github.com/bananabenana/20ff257f237d5a6e6f449fd7066577a1 | |
# Adapted from https://gist.github.com/jrjhealey/2df3c65c7a70cbca4862e94620e4d7b2 | |
from Bio import SeqIO | |
import sys, argparse | |
from argparse import RawTextHelpFormatter | |
def get_args(): | |
"""Parse command line arguments""" | |
try: | |
parser = argparse.ArgumentParser( | |
description=''' | |
description: | |
Subset genbanks between 1 gene, 2 genes or base pair ranges.\n | |
usage: | |
To slice/extract by gene names: | |
\tslice_multi_genbank.py -i infile.gbk -g gene1:gene2 -o gene_sliced_output.gbk | |
To slice/extract by gene name +- a number of genes up- and down-stream: | |
slice_multi_genbank.py -i infile.gbk -g gene1:gene2 -n [integer] -o gene_sliced_output.gbk | |
To extract a gene by its name: | |
\tslice_multi_genbank.py -i infile.gbk -g gene1: -o gene_extract_output.gbk | |
To extract a gene by its name +- a number of genes up- and down-stream: | |
\tslice_multi_genbank.py -i infile.gbk -g gene1: -n [integer] -o gene_extract_output.gbk | |
To slice by base pair range coordinates: | |
\tslice_multi_genbank.py -i infile.gbk -r start:end:contig -o range_sliced_output.gbk''', formatter_class=RawTextHelpFormatter) | |
parser.add_argument('-i', '--infile', action='store', | |
help='Input genbank [.gbk, .gbff] file to slice or subsection. Can be a multi-genbank file with multiple contigs') | |
parser.add_argument('-g', '--genes', action='store', | |
help='''The two gene names to slice between. Example: -g hemF:maeB | |
A single gene can be extracted using one gene name. Example: -g hemF:''') | |
parser.add_argument('-n', '--num_genes', help="Number of genes to slice before and after the selected genes. Can be used with -g only. Optional. Example: -n 2", type=int, default=0) | |
parser.add_argument('-r', '--range', action='store', | |
help='The two base pair coordinates (range) to slice between. Contig name or locus must be provided. Example: 612899:630276:contig_1') | |
parser.add_argument("-o", "--outfile", required=True, help="Output GenBank filename") | |
if len(sys.argv) == 1: | |
parser.print_help(sys.stderr) | |
exit(1) | |
except: | |
sys.stderr.write("An exception occurred with argument parsing. Check your provided options.") | |
return parser.parse_args() | |
def slice_gene(record, genes, num_genes): | |
loci = [feat for feat in record.features if feat.type == "CDS"] | |
try: | |
gene_start = min([int(l.location.start) for l in loci if 'gene' in l.qualifiers and l.qualifiers['gene'][0] in genes]) | |
gene_end = max([int(l.location.end) for l in loci if 'gene' in l.qualifiers and l.qualifiers['gene'][0] in genes]) | |
# Calculate slicing range using index from loci | |
start_locus = loci[min([idx for idx, feat in enumerate(loci) if 'gene' in feat.qualifiers and feat.qualifiers['gene'][0] in genes]) - num_genes] | |
end_locus = loci[max([idx for idx, feat in enumerate(loci) if 'gene' in feat.qualifiers and feat.qualifiers['gene'][0] in genes]) + num_genes] | |
start = max(0, int(start_locus.location.start)) | |
end = min(len(record), int(end_locus.location.end)) | |
subrecord = record[start:end] | |
print(f"Extracted {genes[0]}:{genes[1]} with {num_genes} genes up/down from {record.id}") | |
return subrecord | |
except ValueError: | |
return None | |
def slice_range(record, start, end): | |
subrecord = record[int(start):int(end)] | |
print(f"Extracted {start}:{end} from {record.id}") | |
return subrecord | |
def main(): | |
args = get_args() | |
# Check that both -g and -r options are not used simultaneously | |
if args.genes and args.range: | |
sys.stderr.write("Error: Both -g and -r options cannot be used simultaneously.\n") | |
exit(1) | |
modified_records = [] | |
records = SeqIO.parse(args.infile, 'genbank') | |
for record in records: | |
sys.stderr.write('Searching {}.\n'.format(record.id)) | |
if args.genes: | |
subrecord = slice_gene(record, args.genes.split(':'), args.num_genes) | |
if subrecord: | |
modified_records.append(subrecord) | |
elif args.range: | |
range_parts = args.range.split(':') | |
if len(range_parts) != 3: | |
sys.stderr.write("Error: When using -r option, provide start:end:contig.\n") | |
exit(1) | |
start, end, locus = range_parts[0], range_parts[1], range_parts[2] | |
if record.name == locus: | |
subrecord = slice_range(record, start, end) | |
if subrecord: | |
modified_records.append(subrecord) | |
# Write modified records to the specified output GenBank file | |
with open(args.outfile, "w") as output_handle: | |
SeqIO.write(modified_records, output_handle, "genbank") | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment