Skip to content

Instantly share code, notes, and snippets.

@jenzopr
Last active February 22, 2018 13:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jenzopr/f829702bfa045f410ae07d3ce5e140e8 to your computer and use it in GitHub Desktop.
Save jenzopr/f829702bfa045f410ae07d3ce5e140e8 to your computer and use it in GitHub Desktop.
A command line tool to join genomes and annotations from two or more species.
#!/usr/bin/env python
__author__ = 'Jens Preussner'
__license__ = 'MIT'
import argparse
import logging
import sys
if __name__ == "__main__":
#
# Set up arguments and logging
#
parser = argparse.ArgumentParser(
description='A command line tool to join genomes and annotations from two or more species.')
parser.add_argument('--fasta', '-f', required=True, nargs = 2, metavar = ('prefix', 'file.fasta'), action = 'append', help = 'Fasta files to join. The prefix will be used to match between fasta and gtf. Can be given multiple times.')
parser.add_argument('--gtf', '-a', required=True, nargs = 2, metavar = ('prefix', 'file.gtf'), action = 'append', help = 'GTF annotation files to join. The prefix will be used to match between fasta and gtf. Can be given multiple times.')
parser.add_argument('--merged-fasta', '-o', default = 'merged.fasta', metavar='merged.fasta', type = argparse.FileType('w'), help = 'Filename of the merged fasta file.')
parser.add_argument('--merged-gtf', '-p', default='merged.gtf', metavar='merged.gtf', type = argparse.FileType('w'), help = 'Filename of the merged GTF annotation.')
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
logger = logging.getLogger(__name__)
formatter = logging.Formatter('%(levelname)s - %(message)s')
if args.debug:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.WARNING)
#
# Validate inputs
#
fasta = dict(zip([f[0] for f in args.fasta], [f[1] for f in args.fasta]))
anno = dict(zip([f[0] for f in args.gtf], [f[1] for f in args.gtf]))
common_keys = set(fasta.keys()).intersection(anno.keys())
if len(common_keys) < 2:
union_keys = list(set().union(fasta.keys(), anno.keys()))
logger.error('Error: Fasta files and GTF annotation were provided for less than two common species prefixes: ')
for k in union_keys:
if k in fasta.keys() and not k in anno.keys():
logger.error('- GTF annotation for prefix {} missing.'.format(k))
if k not in fasta.keys() and k in anno.keys():
logger.error('- Fasta file for prefix {} missing.'.format(k))
sys.exit()
#
# Begin adding species prefix to fasta files
#
bufsize = 65536
logger.debug('Setting readlines buffersize to {}.'.format(bufsize))
for k in common_keys:
with open(fasta[k]) as infile:
logger.debug('Opened file {} for reading.'.format(fasta[k]))
while True:
lines = infile.readlines(bufsize)
if not lines:
break
for line in lines:
if line.startswith('>'):
args.merged_fasta.write('>' + k + '_' + line[1:])
else:
args.merged_fasta.write(line)
with open(anno[k]) as infile:
logger.debug('Opened file {} for reading.'.format(fasta[k]))
while True:
lines = infile.readlines(bufsize)
if not lines:
break
for line in lines:
# Skip comment lines in GTF files
if not line.startswith('#'):
args.merged_gtf.write(k + '_' + line)
@jenzopr
Copy link
Author

jenzopr commented Feb 22, 2018

Use it like this:

# Directory listing of input files
$ ls
gencode.v27.annotation.gtf gencode.vM16.annotation.gtf hg38.fa mm10.fa 
$ python path/to/joinGenomes.py --fasta hg38 hg38.fa --fasta mm10 mm10.fa --gtf hg38 gencode.v27.annotation.gtf --gtf mm10 gencode.vM16.annotation.gtf

# Writes merged.fasta and merged.gtf in less than 5 minutes on my PC
$ ls
gencode.v27.annotation.gtf gencode.vM16.annotation.gtf hg38.fa merged.fasta merged.gtf mm10.fa
$ 
$ head merged.gtf
mm10_chr1	HAVANA	gene	3073253	3074322	.	+	.	gene_id "ENSMUSG00000102693"; gene_type "TEC"; gene_name "4933401J01Rik"; level 2; havana_gene "OTTMUSG00000049935.1";
mm10_chr1	HAVANA	transcript	3073253	3074322	.	+	.	gene_id "ENSMUSG00000102693"; transcript_id "ENSMUST00000193812.1"; gene_type "TEC"; gene_name "4933401J01Rik"; transcript_type "TEC"; transcript_name "4933401J01Rik-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049935.1"; havana_transcript "OTTMUST00000127109.1";
$ head -n1 merged.fasta
>mm10_chr1

@jenzopr
Copy link
Author

jenzopr commented Feb 22, 2018

A help message is also provided:

$ python path/to/joinGenome.py -h
usage: joinGenomes.py [-h] --fasta prefix file.fasta --gtf prefix file.gtf
                      [--merged-fasta merged.fasta] [--merged-gtf merged.gtf]
                      [--debug]

A command line tool to join genomes and annotations from two or more species.

optional arguments:
  -h, --help            show this help message and exit
  --fasta prefix file.fasta, -f prefix file.fasta
                        Fasta files to join. The prefix will be used to match
                        between fasta and gtf. Can be given multiple times.
  --gtf prefix file.gtf, -a prefix file.gtf
                        GTF annotation files to join. The prefix will be used
                        to match between fasta and gtf. Can be given multiple
                        times.
  --merged-fasta merged.fasta, -o merged.fasta
                        Filename of the merged fasta file.
  --merged-gtf merged.gtf, -p merged.gtf
                        Filename of the merged GTF annotation.
  --debug

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment