dalloliogm/get_genotypes.py

## get_genotypes.py
#!/usr/bin/env python
"""
Given a file containing a table of genes and their coordinates (see "Inputs"), get their genotypes from 1000genomes.

This in reality is a wrapper to tabix. You need to have tabix installed in the .bin folder, and you also need to have a ./data/tabix_indexes folder on your file system.

Usage
=========

   $: python get_genotypes.py -l data/n-glycan.coords


Inputs
==========

the coordinates file must contain the coordinates of each gene in the analysis, in the following format:

::

    DOLK    N-glycan        substrates      chr9    131707808       131710012
    RPN1    N-glycan        ost_complex     chr3    128338812       128399918
    ALG8    N-glycan        precursor_biosynthesis  chr11   77811987        77850699
    ALG9    N-glycan        precursor_biosynthesis  chr11   111652918       111750181
    UAP1    N-glycan        substrates      chr1    162531295       162569633
    ST6GAL1 N-glycan        branching2      chr3    186648314       186796341
    ALG2    N-glycan        precursor_biosynthesis  chr9    101978706       101984246
    ALG3    N-glycan        precursor_biosynthesis  chr3    183960116       183967313
    ALG1    N-glycan        precursor_biosynthesis  chr16   5083702 5137380
    ALG6    N-glycan        precursor_biosynthesis  chr1    63833260        63904233
    GANAB   N-glycan        cnx_crt chr11   62392297        62414104

Note that SNPs in the 1000genomes project are encoded to hg19, so the coordinates must be all refSeq hg19. You can use the script get_gene_coords.py to retrieve the coordinates automatically.


"""
import optparse
import subprocess
import os


def get_options():
    parser = optparse.OptionParser(usage="python get_genotypes.py -l genes_coords.txt")
    parser.add_option('-l', '-g', '-f', '--list', '--list_of_genes', '--genes', dest='genes',
            help='file containing the list of gene. One symbol per line', default=False)
    parser.add_option('-i', '--index', '--force-generate-index', dest='force_index', action='store_true',
            help='Force generation of tabix index file', default=False)
    parser.add_option('-d', '--download', '--force-download', dest='force_download', action='store_true',
            help='Force download of vcf files', default=False)
    parser.add_option('-u', '--debug', dest='debug', action='store_true', default=True)
    (options, args) = parser.parse_args()

    if options.debug is True:
        logging.basicConfig(level=logging.DEBUG, filename='get_genotypes.log')

    if (options.genes is False) and (len(args) == 0):
        parser.print_help()
        parser.error('get_genotypes.py: genes file not defined.')
#    print args
#    print options.genes
    try:
        genes_path = ''
        if options.genes is not False:
            genes_path = options.genes
        elif args != '':
            genes_path = args[0]
#        print genes_path
        if genes_path != '': #TODO: I don't remember why I put this instruction here.
            genes_h = open(genes_path, 'r')
    except:
        print __doc__
        parser.error("Can not open genes file")

    genes = read_genes_file(genes_path)
    logging.debug(genes[:2])

    return (genes, options)

def read_genes_file(genes_path):
    """
    DOLK    N-glycan        substrates      chr9    131707808       131710012
    RPN1    N-glycan        ost_complex     chr3    128338812       128399918
    ALG8    N-glycan        precursor_biosynthesis  chr11   77811987        77850699
    ALG9    N-glycan        precursor_biosynthesis  chr11   111652918       111750181
    UAP1    N-glycan        substrates      chr1    162531295       162569633
    ST6GAL1 N-glycan        branching2      chr3    186648314       186796341
    ALG2    N-glycan        precursor_biosynthesis  chr9    101978706       101984246
    ALG3    N-glycan        precursor_biosynthesis  chr3    183960116       183967313
    ALG1    N-glycan        precursor_biosynthesis  chr16   5083702 5137380
    ALG6    N-glycan        precursor_biosynthesis  chr1    63833260        63904233
    GANAB   N-glycan        cnx_crt chr11   62392297        62414104

    """
    genes = [line.split() for line in open(genes_path, 'r')]
    return genes


def get_genotypes(genes, force_index=False, force_download=False, get_individuals=False):
    """
    Call tabix on 1000genomes server to get coordinates.

    Note: the URL to 1000genomes may change over time. If something doesn't work, check it.
    """
#    tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz"
    tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr{0}.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"
    individuals_file = "phase1_integrated_calls.20101123.ALL.panel"
#    ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/phase1_integrated_calls.20101123.ALL.panel
    individuals_url = tabix_1000genomes_url.rsplit('/', 1)[0] + '/' + individuals_file

    # Get individuals' info
    if get_individuals is True:
        print "\nDownloading individual infos\n"
        print("wget {}".format(individuals_url))
        subprocess.call("wget {}".format(individuals_url).split())
        print("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file))
        subprocess.call("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file).split())

#    report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\ttot_snps\tgene_length\tsnp_per_kb\n'
    report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\tgene_length\n'
    for fields in genes:
        (gene,pathway,subpathway,chromosome,start,end) = fields[:6]

        chromosome = chromosome.replace('chr', '')
        command = "./bin/tabix -h {0} {1}:{2}-{3}".format(tabix_1000genomes_url.format(chromosome), chromosome, start, end)
        print "\nDownloading genotypes for gene {}, {}:{}-{}".format(gene, chromosome, start,end)
        outputfilepath = './data/vcf/' + gene + '.vcf'
        gzoutputfilepath = './data/vcf/' + gene + '.vcf.gz'
        if (os.path.isfile(gzoutputfilepath)) and (os.stat(gzoutputfilepath).st_size > 0) and (force_download is not True):
            print "- skipping download, as genotypes for gene {} are already available in data/vcf/{}.vcf. Use --force-download option to download again.".format(gene, gene)
        else:
            print "- executing command: " + command
            outputfile = open(outputfilepath, 'w')
            subprocess.call(command.split(), cwd='./data/tabix_indexes', stdout=outputfile)
            subprocess.call(["gzip", "-f", outputfile.name])

        # Calculate how many SNPs correspond to the gene, and update Table S1
        gene_length = int(end) - int(start)
#        number_snps = len(open(outputfilepath,'r').readlines()) - 30
#        if number_snps <= 0:
#            number_snps = 'NA'
#            density = 'NA'
#        else:
#            density = 1000. * number_snps / gene_length

#        report_table_content += "{}\t{}\t{}\t{}\n".format('\t'.join(fields), number_snps, gene_length, density)
        report_table_content += "{}\t{}\n".format('\t'.join(fields), gene_length)


    return report_table_content

def main():
    (genes, options) = get_options()
    report_table_content = get_genotypes(genes, options.force_index, options.force_download)
    print report_table_content
    pathway = options.genes.split('/')[-1].rsplit('.')[0]
    report_outputfilepath = "report_{}_number_snps.txt".format(pathway)
    report_outputfile = open(report_outputfilepath, 'w')
    report_outputfile.write(report_table_content)


if __name__ == '__main__':
    import logging
    import doctest
    logging.basicConfig(level=logging.DEBUG)
    doctest.testmod()
    main()
	#!/usr/bin/env python
	"""
	Given a file containing a table of genes and their coordinates (see "Inputs"), get their genotypes from 1000genomes.

	This in reality is a wrapper to tabix. You need to have tabix installed in the .bin folder, and you also need to have a ./data/tabix_indexes folder on your file system.

	Usage
	=========

	$: python get_genotypes.py -l data/n-glycan.coords


	Inputs
	==========

	the coordinates file must contain the coordinates of each gene in the analysis, in the following format:

	::

	DOLK N-glycan substrates chr9 131707808 131710012
	RPN1 N-glycan ost_complex chr3 128338812 128399918
	ALG8 N-glycan precursor_biosynthesis chr11 77811987 77850699
	ALG9 N-glycan precursor_biosynthesis chr11 111652918 111750181
	UAP1 N-glycan substrates chr1 162531295 162569633
	ST6GAL1 N-glycan branching2 chr3 186648314 186796341
	ALG2 N-glycan precursor_biosynthesis chr9 101978706 101984246
	ALG3 N-glycan precursor_biosynthesis chr3 183960116 183967313
	ALG1 N-glycan precursor_biosynthesis chr16 5083702 5137380
	ALG6 N-glycan precursor_biosynthesis chr1 63833260 63904233
	GANAB N-glycan cnx_crt chr11 62392297 62414104

	Note that SNPs in the 1000genomes project are encoded to hg19, so the coordinates must be all refSeq hg19. You can use the script get_gene_coords.py to retrieve the coordinates automatically.


	"""
	import optparse
	import subprocess
	import os


	def get_options():
	parser = optparse.OptionParser(usage="python get_genotypes.py -l genes_coords.txt")
	parser.add_option('-l', '-g', '-f', '--list', '--list_of_genes', '--genes', dest='genes',
	help='file containing the list of gene. One symbol per line', default=False)
	parser.add_option('-i', '--index', '--force-generate-index', dest='force_index', action='store_true',
	help='Force generation of tabix index file', default=False)
	parser.add_option('-d', '--download', '--force-download', dest='force_download', action='store_true',
	help='Force download of vcf files', default=False)
	parser.add_option('-u', '--debug', dest='debug', action='store_true', default=True)
	(options, args) = parser.parse_args()

	if options.debug is True:
	logging.basicConfig(level=logging.DEBUG, filename='get_genotypes.log')

	if (options.genes is False) and (len(args) == 0):
	parser.print_help()
	parser.error('get_genotypes.py: genes file not defined.')
	# print args
	# print options.genes
	try:
	genes_path = ''
	if options.genes is not False:
	genes_path = options.genes
	elif args != '':
	genes_path = args[0]
	# print genes_path
	if genes_path != '': #TODO: I don't remember why I put this instruction here.
	genes_h = open(genes_path, 'r')
	except:
	print __doc__
	parser.error("Can not open genes file")

	genes = read_genes_file(genes_path)
	logging.debug(genes[:2])

	return (genes, options)

	def read_genes_file(genes_path):
	"""
	DOLK N-glycan substrates chr9 131707808 131710012
	RPN1 N-glycan ost_complex chr3 128338812 128399918
	ALG8 N-glycan precursor_biosynthesis chr11 77811987 77850699
	ALG9 N-glycan precursor_biosynthesis chr11 111652918 111750181
	UAP1 N-glycan substrates chr1 162531295 162569633
	ST6GAL1 N-glycan branching2 chr3 186648314 186796341
	ALG2 N-glycan precursor_biosynthesis chr9 101978706 101984246
	ALG3 N-glycan precursor_biosynthesis chr3 183960116 183967313
	ALG1 N-glycan precursor_biosynthesis chr16 5083702 5137380
	ALG6 N-glycan precursor_biosynthesis chr1 63833260 63904233
	GANAB N-glycan cnx_crt chr11 62392297 62414104

	"""
	genes = [line.split() for line in open(genes_path, 'r')]
	return genes


	def get_genotypes(genes, force_index=False, force_download=False, get_individuals=False):
	"""
	Call tabix on 1000genomes server to get coordinates.

	Note: the URL to 1000genomes may change over time. If something doesn't work, check it.
	"""
	# tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz"
	tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr{0}.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"
	individuals_file = "phase1_integrated_calls.20101123.ALL.panel"
	# ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/phase1_integrated_calls.20101123.ALL.panel
	individuals_url = tabix_1000genomes_url.rsplit('/', 1)[0] + '/' + individuals_file

	# Get individuals' info
	if get_individuals is True:
	print "\nDownloading individual infos\n"
	print("wget {}".format(individuals_url))
	subprocess.call("wget {}".format(individuals_url).split())
	print("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file))
	subprocess.call("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file).split())

	# report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\ttot_snps\tgene_length\tsnp_per_kb\n'
	report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\tgene_length\n'
	for fields in genes:
	(gene,pathway,subpathway,chromosome,start,end) = fields[:6]

	chromosome = chromosome.replace('chr', '')
	command = "./bin/tabix -h {0} {1}:{2}-{3}".format(tabix_1000genomes_url.format(chromosome), chromosome, start, end)
	print "\nDownloading genotypes for gene {}, {}:{}-{}".format(gene, chromosome, start,end)
	outputfilepath = './data/vcf/' + gene + '.vcf'
	gzoutputfilepath = './data/vcf/' + gene + '.vcf.gz'
	if (os.path.isfile(gzoutputfilepath)) and (os.stat(gzoutputfilepath).st_size > 0) and (force_download is not True):
	print "- skipping download, as genotypes for gene {} are already available in data/vcf/{}.vcf. Use --force-download option to download again.".format(gene, gene)
	else:
	print "- executing command: " + command
	outputfile = open(outputfilepath, 'w')
	subprocess.call(command.split(), cwd='./data/tabix_indexes', stdout=outputfile)
	subprocess.call(["gzip", "-f", outputfile.name])

	# Calculate how many SNPs correspond to the gene, and update Table S1
	gene_length = int(end) - int(start)
	# number_snps = len(open(outputfilepath,'r').readlines()) - 30
	# if number_snps <= 0:
	# number_snps = 'NA'
	# density = 'NA'
	# else:
	# density = 1000. * number_snps / gene_length

	# report_table_content += "{}\t{}\t{}\t{}\n".format('\t'.join(fields), number_snps, gene_length, density)
	report_table_content += "{}\t{}\n".format('\t'.join(fields), gene_length)


	return report_table_content

	def main():
	(genes, options) = get_options()
	report_table_content = get_genotypes(genes, options.force_index, options.force_download)
	print report_table_content
	pathway = options.genes.split('/')[-1].rsplit('.')[0]
	report_outputfilepath = "report_{}_number_snps.txt".format(pathway)
	report_outputfile = open(report_outputfilepath, 'w')
	report_outputfile.write(report_table_content)


	if __name__ == '__main__':
	import logging
	import doctest
	logging.basicConfig(level=logging.DEBUG)
	doctest.testmod()
	main()