Skip to content

Instantly share code, notes, and snippets.

@dalloliogm
Created June 18, 2012 16:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save dalloliogm/2949123 to your computer and use it in GitHub Desktop.
Save dalloliogm/2949123 to your computer and use it in GitHub Desktop.
get genotypes from 1000genomes
#!/usr/bin/env python
"""
Given a file containing a table of genes and their coordinates (see "Inputs"), get their genotypes from 1000genomes.
This in reality is a wrapper to tabix. You need to have tabix installed in the .bin folder, and you also need to have a ./data/tabix_indexes folder on your file system.
Usage
=========
$: python get_genotypes.py -l data/n-glycan.coords
Inputs
==========
the coordinates file must contain the coordinates of each gene in the analysis, in the following format:
::
DOLK N-glycan substrates chr9 131707808 131710012
RPN1 N-glycan ost_complex chr3 128338812 128399918
ALG8 N-glycan precursor_biosynthesis chr11 77811987 77850699
ALG9 N-glycan precursor_biosynthesis chr11 111652918 111750181
UAP1 N-glycan substrates chr1 162531295 162569633
ST6GAL1 N-glycan branching2 chr3 186648314 186796341
ALG2 N-glycan precursor_biosynthesis chr9 101978706 101984246
ALG3 N-glycan precursor_biosynthesis chr3 183960116 183967313
ALG1 N-glycan precursor_biosynthesis chr16 5083702 5137380
ALG6 N-glycan precursor_biosynthesis chr1 63833260 63904233
GANAB N-glycan cnx_crt chr11 62392297 62414104
Note that SNPs in the 1000genomes project are encoded to hg19, so the coordinates must be all refSeq hg19. You can use the script get_gene_coords.py to retrieve the coordinates automatically.
"""
import optparse
import subprocess
import os
def get_options():
parser = optparse.OptionParser(usage="python get_genotypes.py -l genes_coords.txt")
parser.add_option('-l', '-g', '-f', '--list', '--list_of_genes', '--genes', dest='genes',
help='file containing the list of gene. One symbol per line', default=False)
parser.add_option('-i', '--index', '--force-generate-index', dest='force_index', action='store_true',
help='Force generation of tabix index file', default=False)
parser.add_option('-d', '--download', '--force-download', dest='force_download', action='store_true',
help='Force download of vcf files', default=False)
parser.add_option('-u', '--debug', dest='debug', action='store_true', default=True)
(options, args) = parser.parse_args()
if options.debug is True:
logging.basicConfig(level=logging.DEBUG, filename='get_genotypes.log')
if (options.genes is False) and (len(args) == 0):
parser.print_help()
parser.error('get_genotypes.py: genes file not defined.')
# print args
# print options.genes
try:
genes_path = ''
if options.genes is not False:
genes_path = options.genes
elif args != '':
genes_path = args[0]
# print genes_path
if genes_path != '': #TODO: I don't remember why I put this instruction here.
genes_h = open(genes_path, 'r')
except:
print __doc__
parser.error("Can not open genes file")
genes = read_genes_file(genes_path)
logging.debug(genes[:2])
return (genes, options)
def read_genes_file(genes_path):
"""
DOLK N-glycan substrates chr9 131707808 131710012
RPN1 N-glycan ost_complex chr3 128338812 128399918
ALG8 N-glycan precursor_biosynthesis chr11 77811987 77850699
ALG9 N-glycan precursor_biosynthesis chr11 111652918 111750181
UAP1 N-glycan substrates chr1 162531295 162569633
ST6GAL1 N-glycan branching2 chr3 186648314 186796341
ALG2 N-glycan precursor_biosynthesis chr9 101978706 101984246
ALG3 N-glycan precursor_biosynthesis chr3 183960116 183967313
ALG1 N-glycan precursor_biosynthesis chr16 5083702 5137380
ALG6 N-glycan precursor_biosynthesis chr1 63833260 63904233
GANAB N-glycan cnx_crt chr11 62392297 62414104
"""
genes = [line.split() for line in open(genes_path, 'r')]
return genes
def get_genotypes(genes, force_index=False, force_download=False, get_individuals=False):
"""
Call tabix on 1000genomes server to get coordinates.
Note: the URL to 1000genomes may change over time. If something doesn't work, check it.
"""
# tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz"
tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr{0}.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"
individuals_file = "phase1_integrated_calls.20101123.ALL.panel"
# ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/phase1_integrated_calls.20101123.ALL.panel
individuals_url = tabix_1000genomes_url.rsplit('/', 1)[0] + '/' + individuals_file
# Get individuals' info
if get_individuals is True:
print "\nDownloading individual infos\n"
print("wget {}".format(individuals_url))
subprocess.call("wget {}".format(individuals_url).split())
print("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file))
subprocess.call("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file).split())
# report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\ttot_snps\tgene_length\tsnp_per_kb\n'
report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\tgene_length\n'
for fields in genes:
(gene,pathway,subpathway,chromosome,start,end) = fields[:6]
chromosome = chromosome.replace('chr', '')
command = "./bin/tabix -h {0} {1}:{2}-{3}".format(tabix_1000genomes_url.format(chromosome), chromosome, start, end)
print "\nDownloading genotypes for gene {}, {}:{}-{}".format(gene, chromosome, start,end)
outputfilepath = './data/vcf/' + gene + '.vcf'
gzoutputfilepath = './data/vcf/' + gene + '.vcf.gz'
if (os.path.isfile(gzoutputfilepath)) and (os.stat(gzoutputfilepath).st_size > 0) and (force_download is not True):
print "- skipping download, as genotypes for gene {} are already available in data/vcf/{}.vcf. Use --force-download option to download again.".format(gene, gene)
else:
print "- executing command: " + command
outputfile = open(outputfilepath, 'w')
subprocess.call(command.split(), cwd='./data/tabix_indexes', stdout=outputfile)
subprocess.call(["gzip", "-f", outputfile.name])
# Calculate how many SNPs correspond to the gene, and update Table S1
gene_length = int(end) - int(start)
# number_snps = len(open(outputfilepath,'r').readlines()) - 30
# if number_snps <= 0:
# number_snps = 'NA'
# density = 'NA'
# else:
# density = 1000. * number_snps / gene_length
# report_table_content += "{}\t{}\t{}\t{}\n".format('\t'.join(fields), number_snps, gene_length, density)
report_table_content += "{}\t{}\n".format('\t'.join(fields), gene_length)
return report_table_content
def main():
(genes, options) = get_options()
report_table_content = get_genotypes(genes, options.force_index, options.force_download)
print report_table_content
pathway = options.genes.split('/')[-1].rsplit('.')[0]
report_outputfilepath = "report_{}_number_snps.txt".format(pathway)
report_outputfile = open(report_outputfilepath, 'w')
report_outputfile.write(report_table_content)
if __name__ == '__main__':
import logging
import doctest
logging.basicConfig(level=logging.DEBUG)
doctest.testmod()
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment