Created
June 18, 2012 16:05
-
-
Save dalloliogm/2949123 to your computer and use it in GitHub Desktop.
get genotypes from 1000genomes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Given a file containing a table of genes and their coordinates (see "Inputs"), get their genotypes from 1000genomes. | |
This in reality is a wrapper to tabix. You need to have tabix installed in the .bin folder, and you also need to have a ./data/tabix_indexes folder on your file system. | |
Usage | |
========= | |
$: python get_genotypes.py -l data/n-glycan.coords | |
Inputs | |
========== | |
the coordinates file must contain the coordinates of each gene in the analysis, in the following format: | |
:: | |
DOLK N-glycan substrates chr9 131707808 131710012 | |
RPN1 N-glycan ost_complex chr3 128338812 128399918 | |
ALG8 N-glycan precursor_biosynthesis chr11 77811987 77850699 | |
ALG9 N-glycan precursor_biosynthesis chr11 111652918 111750181 | |
UAP1 N-glycan substrates chr1 162531295 162569633 | |
ST6GAL1 N-glycan branching2 chr3 186648314 186796341 | |
ALG2 N-glycan precursor_biosynthesis chr9 101978706 101984246 | |
ALG3 N-glycan precursor_biosynthesis chr3 183960116 183967313 | |
ALG1 N-glycan precursor_biosynthesis chr16 5083702 5137380 | |
ALG6 N-glycan precursor_biosynthesis chr1 63833260 63904233 | |
GANAB N-glycan cnx_crt chr11 62392297 62414104 | |
Note that SNPs in the 1000genomes project are encoded to hg19, so the coordinates must be all refSeq hg19. You can use the script get_gene_coords.py to retrieve the coordinates automatically. | |
""" | |
import optparse | |
import subprocess | |
import os | |
def get_options(): | |
parser = optparse.OptionParser(usage="python get_genotypes.py -l genes_coords.txt") | |
parser.add_option('-l', '-g', '-f', '--list', '--list_of_genes', '--genes', dest='genes', | |
help='file containing the list of gene. One symbol per line', default=False) | |
parser.add_option('-i', '--index', '--force-generate-index', dest='force_index', action='store_true', | |
help='Force generation of tabix index file', default=False) | |
parser.add_option('-d', '--download', '--force-download', dest='force_download', action='store_true', | |
help='Force download of vcf files', default=False) | |
parser.add_option('-u', '--debug', dest='debug', action='store_true', default=True) | |
(options, args) = parser.parse_args() | |
if options.debug is True: | |
logging.basicConfig(level=logging.DEBUG, filename='get_genotypes.log') | |
if (options.genes is False) and (len(args) == 0): | |
parser.print_help() | |
parser.error('get_genotypes.py: genes file not defined.') | |
# print args | |
# print options.genes | |
try: | |
genes_path = '' | |
if options.genes is not False: | |
genes_path = options.genes | |
elif args != '': | |
genes_path = args[0] | |
# print genes_path | |
if genes_path != '': #TODO: I don't remember why I put this instruction here. | |
genes_h = open(genes_path, 'r') | |
except: | |
print __doc__ | |
parser.error("Can not open genes file") | |
genes = read_genes_file(genes_path) | |
logging.debug(genes[:2]) | |
return (genes, options) | |
def read_genes_file(genes_path): | |
""" | |
DOLK N-glycan substrates chr9 131707808 131710012 | |
RPN1 N-glycan ost_complex chr3 128338812 128399918 | |
ALG8 N-glycan precursor_biosynthesis chr11 77811987 77850699 | |
ALG9 N-glycan precursor_biosynthesis chr11 111652918 111750181 | |
UAP1 N-glycan substrates chr1 162531295 162569633 | |
ST6GAL1 N-glycan branching2 chr3 186648314 186796341 | |
ALG2 N-glycan precursor_biosynthesis chr9 101978706 101984246 | |
ALG3 N-glycan precursor_biosynthesis chr3 183960116 183967313 | |
ALG1 N-glycan precursor_biosynthesis chr16 5083702 5137380 | |
ALG6 N-glycan precursor_biosynthesis chr1 63833260 63904233 | |
GANAB N-glycan cnx_crt chr11 62392297 62414104 | |
""" | |
genes = [line.split() for line in open(genes_path, 'r')] | |
return genes | |
def get_genotypes(genes, force_index=False, force_download=False, get_individuals=False): | |
""" | |
Call tabix on 1000genomes server to get coordinates. | |
Note: the URL to 1000genomes may change over time. If something doesn't work, check it. | |
""" | |
# tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz" | |
tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr{0}.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz" | |
individuals_file = "phase1_integrated_calls.20101123.ALL.panel" | |
# ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/phase1_integrated_calls.20101123.ALL.panel | |
individuals_url = tabix_1000genomes_url.rsplit('/', 1)[0] + '/' + individuals_file | |
# Get individuals' info | |
if get_individuals is True: | |
print "\nDownloading individual infos\n" | |
print("wget {}".format(individuals_url)) | |
subprocess.call("wget {}".format(individuals_url).split()) | |
print("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file)) | |
subprocess.call("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file).split()) | |
# report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\ttot_snps\tgene_length\tsnp_per_kb\n' | |
report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\tgene_length\n' | |
for fields in genes: | |
(gene,pathway,subpathway,chromosome,start,end) = fields[:6] | |
chromosome = chromosome.replace('chr', '') | |
command = "./bin/tabix -h {0} {1}:{2}-{3}".format(tabix_1000genomes_url.format(chromosome), chromosome, start, end) | |
print "\nDownloading genotypes for gene {}, {}:{}-{}".format(gene, chromosome, start,end) | |
outputfilepath = './data/vcf/' + gene + '.vcf' | |
gzoutputfilepath = './data/vcf/' + gene + '.vcf.gz' | |
if (os.path.isfile(gzoutputfilepath)) and (os.stat(gzoutputfilepath).st_size > 0) and (force_download is not True): | |
print "- skipping download, as genotypes for gene {} are already available in data/vcf/{}.vcf. Use --force-download option to download again.".format(gene, gene) | |
else: | |
print "- executing command: " + command | |
outputfile = open(outputfilepath, 'w') | |
subprocess.call(command.split(), cwd='./data/tabix_indexes', stdout=outputfile) | |
subprocess.call(["gzip", "-f", outputfile.name]) | |
# Calculate how many SNPs correspond to the gene, and update Table S1 | |
gene_length = int(end) - int(start) | |
# number_snps = len(open(outputfilepath,'r').readlines()) - 30 | |
# if number_snps <= 0: | |
# number_snps = 'NA' | |
# density = 'NA' | |
# else: | |
# density = 1000. * number_snps / gene_length | |
# report_table_content += "{}\t{}\t{}\t{}\n".format('\t'.join(fields), number_snps, gene_length, density) | |
report_table_content += "{}\t{}\n".format('\t'.join(fields), gene_length) | |
return report_table_content | |
def main(): | |
(genes, options) = get_options() | |
report_table_content = get_genotypes(genes, options.force_index, options.force_download) | |
print report_table_content | |
pathway = options.genes.split('/')[-1].rsplit('.')[0] | |
report_outputfilepath = "report_{}_number_snps.txt".format(pathway) | |
report_outputfile = open(report_outputfilepath, 'w') | |
report_outputfile.write(report_table_content) | |
if __name__ == '__main__': | |
import logging | |
import doctest | |
logging.basicConfig(level=logging.DEBUG) | |
doctest.testmod() | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment