Created
July 11, 2012 09:35
-
-
Save dalloliogm/3089297 to your computer and use it in GitHub Desktop.
get_snpannotation_from_biomart.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#/!usr/bin/env Rscript | |
description = " | |
Given a list of SNPs, get their disease-association status from Biomart. | |
++++++++++++++++++ | |
Parameters | |
++++++++++++++++++ | |
-p snp_list | |
The snp_list file is a file containing the list of SNPs to analyze, one per line. | |
Example: | |
:: | |
rs10000 | |
rs20000 | |
rs13323 | |
rs31231 | |
See also the file data/snps/all_pathways.snps for an example of input file. | |
-o output_file | |
The output file is the name of the file where you want to save the results. | |
This will be a csv file containing one column for each value in defined in the 'atts' variable. | |
" | |
# NOTE: to install biomaRt: | |
# source("http://bioconductor.org/biocLite.R") | |
# biocLite("biomaRt") | |
require(biomaRt) | |
require(optparse) | |
option_list <- list( | |
make_option( | |
c("--snp_list", "-p"), | |
default="data/snps/all_pathways.snps", | |
dest="snp_list", type="character", | |
help="File the list of SNPs to analyze, one per line. [default %default]" | |
), | |
make_option( | |
c("--outputfile", "-o"), | |
default="results/tables/all_pathways_snps_annotations.csv", dest="outputfile", | |
help="Output file name [default %default]") | |
) | |
arg_values = parse_args(OptionParser(option_list = option_list, usage=description)) | |
# Atts: list of attributes to download from biomart | |
atts = c( | |
"refsnp_id", | |
"chr_name", | |
"chrom_start", | |
"chrom_strand", | |
"allele", | |
"minor_allele", | |
"minor_allele_freq", | |
"clinical_significance", | |
"phenotype_description", | |
"associated_gene", | |
"phenotype_name", | |
"associated_variant_risk_allele", | |
"consequence_type_tv", | |
"consequence_allele_string", | |
"ensembl_type", | |
"ensembl_gene_stable_id", | |
"polyphen_prediction", | |
"sift_prediction", | |
"validated" | |
) | |
filts = "snp_filter" | |
snpmart = useMart("snp") | |
snpdataset = useDataset("hsapiens_snp", mart=snpmart) | |
snps = read.table(arg_values$snp_list, header=F) | |
# Since there are too many snps, split the dataset in chunks and call getBM multiple times | |
x <- seq_along(snps$V1) | |
snps.split = split(snps$V1, ceiling(x/500)) | |
#snps.split = split(snps$V1, ceiling(x/5)) | |
# initialize an empty data.frame to store results | |
biomart_results = data.frame(matrix(ncol=length(atts))) | |
names(biomart_results) = atts | |
print(paste("starting download at", Sys.time())) | |
counter = 1 | |
# Download annotations, 500 SNPs at a time | |
for (snp_subset in snps.split) { | |
print(paste("getting chunk", counter, "of", length(snps.split), Sys.time())) | |
counter = counter + 1 | |
biomart_results = rbind(biomart_results, getBM(attributes = atts, filters=filts, values=snp_subset, mart=snpdataset)) | |
Sys.sleep(2) | |
} | |
biomart_results = biomart_results[2:nrow(biomart_results),] | |
print(head(biomart_results)) | |
# Write output table | |
write.table(biomart_results, arg_values$outputfile, row.names=F) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment