@dalloliogm
Created July 11, 2012 09:35
get_snpannotation_from_biomart.R
#!/usr/bin/env Rscript
description = "
Given a list of SNPs, get their disease-association status from Biomart.
++++++++++++++++++
Parameters
++++++++++++++++++
-p snp_list
The snp_list file contains the SNPs to analyze, one per line.
Example:
::
rs10000
rs20000
rs13323
rs31231
See also the file data/snps/all_pathways.snps for an example of input file.
-o output_file
The output file is the name of the file where you want to save the results.
This will be a csv file containing one column for each attribute defined in the 'atts' variable.
"
# NOTE: to install biomaRt:
# source("http://bioconductor.org/biocLite.R")
# biocLite("biomaRt")
require(biomaRt)
require(optparse)
option_list <- list(
    make_option(
        c("--snp_list", "-p"),
        default="data/snps/all_pathways.snps",
        dest="snp_list", type="character",
        help="File containing the list of SNPs to analyze, one per line. [default %default]"
    ),
    make_option(
        c("--outputfile", "-o"),
        default="results/tables/all_pathways_snps_annotations.csv", dest="outputfile",
        help="Output file name [default %default]"
    )
)
arg_values = parse_args(OptionParser(option_list = option_list, usage=description))
# Atts: list of attributes to download from biomart
atts = c(
"refsnp_id",
"chr_name",
"chrom_start",
"chrom_strand",
"allele",
"minor_allele",
"minor_allele_freq",
"clinical_significance",
"phenotype_description",
"associated_gene",
"phenotype_name",
"associated_variant_risk_allele",
"consequence_type_tv",
"consequence_allele_string",
"ensembl_type",
"ensembl_gene_stable_id",
"polyphen_prediction",
"sift_prediction",
"validated"
)
filts = "snp_filter"
snpmart = useMart("snp")
snpdataset = useDataset("hsapiens_snp", mart=snpmart)
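# Optional check (a sketch, not required for the download): the attribute and
# filter names used here can be verified against the current BioMart release
# with listAttributes() and listFilters(), e.g.:
#   available_atts <- listAttributes(snpdataset)
#   available_atts[grep("phenotype", available_atts$name), ]
#   listFilters(snpdataset)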
# Read the SNP identifiers (one per line); they end up in column V1
snps = read.table(arg_values$snp_list, header=F)
# The SNP list is too long for a single BioMart query, so split it into
# chunks of 500 and call getBM once per chunk
x <- seq_along(snps$V1)
snps.split = split(snps$V1, ceiling(x/500))
#snps.split = split(snps$V1, ceiling(x/5))   # smaller chunks, handy for testing
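# Optional sanity check (a small added sketch): the chunks together should
# cover the whole input list
stopifnot(sum(sapply(snps.split, length)) == length(snps$V1))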
# initialize the results data.frame; it starts with a single row of NAs,
# which is dropped after the download loop
biomart_results = data.frame(matrix(ncol=length(atts)))
names(biomart_results) = atts
print(paste("starting download at", Sys.time()))
counter = 1
# Download annotations, 500 SNPs at a time
for (snp_subset in snps.split) {
    print(paste("getting chunk", counter, "of", length(snps.split), Sys.time()))
    counter = counter + 1
    biomart_results = rbind(
        biomart_results,
        getBM(attributes=atts, filters=filts, values=snp_subset, mart=snpdataset)
    )
    Sys.sleep(2)
}
# drop the initial row of NAs used to create the data.frame
biomart_results = biomart_results[2:nrow(biomart_results),]
print(head(biomart_results))
# Write the output table (comma-separated, as described in the usage above)
write.table(biomart_results, arg_values$outputfile, row.names=F, sep=",")
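# The saved table can be reloaded later with, for example (assuming the
# default output path and the comma separator used above):
#   annotations <- read.table("results/tables/all_pathways_snps_annotations.csv",
#                             header=TRUE, sep=",")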