@dalloliogm
Created July 11, 2012 09:35
get_snpannotation_from_biomart.R
#!/usr/bin/env Rscript
description = "
Given a list of SNPs, get their disease-association status from Biomart.
++++++++++++++++++
Parameters
++++++++++++++++++
-p snp_list
The snp_list file contains the SNPs to analyze, one per line.
Example:
::
rs10000
rs20000
rs13323
rs31231
See also the file data/snps/all_pathways.snps for an example of input file.
-o output_file
The output file is the name of the file where you want to save the results.
This will be a csv file containing one column for each attribute defined in the 'atts' variable.
"
# NOTE: to install biomaRt:
# source("http://bioconductor.org/biocLite.R")
# biocLite("biomaRt")
require(biomaRt)
require(optparse)
option_list <- list(
    make_option(
        c("--snp_list", "-p"),
        default="data/snps/all_pathways.snps",
        dest="snp_list", type="character",
        help="File containing the list of SNPs to analyze, one per line. [default %default]"
    ),
    make_option(
        c("--outputfile", "-o"),
        default="results/tables/all_pathways_snps_annotations.csv", dest="outputfile",
        help="Output file name [default %default]"
    )
)
arg_values = parse_args(OptionParser(option_list = option_list, usage=description))
# Atts: list of attributes to download from biomart
atts = c(
"refsnp_id",
"chr_name",
"chrom_start",
"chrom_strand",
"allele",
"minor_allele",
"minor_allele_freq",
"clinical_significance",
"phenotype_description",
"associated_gene",
"phenotype_name",
"associated_variant_risk_allele",
"consequence_type_tv",
"consequence_allele_string",
"ensembl_type",
"ensembl_gene_stable_id",
"polyphen_prediction",
"sift_prediction",
"validated"
)
filts = "snp_filter"
snpmart = useMart("snp")
snpdataset = useDataset("hsapiens_snp", mart=snpmart)
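# Optional check (a sketch, not required for the download): the attribute and
# filter names used here can be verified against the current BioMart release
# with listAttributes() and listFilters(), e.g.:
#   available_atts <- listAttributes(snpdataset)
#   available_atts[grep("phenotype", available_atts$name), ]
#   listFilters(snpdataset)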
# Read the SNP identifiers (one per line); they end up in column V1
snps = read.table(arg_values$snp_list, header=F)
# The SNP list is too long for a single BioMart query, so split it into
# chunks of 500 and call getBM once per chunk
x <- seq_along(snps$V1)
snps.split = split(snps$V1, ceiling(x/500))
#snps.split = split(snps$V1, ceiling(x/5))   # smaller chunks, handy for testing
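# Optional sanity check (a small added sketch): the chunks together should
# cover the whole input list
stopifnot(sum(sapply(snps.split, length)) == length(snps$V1))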
# initialize the results data.frame; it starts with a single row of NAs,
# which is dropped after the download loop
biomart_results = data.frame(matrix(ncol=length(atts)))
names(biomart_results) = atts
print(paste("starting download at", Sys.time()))
counter = 1
# Download annotations, 500 SNPs at a time
for (snp_subset in snps.split) {
    print(paste("getting chunk", counter, "of", length(snps.split), Sys.time()))
    counter = counter + 1
    biomart_results = rbind(
        biomart_results,
        getBM(attributes=atts, filters=filts, values=snp_subset, mart=snpdataset)
    )
    Sys.sleep(2)
}
# drop the initial row of NAs used to create the data.frame
biomart_results = biomart_results[2:nrow(biomart_results),]
print(head(biomart_results))
# Write the output table (comma-separated, as described in the usage above)
write.table(biomart_results, arg_values$outputfile, row.names=F, sep=",")
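# The saved table can be reloaded later with, for example (assuming the
# default output path and the comma separator used above):
#   annotations <- read.table("results/tables/all_pathways_snps_annotations.csv",
#                             header=TRUE, sep=",")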