Created
September 11, 2019 07:25
-
-
Save bschiffthaler/d899730d23e3a32c194a1d52c92b8b2c to your computer and use it in GitHub Desktop.
Analyze your own SNP data from 23andMe using SNPedia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(SNPediaR) | |
library(tidyverse) | |
# Original function has som issues with parsing that need fixing | |
getPages2 <- function (titles, verbose = FALSE, limit = 50, wikiParseFunction = identity, | |
baseURL, format, query, ...) | |
{ | |
if (missing(baseURL)) | |
baseURL <- "https://bots.snpedia.com/api.php" | |
if (missing(format)) | |
format <- "format=json" | |
if (missing(query)) | |
query <- "action=query&prop=revisions&rvprop=content&titles=" | |
baseURL <- paste0(baseURL, "?", format, "&", query) | |
Np <- length(titles) | |
Cp <- 0 | |
titles <- RCurl::curlEscape(titles) | |
n.batches <- 1 + length(titles)%/%limit | |
suppressWarnings(titles <- split(titles, 1:n.batches)) | |
titles <- sapply(titles, paste, collapse = "|") | |
res <- list() | |
for (tit in titles) { | |
pagesURL <- paste0(baseURL, tit) | |
if (verbose) { | |
Cp <- Cp + limit | |
cat(format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "Downloading", | |
min(Cp, Np), "of", Np, "pages ...", pagesURL, | |
fill = TRUE) | |
} | |
curl_success = FALSE | |
while (! curl_success) | |
{ | |
tryCatch({ | |
datos <- RCurl::getURL(pagesURL) | |
datos <- gsub("\\n", "\\\\n", datos) | |
datos <- gsub("\\t", "\\\\t", datos) | |
datos <- gsub("\\\\\\\\x", "~x", datos) | |
datos <- gsub("\\\\x", "~x", datos) | |
datos <- gsub("\\| *", "\\|", datos) | |
datos <- gsub(" *= *", "=", datos) | |
datos <- gsub("w\\\\o", "without", datos) | |
datos <- jsonlite::fromJSON(datos) | |
datos <- datos[["query"]][["pages"]] | |
nombres <- sapply(datos, function(x) x[["title"]]) | |
datos <- lapply(datos, function(x) x[["revisions"]][["*"]]) | |
names(datos) <- nombres | |
datos <- lapply(datos, wikiParseFunction, ...) | |
res <- c(res, datos) | |
curl_success <- TRUE | |
}, finally = {}) | |
} | |
} | |
return(res) | |
} | |
# Needed to orient some SNPs in accordance with SNPedia. 23andme reports all SNPs | |
# on the plus strand, but SNPedia reports some on minus. | |
complement <- c('A' = 'T', 'C' = 'G', 'G' = 'C', 'T' = 'A', 'D' = 'D', '-' = '-', 'I' = 'I') | |
make_complement <- function(x, o) { | |
sapply(seq_along(x), function(i){ | |
y <- x[i] | |
if (o[i] == 'minus') { | |
paste(complement[unlist(str_split(y, ''))], collapse = '') | |
} else { | |
y | |
} | |
}) | |
} | |
# Get all pages in SNPedia with a genotype entry (takes a while) | |
snpedia_gts <- SNPediaR::getCategoryElements('Is_a_genotype') | |
# Get all SNPedia snps that are relevant. We also need to get orientation info | |
# in case we need to complement | |
snpedia_snps <- SNPediaR::getCategoryElements('On_chip_23andMe_v5') | |
snpedia_snps <- intersect(unique(sapply(str_split(snpedia_gts, '\\('), '[[', 1)), snpedia_snps) | |
snpedia_snp_tags <- getPages2(snpedia_snps, verbose = TRUE) | |
orientation <- as_tibble(do.call(rbind, lapply(seq_along(snpedia_snp_tags), function(i) { | |
page <- snpedia_snp_tags[[i]] | |
name <- names(snpedia_snp_tags)[i] | |
c('name' = name, extractSnpTags(page, c('rsid', 'StabilizedOrientation'))) | |
}))) | |
# Load all my SNP data | |
my_snps <- read_tsv(file.path('~/Downloads/', | |
'genome_Bastian_Schiffthaler_v5_Full_20190326095221.txt'), | |
col_names = c("rsid", "chr", "pos", "gt"), comment = '#', | |
col_types = 'ccnc') | |
my_snps$orientation <- orientation$StabilizedOrientation[match(my_snps$rsid, str_to_lower(orientation$name))] | |
my_snps <- my_snps %>% | |
# Filter out SNPs without orientation info in SNPedia | |
filter(!is.na(orientation)) %>% | |
# Complement SNPs on minus strand | |
mutate(gt_s = make_complement(gt, orientation)) %>% | |
# Create genotype string to query SNPedia | |
mutate(geno = paste0(str_to_title(rsid), '(', str_sub(gt_s, 1, 1), ';', str_sub(gt_s, 2, 2), ')')) | |
# Filter such that we only have SNPs with genotype pages in SNPedia | |
my_snps_f <- my_snps %>% | |
filter(geno %in% snpedia_gts) | |
# Get page data | |
my_pages <- getPages(my_snps_f$geno, wikiParseFunction = extractGenotypeTags) | |
# Wrap into a tibble | |
my_gt_info <- as_tibble(do.call(rbind, my_pages)) | |
my_gt_info %>% arrange(desc(magnitude)) %>% filter(repute == 'Good' & magnitude >= 1) -> x |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment