bschiffthaler/snpedia.R

## snpedia.R
library(SNPediaR)
library(tidyverse)

# Original function has som issues with parsing that need fixing
getPages2 <- function (titles, verbose = FALSE, limit = 50, wikiParseFunction = identity,
                       baseURL, format, query, ...)
{
  if (missing(baseURL))
    baseURL <- "https://bots.snpedia.com/api.php"
  if (missing(format))
    format <- "format=json"
  if (missing(query))
    query <- "action=query&prop=revisions&rvprop=content&titles="
  baseURL <- paste0(baseURL, "?", format, "&", query)
  Np <- length(titles)
  Cp <- 0
  titles <- RCurl::curlEscape(titles)
  n.batches <- 1 + length(titles)%/%limit
  suppressWarnings(titles <- split(titles, 1:n.batches))
  titles <- sapply(titles, paste, collapse = "|")
  res <- list()
  for (tit in titles) {
    pagesURL <- paste0(baseURL, tit)
    if (verbose) {
      Cp <- Cp + limit
      cat(format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "Downloading",
          min(Cp, Np), "of", Np, "pages ...", pagesURL,
          fill = TRUE)
    }
    curl_success = FALSE
    while (! curl_success)
    {
      tryCatch({
        datos <- RCurl::getURL(pagesURL)
        datos <- gsub("\\n", "\\\\n", datos)
        datos <- gsub("\\t", "\\\\t", datos)
        datos <- gsub("\\\\\\\\x", "~x", datos)
        datos <- gsub("\\\\x", "~x", datos)
        datos <- gsub("\\| *", "\\|", datos)
        datos <- gsub(" *= *", "=", datos)
        datos <- gsub("w\\\\o", "without", datos)
        datos <- jsonlite::fromJSON(datos)
        datos <- datos[["query"]][["pages"]]
        nombres <- sapply(datos, function(x) x[["title"]])
        datos <- lapply(datos, function(x) x[["revisions"]][["*"]])
        names(datos) <- nombres
        datos <- lapply(datos, wikiParseFunction, ...)
        res <- c(res, datos)
        curl_success <- TRUE
      }, finally = {})
    }
  }
  return(res)
}

# Needed to orient some SNPs in accordance with SNPedia. 23andme reports all SNPs
# on the plus strand, but SNPedia reports some on minus.
complement <- c('A' = 'T', 'C' = 'G', 'G' = 'C', 'T' = 'A', 'D' = 'D', '-' = '-', 'I' = 'I')

make_complement <- function(x, o) {
  sapply(seq_along(x), function(i){
    y <- x[i]
    if (o[i] == 'minus') {
      paste(complement[unlist(str_split(y, ''))], collapse = '')
    } else {
      y
    }
  })
}

# Get all pages in SNPedia with a genotype entry (takes a while)
snpedia_gts <- SNPediaR::getCategoryElements('Is_a_genotype')

# Get all SNPedia snps that are relevant. We also need to get orientation info
# in case we need to complement
snpedia_snps <- SNPediaR::getCategoryElements('On_chip_23andMe_v5')
snpedia_snps <- intersect(unique(sapply(str_split(snpedia_gts, '\\('), '[[', 1)), snpedia_snps)
snpedia_snp_tags <- getPages2(snpedia_snps, verbose = TRUE)
orientation <- as_tibble(do.call(rbind, lapply(seq_along(snpedia_snp_tags), function(i) {
  page <- snpedia_snp_tags[[i]]
  name <- names(snpedia_snp_tags)[i]
  c('name' = name, extractSnpTags(page, c('rsid', 'StabilizedOrientation')))
})))

# Load all my SNP data
my_snps <- read_tsv(file.path('~/Downloads/',
                              'genome_Bastian_Schiffthaler_v5_Full_20190326095221.txt'),
                    col_names = c("rsid", "chr", "pos", "gt"), comment = '#',
                    col_types = 'ccnc')

my_snps$orientation <- orientation$StabilizedOrientation[match(my_snps$rsid, str_to_lower(orientation$name))]
my_snps <- my_snps %>%
  # Filter out SNPs without orientation info in SNPedia
  filter(!is.na(orientation)) %>%
  # Complement SNPs on minus strand
  mutate(gt_s = make_complement(gt, orientation)) %>%
  # Create genotype string to query SNPedia
  mutate(geno = paste0(str_to_title(rsid), '(', str_sub(gt_s, 1, 1), ';', str_sub(gt_s, 2, 2), ')'))

# Filter such that we only have SNPs with genotype pages in SNPedia
my_snps_f <- my_snps %>%
  filter(geno %in% snpedia_gts)

# Get page data
my_pages <- getPages(my_snps_f$geno, wikiParseFunction = extractGenotypeTags)

# Wrap into a tibble
my_gt_info <- as_tibble(do.call(rbind, my_pages))
my_gt_info %>% arrange(desc(magnitude)) %>% filter(repute == 'Good' & magnitude >= 1) -> x
	library(SNPediaR)
	library(tidyverse)

	# Original function has som issues with parsing that need fixing
	getPages2 <- function (titles, verbose = FALSE, limit = 50, wikiParseFunction = identity,
	baseURL, format, query, ...)
	{
	if (missing(baseURL))
	baseURL <- "https://bots.snpedia.com/api.php"
	if (missing(format))
	format <- "format=json"
	if (missing(query))
	query <- "action=query&prop=revisions&rvprop=content&titles="
	baseURL <- paste0(baseURL, "?", format, "&", query)
	Np <- length(titles)
	Cp <- 0
	titles <- RCurl::curlEscape(titles)
	n.batches <- 1 + length(titles)%/%limit
	suppressWarnings(titles <- split(titles, 1:n.batches))
	titles <- sapply(titles, paste, collapse = "\|")
	res <- list()
	for (tit in titles) {
	pagesURL <- paste0(baseURL, tit)
	if (verbose) {
	Cp <- Cp + limit
	cat(format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "Downloading",
	min(Cp, Np), "of", Np, "pages ...", pagesURL,
	fill = TRUE)
	}
	curl_success = FALSE
	while (! curl_success)
	{
	tryCatch({
	datos <- RCurl::getURL(pagesURL)
	datos <- gsub("\\n", "\\\\n", datos)
	datos <- gsub("\\t", "\\\\t", datos)
	datos <- gsub("\\\\\\\\x", "~x", datos)
	datos <- gsub("\\\\x", "~x", datos)
	datos <- gsub("\\\| *", "\\\|", datos)
	datos <- gsub(" = ", "=", datos)
	datos <- gsub("w\\\\o", "without", datos)
	datos <- jsonlite::fromJSON(datos)
	datos <- datos[["query"]][["pages"]]
	nombres <- sapply(datos, function(x) x[["title"]])
	datos <- lapply(datos, function(x) x[["revisions"]][["*"]])
	names(datos) <- nombres
	datos <- lapply(datos, wikiParseFunction, ...)
	res <- c(res, datos)
	curl_success <- TRUE
	}, finally = {})
	}
	}
	return(res)
	}

	# Needed to orient some SNPs in accordance with SNPedia. 23andme reports all SNPs
	# on the plus strand, but SNPedia reports some on minus.
	complement <- c('A' = 'T', 'C' = 'G', 'G' = 'C', 'T' = 'A', 'D' = 'D', '-' = '-', 'I' = 'I')

	make_complement <- function(x, o) {
	sapply(seq_along(x), function(i){
	y <- x[i]
	if (o[i] == 'minus') {
	paste(complement[unlist(str_split(y, ''))], collapse = '')
	} else {
	y
	}
	})
	}

	# Get all pages in SNPedia with a genotype entry (takes a while)
	snpedia_gts <- SNPediaR::getCategoryElements('Is_a_genotype')

	# Get all SNPedia snps that are relevant. We also need to get orientation info
	# in case we need to complement
	snpedia_snps <- SNPediaR::getCategoryElements('On_chip_23andMe_v5')
	snpedia_snps <- intersect(unique(sapply(str_split(snpedia_gts, '\\('), '[[', 1)), snpedia_snps)
	snpedia_snp_tags <- getPages2(snpedia_snps, verbose = TRUE)
	orientation <- as_tibble(do.call(rbind, lapply(seq_along(snpedia_snp_tags), function(i) {
	page <- snpedia_snp_tags[[i]]
	name <- names(snpedia_snp_tags)[i]
	c('name' = name, extractSnpTags(page, c('rsid', 'StabilizedOrientation')))
	})))

	# Load all my SNP data
	my_snps <- read_tsv(file.path('~/Downloads/',
	'genome_Bastian_Schiffthaler_v5_Full_20190326095221.txt'),
	col_names = c("rsid", "chr", "pos", "gt"), comment = '#',
	col_types = 'ccnc')

	my_snps$orientation <- orientation$StabilizedOrientation[match(my_snps$rsid, str_to_lower(orientation$name))]
	my_snps <- my_snps %>%
	# Filter out SNPs without orientation info in SNPedia
	filter(!is.na(orientation)) %>%
	# Complement SNPs on minus strand
	mutate(gt_s = make_complement(gt, orientation)) %>%
	# Create genotype string to query SNPedia
	mutate(geno = paste0(str_to_title(rsid), '(', str_sub(gt_s, 1, 1), ';', str_sub(gt_s, 2, 2), ')'))

	# Filter such that we only have SNPs with genotype pages in SNPedia
	my_snps_f <- my_snps %>%
	filter(geno %in% snpedia_gts)

	# Get page data
	my_pages <- getPages(my_snps_f$geno, wikiParseFunction = extractGenotypeTags)

	# Wrap into a tibble
	my_gt_info <- as_tibble(do.call(rbind, my_pages))
	my_gt_info %>% arrange(desc(magnitude)) %>% filter(repute == 'Good' & magnitude >= 1) -> x