zkamvar/hapmap2df.r

## hapmap2df.r
#==============================================================================#
# hapmap2df.r Convert hapmap files to a data frame.
#
# Author: Zhian N. Kamvar
# License: GPLv3
# Year: 2013
# Waranty: NONE
#
# This function will read in hapmap formatted files and convert them to a data
# frame for the purposes of importing them to genlight objects. Example:
#
# if(!require(adegenet)){
#   install.packages("adegenet")
# }
# hmp <- read.table(my_hapmap_file.hmp, sep = "\t", skip = 1)
# names(hmp) <- strsplit(readLines(my_hapmap_file.hmp, 1), "\t")[[1]]
# hmp.df <- hapmap2df(hmp)
#
# # If you have windows:
# hmp.gl <- new("genlight", hmp.df, mc.cores = 1L)
#
# # Else:
# hmp.gl <- new("genlight", hmp.df)
#
# # Visual stuff from the adegenet genomics manual:
# plot(hmp.gl)
# myFreq <- glMean(hmp.gl)
# myFreq <- c(myFreq, 1-myFreq)
# hist(myFreq, proba = TRUE, col = "darkseagreen3", xlab = "Allele frequencies",
#      main = "Distribution of allele frequencies", nclass = 20)
# temp <- density(myFreq, bw = .05)
# lines(temp$x, temp$y*2, lwd = 3)
#==============================================================================#

hapmap2df <- function(x, gt.col = 2, first.col = 12, last.col = ncol(x)){
  ambi <- c("S","M","Y","R","K","W")
  names(ambi) <- c("A/T","C/G","A/C","G/T","A/G","C/T")
  genotypes <- levels(x[[gt.col]])
  major_alleles <- unlist(strsplit(genotypes, "/"))
  major_alleles <- major_alleles[1:(length(major_alleles)) %% 2 == 1]
  print(major_alleles)
  levels(x[[gt.col]]) <- major_alleles
  gtlevs <- unique(levels(x[[gt.col]]))
  print(gtlevs)
  x[c(gt.col, first.col:last.col)] <- lapply(x[c(gt.col, first.col:last.col)], as.character)
  x[first.col:last.col][apply(x[first.col:last.col], 2, `==`, x[[gt.col]])] <- 2
  x[first.col:last.col][apply(x[first.col:last.col], 2, `%in%`, gtlevs)] <- 0
  x[first.col:last.col][apply(x[first.col:last.col], 2, `%in%`, "N")] <- NA
  x[first.col:last.col][apply(x[first.col:last.col], 2, `%in%`, ambi)] <- 1
  out_matrix <- t(x[first.col:last.col])
  out_matrix <- apply(out_matrix, 2, as.numeric)
  colnames(out_matrix) <- x[[1]]
  rownames(out_matrix) <- names(x)[first.col:last.col]
  return(out_matrix)
}

if ((.Platform)$OS.type == "windows"){
  mclapply <- function (X, FUN, ..., mc.preschedule = TRUE, mc.set.seed = TRUE,
                        mc.silent = FALSE, mc.cores = 1L, mc.cleanup = TRUE,
                        mc.allow.recursive = TRUE){
    cores <- as.integer(mc.cores)
    if (cores < 1L)
      stop("'mc.cores' must be >= 1")
    if (cores > 1L)
      lapply(X, FUN, ...)
  }
}
	#==============================================================================#
	# hapmap2df.r Convert hapmap files to a data frame.
	#
	# Author: Zhian N. Kamvar
	# License: GPLv3
	# Year: 2013
	# Waranty: NONE
	#
	# This function will read in hapmap formatted files and convert them to a data
	# frame for the purposes of importing them to genlight objects. Example:
	#
	# if(!require(adegenet)){
	# install.packages("adegenet")
	# }
	# hmp <- read.table(my_hapmap_file.hmp, sep = "\t", skip = 1)
	# names(hmp) <- strsplit(readLines(my_hapmap_file.hmp, 1), "\t")[[1]]
	# hmp.df <- hapmap2df(hmp)
	#
	# # If you have windows:
	# hmp.gl <- new("genlight", hmp.df, mc.cores = 1L)
	#
	# # Else:
	# hmp.gl <- new("genlight", hmp.df)
	#
	# # Visual stuff from the adegenet genomics manual:
	# plot(hmp.gl)
	# myFreq <- glMean(hmp.gl)
	# myFreq <- c(myFreq, 1-myFreq)
	# hist(myFreq, proba = TRUE, col = "darkseagreen3", xlab = "Allele frequencies",
	# main = "Distribution of allele frequencies", nclass = 20)
	# temp <- density(myFreq, bw = .05)
	# lines(temp$x, temp$y*2, lwd = 3)
	#==============================================================================#

	hapmap2df <- function(x, gt.col = 2, first.col = 12, last.col = ncol(x)){
	ambi <- c("S","M","Y","R","K","W")
	names(ambi) <- c("A/T","C/G","A/C","G/T","A/G","C/T")
	genotypes <- levels(x[[gt.col]])
	major_alleles <- unlist(strsplit(genotypes, "/"))
	major_alleles <- major_alleles[1:(length(major_alleles)) %% 2 == 1]
	print(major_alleles)
	levels(x[[gt.col]]) <- major_alleles
	gtlevs <- unique(levels(x[[gt.col]]))
	print(gtlevs)
	x[c(gt.col, first.col:last.col)] <- lapply(x[c(gt.col, first.col:last.col)], as.character)
	x[first.col:last.col][apply(x[first.col:last.col], 2, `==`, x[[gt.col]])] <- 2
	x[first.col:last.col][apply(x[first.col:last.col], 2, `%in%`, gtlevs)] <- 0
	x[first.col:last.col][apply(x[first.col:last.col], 2, `%in%`, "N")] <- NA
	x[first.col:last.col][apply(x[first.col:last.col], 2, `%in%`, ambi)] <- 1
	out_matrix <- t(x[first.col:last.col])
	out_matrix <- apply(out_matrix, 2, as.numeric)
	colnames(out_matrix) <- x[[1]]
	rownames(out_matrix) <- names(x)[first.col:last.col]
	return(out_matrix)
	}

	if ((.Platform)$OS.type == "windows"){
	mclapply <- function (X, FUN, ..., mc.preschedule = TRUE, mc.set.seed = TRUE,
	mc.silent = FALSE, mc.cores = 1L, mc.cleanup = TRUE,
	mc.allow.recursive = TRUE){
	cores <- as.integer(mc.cores)
	if (cores < 1L)
	stop("'mc.cores' must be >= 1")
	if (cores > 1L)
	lapply(X, FUN, ...)
	}
	}