Skip to content

Instantly share code, notes, and snippets.

@zkamvar
Last active August 29, 2015 14:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zkamvar/b64078a0d04d2452c905 to your computer and use it in GitHub Desktop.
Save zkamvar/b64078a0d04d2452c905 to your computer and use it in GitHub Desktop.
Script to fix any bugs that occur in the R code in poppr. This unfortunately does not fix bugs that occur in the C code.
#==============================================================================#
# UPDATED: 2015-07-17
#
# USAGE/INSTALLATION:
# with devtools - devtools::source_gist("b64078a0d04d2452c905")
# downloaded - source("poppr_patches.R")
#
# UPDATES:
# Fix read.genalex ( see: https://github.com/grunwaldlab/poppr/issues/58 )
#
#==============================================================================#
fix_version <- package_version("2.0.2")
if (packageVersion("poppr") < fix_version) stop("Please install the latest version of poppr and rerun this script.")
if (packageVersion("poppr") > fix_version) stop("You likely have the correct version. If not, install from github.")
read.genalex <- function(genalex, ploidy = 2, geo = FALSE, region = FALSE,
genclone = TRUE, sep = ",", recode = FALSE){
# The first two lines from a genalex file contain all of the information about
# the structure of the file (except for ploidy and geographic info)
gencall <- match.call()
all.info <- strsplit(readLines(genalex, n = 2), sep)
cskip <- ifelse("connection" %in% class(genalex), 0, 2)
gena <- utils::read.table(genalex, sep = sep, header = TRUE, skip = cskip,
stringsAsFactors = FALSE, check.names = FALSE)
num.info <- as.numeric(all.info[[1]])
pop.info <- all.info[[2]][-c(1:3)]
num.info <- num.info[!is.na(num.info)]
pop.info <- pop.info[!pop.info %in% c("", NA)]
nloci <- num.info[1]
ninds <- num.info[2]
npops <- num.info[3]
# Removing all null columns
if (any(is.na(gena[1, ]))){
gena <- gena[, !is.na(gena[1, ]), drop = FALSE]
}
#----------------------------------------------------------------------------#
# Checking for extra information such as Regions or XY coordinates
#----------------------------------------------------------------------------#
# Creating vectors that correspond to the different information fields. If the
# regions are true, then the length of the pop.info should be equal to the
# number of populations "npop"(npops) plus the number of regions which is the
# npop+4th entry in the vector. Note that this strategy will only work if the
# name of the first region does not match any of the populations.
clm <- ncol(gena)
if (region == TRUE && !is.na(num.info[npops + 4]) && length(pop.info) == npops + num.info[npops + 4]){
# Info for the number of columns the loci can take on.
loci.adj <- c(nloci, nloci*ploidy)
# First question, do you have two or four extra columns? Two extra would
# indicate no geographic data. Four extra would indicate geographic data.
# Both of these indicate that, while a regional specification exists, a
# column indicating the regions was not specified, so it needs to be created
if (((clm %in% (loci.adj + 4)) & (geo == TRUE)) | (clm %in% (loci.adj + 2))){
pop.vec <- gena[, 2]
ind.vec <- gena[, 1]
xy <- gena[, c((clm-1), clm)]
region.inds <- ((npops + 5):length(num.info)) # Indices for the regions
reg.inds <- num.info[region.inds] # Number of individuals per region
reg.names <- all.info[[2]][region.inds] # Names of the regions
reg.vec <- rep(reg.names, reg.inds) # Paste into a single vector
if (geo == TRUE){
geoinds <- c((clm - 1), clm)
xy <- gena[, geoinds]
gena <- gena[, -geoinds, drop = FALSE]
} else {
xy <- NULL
}
gena <- gena[, c(-1, -2), drop = FALSE]
} else {
pop.vec <- ifelse(any(gena[, 1] == pop.info[1]), 1, 2)
reg.vec <- ifelse(pop.vec == 2, 1, 2)
orig.ind.vec <- NULL
reg.vec <- gena[, reg.vec] # Regional Vector
pop.vec <- gena[, pop.vec] # Population Vector
if (geo == TRUE){
geoinds <- c((clm-1), clm)
xy <- gena[, geoinds]
gena <- gena[, -geoinds, drop = FALSE]
} else {
xy <- NULL
}
ind.vec <- gena[, clm] # Individual Vector
gena <- gena[, -c(1, 2, clm), drop = FALSE] # removing the non-genotypic columns
}
} else if (geo == TRUE & length(pop.info) == npops){
# There are no Regions specified, but there are geographic coordinates
reg.vec <- NULL
pop.vec <- gena[, 2]
ind.vec <- gena[, 1]
xy <- gena[, c((clm-1), clm)]
gena <- gena[, -c(1, 2, (clm-1), clm), drop = FALSE]
} else {
# There are no Regions or geographic coordinates
reg.vec <- NULL
pop.vec <- gena[, 2]
ind.vec <- gena[, 1]
xy <- NULL
gena <- gena[, -c(1, 2), drop = FALSE]
}
#----------------------------------------------------------------------------#
# The genotype matrix has been isolated at this point. Now this will
# reconstruct the matrix in a way that adegenet likes it.
#----------------------------------------------------------------------------#
clm <- ncol(gena)
gena.mat <- as.matrix(gena)
# Checking for greater than haploid data.
if (nloci == clm/ploidy & ploidy > 1){
# Missing data in genalex is coded as "0" for non-presence/absence data.
# this converts it to "NA" for adegenet.
if(any(gena.mat == "0") & ploidy < 3){
gena[gena.mat == "0"] <- NA
}
type <- 'codom'
loci <- which((1:clm) %% ploidy == 1)
gena2 <- gena[, loci, drop = FALSE]
# Collapsing all alleles into single loci.
lapply(loci, function(x) gena2[, ((x-1)/ploidy)+1] <<-
poppr:::pop_combiner(gena, hier = x:(x+ploidy-1), sep = "/"))
# Treating missing data.
gena2[gena2 == paste(rep("0", ploidy), collapse = "/")] <- NA_character_
gena2[gena2 == paste(rep("NA", ploidy), collapse = "/")] <- NA_character_
res.gid <- adegenet::df2genind(gena2, sep="/", ind.names = ind.vec, pop = pop.vec,
ploidy = ploidy, type = type)
} else if (nloci == clm & all(gena.mat %in% as.integer(-1:1))) {
# Checking for AFLP data.
# Missing data in genalex is coded as "-1" for presence/absence data.
# this converts it to "NA" for adegenet.
if(any(gena.mat == -1L)){
gena[gena.mat == -1L] <- NA
}
type <- 'PA'
res.gid <- adegenet::df2genind(gena, ind.names = ind.vec, pop = pop.vec,
ploidy = ploidy, type = type, ncode = 1)
} else if (nloci == clm & !all(gena.mat %in% as.integer(-1:1))) {
# Checking for haploid microsatellite data or SNP data
if(any(gena.mat == "0")){
gena[gena.mat == "0"] <- NA
}
type <- 'codom'
res.gid <- adegenet::df2genind(gena, ind.names = ind.vec, pop = pop.vec,
ploidy = 1, type = type)
} else {
weirdomsg <- paste("Something went wrong. Please ensure that your data is",
"formatted correctly:\n\n",
"Is the ploidy flag set to the maximum ploidy of your data?\n",
ifelse(geo,
"You set geo = TRUE. Do you have geographic coordinates in the right place?\n",
"If you have geographic coordinates: did you set the flag?\n"),
ifelse(region,
"You set region = TRUE. Do you have regional data in the right place?\n\n",
"If you have regional data: did you set the flag?\n\n"),
"Otherwise, the problem may lie within the data structure itself.")
stop(weirdomsg)
}
if (any(duplicated(ind.vec))){
# ensuring that all names are unique
adegenet::indNames(res.gid) <- paste("ind", 1:length(ind.vec))
res.gid@other[["original_names"]] <- ind.vec
}
res.gid@call <- gencall
ind.vec <- adegenet::indNames(res.gid)
names(pop.vec) <- ind.vec
# Keep the name if it's a URL
if (length(grep("://", genalex)) < 1 & !"connection" %in% class(genalex)){
res.gid@call[2] <- basename(genalex)
}
if (region){
names(reg.vec) <- ind.vec
adegenet::strata(res.gid) <- data.frame(Pop = pop.vec[ind.vec], Region = reg.vec[ind.vec])
} else {
adegenet::strata(res.gid) <- data.frame(Pop = pop.vec[ind.vec])
}
if (geo){
if (!all(c("x", "y") %in% colnames(xy))){
colnames(xy) <- c("x", "y")
}
if (nrow(xy) == length(ind.vec)){
rownames(xy) <- ind.vec
res.gid@other[["xy"]] <- xy[ind.vec, ]
} else {
res.gid@other[["xy"]] <- xy
}
}
if (genclone){
res.gid <- poppr::as.genclone(res.gid)
}
if (recode){
res.gid <- poppr::recode_polyploids(res.gid, newploidy = TRUE)
}
return(res.gid)
}
message("\nfixing function 'read.genalex' ...")
assignInNamespace("read.genalex", read.genalex, ns = "poppr")
message("\nEverything is up to date!")
# Procedure:
# 1. Write fixed function
# 2. Display message to user
# 3. use assignInNamespace function. This does not care if the function is internal or external.
# message("\nfixing internal function 'fix_negative_branch'\n")
# assignInNamespace("fix_negative_branch", fix_negative_branch, ns = "poppr")
@zkamvar
Copy link
Author

zkamvar commented Feb 13, 2015

To use this script with devtools:

devtools::source_gist("b64078a0d04d2452c905")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment