Last active
August 29, 2015 14:10
-
-
Save zkamvar/b64078a0d04d2452c905 to your computer and use it in GitHub Desktop.
Script to fix any bugs that occur in the R code in poppr. This unfortunately does not fix bugs that occur in the C code.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#==============================================================================# | |
# UPDATED: 2015-07-17 | |
# | |
# USAGE/INSTALLATION: | |
# with devtools - devtools::source_gist("b64078a0d04d2452c905") | |
# downloaded - source("poppr_patches.R") | |
# | |
# UPDATES: | |
# Fix read.genalex ( see: https://github.com/grunwaldlab/poppr/issues/58 ) | |
# | |
#==============================================================================# | |
fix_version <- package_version("2.0.2") | |
if (packageVersion("poppr") < fix_version) stop("Please install the latest version of poppr and rerun this script.") | |
if (packageVersion("poppr") > fix_version) stop("You likely have the correct version. If not, install from github.") | |
read.genalex <- function(genalex, ploidy = 2, geo = FALSE, region = FALSE, | |
genclone = TRUE, sep = ",", recode = FALSE){ | |
# The first two lines from a genalex file contain all of the information about | |
# the structure of the file (except for ploidy and geographic info) | |
gencall <- match.call() | |
all.info <- strsplit(readLines(genalex, n = 2), sep) | |
cskip <- ifelse("connection" %in% class(genalex), 0, 2) | |
gena <- utils::read.table(genalex, sep = sep, header = TRUE, skip = cskip, | |
stringsAsFactors = FALSE, check.names = FALSE) | |
num.info <- as.numeric(all.info[[1]]) | |
pop.info <- all.info[[2]][-c(1:3)] | |
num.info <- num.info[!is.na(num.info)] | |
pop.info <- pop.info[!pop.info %in% c("", NA)] | |
nloci <- num.info[1] | |
ninds <- num.info[2] | |
npops <- num.info[3] | |
# Removing all null columns | |
if (any(is.na(gena[1, ]))){ | |
gena <- gena[, !is.na(gena[1, ]), drop = FALSE] | |
} | |
#----------------------------------------------------------------------------# | |
# Checking for extra information such as Regions or XY coordinates | |
#----------------------------------------------------------------------------# | |
# Creating vectors that correspond to the different information fields. If the | |
# regions are true, then the length of the pop.info should be equal to the | |
# number of populations "npop"(npops) plus the number of regions which is the | |
# npop+4th entry in the vector. Note that this strategy will only work if the | |
# name of the first region does not match any of the populations. | |
clm <- ncol(gena) | |
if (region == TRUE && !is.na(num.info[npops + 4]) && length(pop.info) == npops + num.info[npops + 4]){ | |
# Info for the number of columns the loci can take on. | |
loci.adj <- c(nloci, nloci*ploidy) | |
# First question, do you have two or four extra columns? Two extra would | |
# indicate no geographic data. Four extra would indicate geographic data. | |
# Both of these indicate that, while a regional specification exists, a | |
# column indicating the regions was not specified, so it needs to be created | |
if (((clm %in% (loci.adj + 4)) & (geo == TRUE)) | (clm %in% (loci.adj + 2))){ | |
pop.vec <- gena[, 2] | |
ind.vec <- gena[, 1] | |
xy <- gena[, c((clm-1), clm)] | |
region.inds <- ((npops + 5):length(num.info)) # Indices for the regions | |
reg.inds <- num.info[region.inds] # Number of individuals per region | |
reg.names <- all.info[[2]][region.inds] # Names of the regions | |
reg.vec <- rep(reg.names, reg.inds) # Paste into a single vector | |
if (geo == TRUE){ | |
geoinds <- c((clm - 1), clm) | |
xy <- gena[, geoinds] | |
gena <- gena[, -geoinds, drop = FALSE] | |
} else { | |
xy <- NULL | |
} | |
gena <- gena[, c(-1, -2), drop = FALSE] | |
} else { | |
pop.vec <- ifelse(any(gena[, 1] == pop.info[1]), 1, 2) | |
reg.vec <- ifelse(pop.vec == 2, 1, 2) | |
orig.ind.vec <- NULL | |
reg.vec <- gena[, reg.vec] # Regional Vector | |
pop.vec <- gena[, pop.vec] # Population Vector | |
if (geo == TRUE){ | |
geoinds <- c((clm-1), clm) | |
xy <- gena[, geoinds] | |
gena <- gena[, -geoinds, drop = FALSE] | |
} else { | |
xy <- NULL | |
} | |
ind.vec <- gena[, clm] # Individual Vector | |
gena <- gena[, -c(1, 2, clm), drop = FALSE] # removing the non-genotypic columns | |
} | |
} else if (geo == TRUE & length(pop.info) == npops){ | |
# There are no Regions specified, but there are geographic coordinates | |
reg.vec <- NULL | |
pop.vec <- gena[, 2] | |
ind.vec <- gena[, 1] | |
xy <- gena[, c((clm-1), clm)] | |
gena <- gena[, -c(1, 2, (clm-1), clm), drop = FALSE] | |
} else { | |
# There are no Regions or geographic coordinates | |
reg.vec <- NULL | |
pop.vec <- gena[, 2] | |
ind.vec <- gena[, 1] | |
xy <- NULL | |
gena <- gena[, -c(1, 2), drop = FALSE] | |
} | |
#----------------------------------------------------------------------------# | |
# The genotype matrix has been isolated at this point. Now this will | |
# reconstruct the matrix in a way that adegenet likes it. | |
#----------------------------------------------------------------------------# | |
clm <- ncol(gena) | |
gena.mat <- as.matrix(gena) | |
# Checking for greater than haploid data. | |
if (nloci == clm/ploidy & ploidy > 1){ | |
# Missing data in genalex is coded as "0" for non-presence/absence data. | |
# this converts it to "NA" for adegenet. | |
if(any(gena.mat == "0") & ploidy < 3){ | |
gena[gena.mat == "0"] <- NA | |
} | |
type <- 'codom' | |
loci <- which((1:clm) %% ploidy == 1) | |
gena2 <- gena[, loci, drop = FALSE] | |
# Collapsing all alleles into single loci. | |
lapply(loci, function(x) gena2[, ((x-1)/ploidy)+1] <<- | |
poppr:::pop_combiner(gena, hier = x:(x+ploidy-1), sep = "/")) | |
# Treating missing data. | |
gena2[gena2 == paste(rep("0", ploidy), collapse = "/")] <- NA_character_ | |
gena2[gena2 == paste(rep("NA", ploidy), collapse = "/")] <- NA_character_ | |
res.gid <- adegenet::df2genind(gena2, sep="/", ind.names = ind.vec, pop = pop.vec, | |
ploidy = ploidy, type = type) | |
} else if (nloci == clm & all(gena.mat %in% as.integer(-1:1))) { | |
# Checking for AFLP data. | |
# Missing data in genalex is coded as "-1" for presence/absence data. | |
# this converts it to "NA" for adegenet. | |
if(any(gena.mat == -1L)){ | |
gena[gena.mat == -1L] <- NA | |
} | |
type <- 'PA' | |
res.gid <- adegenet::df2genind(gena, ind.names = ind.vec, pop = pop.vec, | |
ploidy = ploidy, type = type, ncode = 1) | |
} else if (nloci == clm & !all(gena.mat %in% as.integer(-1:1))) { | |
# Checking for haploid microsatellite data or SNP data | |
if(any(gena.mat == "0")){ | |
gena[gena.mat == "0"] <- NA | |
} | |
type <- 'codom' | |
res.gid <- adegenet::df2genind(gena, ind.names = ind.vec, pop = pop.vec, | |
ploidy = 1, type = type) | |
} else { | |
weirdomsg <- paste("Something went wrong. Please ensure that your data is", | |
"formatted correctly:\n\n", | |
"Is the ploidy flag set to the maximum ploidy of your data?\n", | |
ifelse(geo, | |
"You set geo = TRUE. Do you have geographic coordinates in the right place?\n", | |
"If you have geographic coordinates: did you set the flag?\n"), | |
ifelse(region, | |
"You set region = TRUE. Do you have regional data in the right place?\n\n", | |
"If you have regional data: did you set the flag?\n\n"), | |
"Otherwise, the problem may lie within the data structure itself.") | |
stop(weirdomsg) | |
} | |
if (any(duplicated(ind.vec))){ | |
# ensuring that all names are unique | |
adegenet::indNames(res.gid) <- paste("ind", 1:length(ind.vec)) | |
res.gid@other[["original_names"]] <- ind.vec | |
} | |
res.gid@call <- gencall | |
ind.vec <- adegenet::indNames(res.gid) | |
names(pop.vec) <- ind.vec | |
# Keep the name if it's a URL | |
if (length(grep("://", genalex)) < 1 & !"connection" %in% class(genalex)){ | |
res.gid@call[2] <- basename(genalex) | |
} | |
if (region){ | |
names(reg.vec) <- ind.vec | |
adegenet::strata(res.gid) <- data.frame(Pop = pop.vec[ind.vec], Region = reg.vec[ind.vec]) | |
} else { | |
adegenet::strata(res.gid) <- data.frame(Pop = pop.vec[ind.vec]) | |
} | |
if (geo){ | |
if (!all(c("x", "y") %in% colnames(xy))){ | |
colnames(xy) <- c("x", "y") | |
} | |
if (nrow(xy) == length(ind.vec)){ | |
rownames(xy) <- ind.vec | |
res.gid@other[["xy"]] <- xy[ind.vec, ] | |
} else { | |
res.gid@other[["xy"]] <- xy | |
} | |
} | |
if (genclone){ | |
res.gid <- poppr::as.genclone(res.gid) | |
} | |
if (recode){ | |
res.gid <- poppr::recode_polyploids(res.gid, newploidy = TRUE) | |
} | |
return(res.gid) | |
} | |
message("\nfixing function 'read.genalex' ...") | |
assignInNamespace("read.genalex", read.genalex, ns = "poppr") | |
message("\nEverything is up to date!") | |
# Procedure: | |
# 1. Write fixed function | |
# 2. Display message to user | |
# 3. use assignInNamespace function. This does not care if the function is internal or external. | |
# message("\nfixing internal function 'fix_negative_branch'\n") | |
# assignInNamespace("fix_negative_branch", fix_negative_branch, ns = "poppr") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To use this script with devtools: