zkamvar/poppr_patches.R

## poppr_patches.R
#==============================================================================#
# UPDATED: 2015-07-17
#
# USAGE/INSTALLATION:
#   with devtools - devtools::source_gist("b64078a0d04d2452c905")
#   downloaded    - source("poppr_patches.R")
#
# UPDATES:
#   Fix read.genalex ( see: https://github.com/grunwaldlab/poppr/issues/58 )
#
#==============================================================================#
fix_version <- package_version("2.0.2")
if (packageVersion("poppr") < fix_version) stop("Please install the latest version of poppr and rerun this script.")
if (packageVersion("poppr") > fix_version) stop("You likely have the correct version. If not, install from github.")


read.genalex <- function(genalex, ploidy = 2, geo = FALSE, region = FALSE,
                         genclone = TRUE, sep = ",", recode = FALSE){
  # The first two lines from a genalex file contain all of the information about
  # the structure of the file (except for ploidy and geographic info)
  gencall  <- match.call()

  all.info <- strsplit(readLines(genalex, n = 2), sep)
  cskip    <- ifelse("connection" %in% class(genalex), 0, 2)
  gena     <- utils::read.table(genalex, sep = sep, header = TRUE, skip = cskip,
                                stringsAsFactors = FALSE, check.names = FALSE)
  num.info <- as.numeric(all.info[[1]])
  pop.info <- all.info[[2]][-c(1:3)]
  num.info <- num.info[!is.na(num.info)]
  pop.info <- pop.info[!pop.info %in% c("", NA)]
  nloci    <- num.info[1]
  ninds    <- num.info[2]
  npops    <- num.info[3]

  # Removing all null columns
  if (any(is.na(gena[1, ]))){
    gena <- gena[, !is.na(gena[1, ]), drop = FALSE]
  }

  #----------------------------------------------------------------------------#
  # Checking for extra information such as Regions or XY coordinates
  #----------------------------------------------------------------------------#

  # Creating vectors that correspond to the different information fields. If the
  # regions are true, then the length of the pop.info should be equal to the
  # number of populations "npop"(npops) plus the number of regions which is the
  # npop+4th entry in the vector. Note that this strategy will only work if the
  # name of the first region does not match any of the populations.

  clm <- ncol(gena)

  if (region == TRUE && !is.na(num.info[npops + 4]) && length(pop.info) == npops + num.info[npops + 4]){
    # Info for the number of columns the loci can take on.
    loci.adj <- c(nloci, nloci*ploidy)

    # First question, do you have two or four extra columns? Two extra would
    # indicate no geographic data. Four extra would indicate geographic data.
    # Both of these indicate that, while a regional specification exists, a
    # column indicating the regions was not specified, so it needs to be created
    if (((clm %in% (loci.adj + 4)) & (geo == TRUE)) | (clm %in% (loci.adj + 2))){

      pop.vec <- gena[, 2]
      ind.vec <- gena[, 1]
      xy      <- gena[, c((clm-1), clm)]
      region.inds <- ((npops + 5):length(num.info)) # Indices for the regions
      reg.inds    <- num.info[region.inds] # Number of individuals per region
      reg.names   <- all.info[[2]][region.inds] # Names of the regions
      reg.vec     <- rep(reg.names, reg.inds) # Paste into a single vector
      if (geo == TRUE){
        geoinds <- c((clm - 1), clm)
        xy      <- gena[, geoinds]
        gena    <- gena[, -geoinds, drop = FALSE]
      } else {
        xy <- NULL
      }
      gena <- gena[, c(-1, -2), drop = FALSE]

    } else {

      pop.vec      <- ifelse(any(gena[, 1] == pop.info[1]), 1, 2)
      reg.vec      <- ifelse(pop.vec == 2, 1, 2)
      orig.ind.vec <- NULL
      reg.vec      <- gena[, reg.vec] # Regional Vector
      pop.vec      <- gena[, pop.vec] # Population Vector
      if (geo == TRUE){
        geoinds <- c((clm-1), clm)
        xy      <- gena[, geoinds]
        gena    <- gena[, -geoinds, drop = FALSE]
      } else {
        xy <- NULL
      }
      ind.vec <- gena[, clm] # Individual Vector
      gena    <- gena[, -c(1, 2, clm), drop = FALSE] # removing the non-genotypic columns
    }
  } else if (geo == TRUE & length(pop.info) == npops){
    # There are no Regions specified, but there are geographic coordinates
    reg.vec <- NULL
    pop.vec <- gena[, 2]
    ind.vec <- gena[, 1]
    xy      <- gena[, c((clm-1), clm)]
    gena    <- gena[, -c(1, 2, (clm-1), clm), drop = FALSE]
  } else {
    # There are no Regions or geographic coordinates
    reg.vec <- NULL
    pop.vec <- gena[, 2]
    ind.vec <- gena[, 1]
    xy <- NULL
    gena <- gena[, -c(1, 2), drop = FALSE]
  }

  #----------------------------------------------------------------------------#
  # The genotype matrix has been isolated at this point. Now this will
  # reconstruct the matrix in a way that adegenet likes it.
  #----------------------------------------------------------------------------#

  clm      <- ncol(gena)
  gena.mat <- as.matrix(gena)
  # Checking for greater than haploid data.
  if (nloci == clm/ploidy & ploidy > 1){
    # Missing data in genalex is coded as "0" for non-presence/absence data.
    # this converts it to "NA" for adegenet.
    if(any(gena.mat == "0") & ploidy < 3){
      gena[gena.mat == "0"] <- NA
    }
    type  <- 'codom'
    loci  <- which((1:clm) %% ploidy == 1)
    gena2 <- gena[, loci, drop = FALSE]

    # Collapsing all alleles into single loci.
    lapply(loci, function(x) gena2[, ((x-1)/ploidy)+1] <<-
             poppr:::pop_combiner(gena, hier = x:(x+ploidy-1), sep = "/"))

    # Treating missing data.
    gena2[gena2 == paste(rep("0", ploidy), collapse = "/")] <- NA_character_
    gena2[gena2 == paste(rep("NA", ploidy), collapse = "/")] <- NA_character_
    res.gid <- adegenet::df2genind(gena2, sep="/", ind.names = ind.vec, pop = pop.vec,
                         ploidy = ploidy, type = type)
  } else if (nloci == clm & all(gena.mat %in% as.integer(-1:1))) {
    # Checking for AFLP data.
    # Missing data in genalex is coded as "-1" for presence/absence data.
    # this converts it to "NA" for adegenet.
    if(any(gena.mat == -1L)){
      gena[gena.mat == -1L] <- NA
    }
    type <- 'PA'
    res.gid <- adegenet::df2genind(gena, ind.names = ind.vec, pop = pop.vec,
                         ploidy = ploidy, type = type, ncode = 1)
  } else if (nloci == clm & !all(gena.mat %in% as.integer(-1:1))) {
    # Checking for haploid microsatellite data or SNP data
    if(any(gena.mat == "0")){
      gena[gena.mat == "0"] <- NA
    }
    type    <- 'codom'
    res.gid <- adegenet::df2genind(gena, ind.names = ind.vec, pop = pop.vec,
                         ploidy = 1, type = type)
  } else {
    weirdomsg <- paste("Something went wrong. Please ensure that your data is",
                       "formatted correctly:\n\n",
                       "Is the ploidy flag set to the maximum ploidy of your data?\n",
                       ifelse(geo,
                              "You set geo = TRUE. Do you have geographic coordinates in the right place?\n",
                              "If you have geographic coordinates: did you set the flag?\n"),
                       ifelse(region,
                              "You set region = TRUE. Do you have regional data in the right place?\n\n",
                              "If you have regional data: did you set the flag?\n\n"),
                       "Otherwise, the problem may lie within the data structure itself.")
    stop(weirdomsg)
  }
  if (any(duplicated(ind.vec))){
    # ensuring that all names are unique
    adegenet::indNames(res.gid) <- paste("ind", 1:length(ind.vec))
    res.gid@other[["original_names"]] <- ind.vec
  }

  res.gid@call <- gencall
  ind.vec <- adegenet::indNames(res.gid)
  names(pop.vec) <- ind.vec

  # Keep the name if it's a URL
  if (length(grep("://", genalex)) < 1 & !"connection" %in% class(genalex)){
    res.gid@call[2] <- basename(genalex)
  }
  if (region){
    names(reg.vec) <- ind.vec
    adegenet::strata(res.gid) <- data.frame(Pop = pop.vec[ind.vec], Region = reg.vec[ind.vec])
  } else {
    adegenet::strata(res.gid) <- data.frame(Pop = pop.vec[ind.vec])
  }
  if (geo){
    if (!all(c("x", "y") %in% colnames(xy))){
      colnames(xy) <- c("x", "y")
    }
    if (nrow(xy) == length(ind.vec)){
      rownames(xy) <- ind.vec
      res.gid@other[["xy"]] <- xy[ind.vec, ]
    } else {
      res.gid@other[["xy"]] <- xy
    }

  }
  if (genclone){
    res.gid <- poppr::as.genclone(res.gid)
  }
  if (recode){
    res.gid <- poppr::recode_polyploids(res.gid, newploidy = TRUE)
  }
  return(res.gid)
}

message("\nfixing function 'read.genalex' ...")
assignInNamespace("read.genalex", read.genalex, ns = "poppr")

message("\nEverything is up to date!")

# Procedure:
# 1. Write fixed function
# 2. Display message to user
# 3. use assignInNamespace function. This does not care if the function is internal or external.

# message("\nfixing internal function 'fix_negative_branch'\n")
# assignInNamespace("fix_negative_branch", fix_negative_branch, ns = "poppr")
	#==============================================================================#
	# UPDATED: 2015-07-17
	#
	# USAGE/INSTALLATION:
	# with devtools - devtools::source_gist("b64078a0d04d2452c905")
	# downloaded - source("poppr_patches.R")
	#
	# UPDATES:
	# Fix read.genalex ( see: https://github.com/grunwaldlab/poppr/issues/58 )
	#
	#==============================================================================#
	fix_version <- package_version("2.0.2")
	if (packageVersion("poppr") < fix_version) stop("Please install the latest version of poppr and rerun this script.")
	if (packageVersion("poppr") > fix_version) stop("You likely have the correct version. If not, install from github.")


	read.genalex <- function(genalex, ploidy = 2, geo = FALSE, region = FALSE,
	genclone = TRUE, sep = ",", recode = FALSE){
	# The first two lines from a genalex file contain all of the information about
	# the structure of the file (except for ploidy and geographic info)
	gencall <- match.call()

	all.info <- strsplit(readLines(genalex, n = 2), sep)
	cskip <- ifelse("connection" %in% class(genalex), 0, 2)
	gena <- utils::read.table(genalex, sep = sep, header = TRUE, skip = cskip,
	stringsAsFactors = FALSE, check.names = FALSE)
	num.info <- as.numeric(all.info[[1]])
	pop.info <- all.info[[2]][-c(1:3)]
	num.info <- num.info[!is.na(num.info)]
	pop.info <- pop.info[!pop.info %in% c("", NA)]
	nloci <- num.info[1]
	ninds <- num.info[2]
	npops <- num.info[3]

	# Removing all null columns
	if (any(is.na(gena[1, ]))){
	gena <- gena[, !is.na(gena[1, ]), drop = FALSE]
	}

	#----------------------------------------------------------------------------#
	# Checking for extra information such as Regions or XY coordinates
	#----------------------------------------------------------------------------#

	# Creating vectors that correspond to the different information fields. If the
	# regions are true, then the length of the pop.info should be equal to the
	# number of populations "npop"(npops) plus the number of regions which is the
	# npop+4th entry in the vector. Note that this strategy will only work if the
	# name of the first region does not match any of the populations.

	clm <- ncol(gena)

	if (region == TRUE && !is.na(num.info[npops + 4]) && length(pop.info) == npops + num.info[npops + 4]){
	# Info for the number of columns the loci can take on.
	loci.adj <- c(nloci, nloci*ploidy)

	# First question, do you have two or four extra columns? Two extra would
	# indicate no geographic data. Four extra would indicate geographic data.
	# Both of these indicate that, while a regional specification exists, a
	# column indicating the regions was not specified, so it needs to be created
	if (((clm %in% (loci.adj + 4)) & (geo == TRUE)) \| (clm %in% (loci.adj + 2))){

	pop.vec <- gena[, 2]
	ind.vec <- gena[, 1]
	xy <- gena[, c((clm-1), clm)]
	region.inds <- ((npops + 5):length(num.info)) # Indices for the regions
	reg.inds <- num.info[region.inds] # Number of individuals per region
	reg.names <- all.info[[2]][region.inds] # Names of the regions
	reg.vec <- rep(reg.names, reg.inds) # Paste into a single vector
	if (geo == TRUE){
	geoinds <- c((clm - 1), clm)
	xy <- gena[, geoinds]
	gena <- gena[, -geoinds, drop = FALSE]
	} else {
	xy <- NULL
	}
	gena <- gena[, c(-1, -2), drop = FALSE]

	} else {

	pop.vec <- ifelse(any(gena[, 1] == pop.info[1]), 1, 2)
	reg.vec <- ifelse(pop.vec == 2, 1, 2)
	orig.ind.vec <- NULL
	reg.vec <- gena[, reg.vec] # Regional Vector
	pop.vec <- gena[, pop.vec] # Population Vector
	if (geo == TRUE){
	geoinds <- c((clm-1), clm)
	xy <- gena[, geoinds]
	gena <- gena[, -geoinds, drop = FALSE]
	} else {
	xy <- NULL
	}
	ind.vec <- gena[, clm] # Individual Vector
	gena <- gena[, -c(1, 2, clm), drop = FALSE] # removing the non-genotypic columns
	}
	} else if (geo == TRUE & length(pop.info) == npops){
	# There are no Regions specified, but there are geographic coordinates
	reg.vec <- NULL
	pop.vec <- gena[, 2]
	ind.vec <- gena[, 1]
	xy <- gena[, c((clm-1), clm)]
	gena <- gena[, -c(1, 2, (clm-1), clm), drop = FALSE]
	} else {
	# There are no Regions or geographic coordinates
	reg.vec <- NULL
	pop.vec <- gena[, 2]
	ind.vec <- gena[, 1]
	xy <- NULL
	gena <- gena[, -c(1, 2), drop = FALSE]
	}

	#----------------------------------------------------------------------------#
	# The genotype matrix has been isolated at this point. Now this will
	# reconstruct the matrix in a way that adegenet likes it.
	#----------------------------------------------------------------------------#

	clm <- ncol(gena)
	gena.mat <- as.matrix(gena)
	# Checking for greater than haploid data.
	if (nloci == clm/ploidy & ploidy > 1){
	# Missing data in genalex is coded as "0" for non-presence/absence data.
	# this converts it to "NA" for adegenet.
	if(any(gena.mat == "0") & ploidy < 3){
	gena[gena.mat == "0"] <- NA
	}
	type <- 'codom'
	loci <- which((1:clm) %% ploidy == 1)
	gena2 <- gena[, loci, drop = FALSE]

	# Collapsing all alleles into single loci.
	lapply(loci, function(x) gena2[, ((x-1)/ploidy)+1] <<-
	poppr:::pop_combiner(gena, hier = x:(x+ploidy-1), sep = "/"))

	# Treating missing data.
	gena2[gena2 == paste(rep("0", ploidy), collapse = "/")] <- NA_character_
	gena2[gena2 == paste(rep("NA", ploidy), collapse = "/")] <- NA_character_
	res.gid <- adegenet::df2genind(gena2, sep="/", ind.names = ind.vec, pop = pop.vec,
	ploidy = ploidy, type = type)
	} else if (nloci == clm & all(gena.mat %in% as.integer(-1:1))) {
	# Checking for AFLP data.
	# Missing data in genalex is coded as "-1" for presence/absence data.
	# this converts it to "NA" for adegenet.
	if(any(gena.mat == -1L)){
	gena[gena.mat == -1L] <- NA
	}
	type <- 'PA'
	res.gid <- adegenet::df2genind(gena, ind.names = ind.vec, pop = pop.vec,
	ploidy = ploidy, type = type, ncode = 1)
	} else if (nloci == clm & !all(gena.mat %in% as.integer(-1:1))) {
	# Checking for haploid microsatellite data or SNP data
	if(any(gena.mat == "0")){
	gena[gena.mat == "0"] <- NA
	}
	type <- 'codom'
	res.gid <- adegenet::df2genind(gena, ind.names = ind.vec, pop = pop.vec,
	ploidy = 1, type = type)
	} else {
	weirdomsg <- paste("Something went wrong. Please ensure that your data is",
	"formatted correctly:\n\n",
	"Is the ploidy flag set to the maximum ploidy of your data?\n",
	ifelse(geo,
	"You set geo = TRUE. Do you have geographic coordinates in the right place?\n",
	"If you have geographic coordinates: did you set the flag?\n"),
	ifelse(region,
	"You set region = TRUE. Do you have regional data in the right place?\n\n",
	"If you have regional data: did you set the flag?\n\n"),
	"Otherwise, the problem may lie within the data structure itself.")
	stop(weirdomsg)
	}
	if (any(duplicated(ind.vec))){
	# ensuring that all names are unique
	adegenet::indNames(res.gid) <- paste("ind", 1:length(ind.vec))
	res.gid@other[["original_names"]] <- ind.vec
	}

	res.gid@call <- gencall
	ind.vec <- adegenet::indNames(res.gid)
	names(pop.vec) <- ind.vec

	# Keep the name if it's a URL
	if (length(grep("://", genalex)) < 1 & !"connection" %in% class(genalex)){
	res.gid@call[2] <- basename(genalex)
	}
	if (region){
	names(reg.vec) <- ind.vec
	adegenet::strata(res.gid) <- data.frame(Pop = pop.vec[ind.vec], Region = reg.vec[ind.vec])
	} else {
	adegenet::strata(res.gid) <- data.frame(Pop = pop.vec[ind.vec])
	}
	if (geo){
	if (!all(c("x", "y") %in% colnames(xy))){
	colnames(xy) <- c("x", "y")
	}
	if (nrow(xy) == length(ind.vec)){
	rownames(xy) <- ind.vec
	res.gid@other[["xy"]] <- xy[ind.vec, ]
	} else {
	res.gid@other[["xy"]] <- xy
	}

	}
	if (genclone){
	res.gid <- poppr::as.genclone(res.gid)
	}
	if (recode){
	res.gid <- poppr::recode_polyploids(res.gid, newploidy = TRUE)
	}
	return(res.gid)
	}

	message("\nfixing function 'read.genalex' ...")
	assignInNamespace("read.genalex", read.genalex, ns = "poppr")

	message("\nEverything is up to date!")

	# Procedure:
	# 1. Write fixed function
	# 2. Display message to user
	# 3. use assignInNamespace function. This does not care if the function is internal or external.

	# message("\nfixing internal function 'fix_negative_branch'\n")
	# assignInNamespace("fix_negative_branch", fix_negative_branch, ns = "poppr")