kgturner/Squishr.func.R

## Squishr.func.R
#Functions to squish groups of columns together
#Kathryn Turner Sept 20, 2013
#With help from Andrew MacDonald @polesasunder

#given data that have many NAs and non-overlapping values
#across several columns, with similarly named columns within a group
> head(clim)
      alt_07.tif alt_11.tif alt_12.tif alt_15.tif alt_16.tif alt_17.tif bio1_07.tif bio1_11.tif bio1_12.tif bio1_15.tif bio1_16.tif bio1_17.tif bio10_07.tif
BG001         NA         NA         NA         NA         61         NA          NA          NA          NA          NA         121          NA           NA
CA001         NA         24         NA         NA         NA         NA          NA         100          NA          NA          NA          NA           NA
CA008         NA         NA       1308         NA         NA         NA          NA          NA          39          NA          NA          NA           NA
CA009         NA         NA        599         NA         NA         NA          NA          NA          73          NA          NA          NA           NA
CA010         NA         NA        935         NA         NA         NA          NA          NA          57          NA          NA          NA           NA
GR001         NA         NA         NA         NA          1         NA          NA          NA          NA          NA         159          NA           NA

#Here, all columns that start with "alt" should be squished together,
#and all columns with "bio1" should be squished togther to eliminate NAs

#To squish a single group of columns
#dat is the dataframe, cols is the columns to be squished togther,
#newcol is the name of the new column of concentrated values.
#Returns dataframe with new column only.

squish <- function(dat=dat, cols=cols, newcol="newcol"){

  dat$temp <- NA

  for (i in dat[,cols]){
    ## where is temp NA?
    ss <- which(is.na(dat$temp))
    ## what are the values of i there? put them in temp
    dat$temp[ss] <- i[ss]
  }

  names(dat)[names(dat)=="temp"] <- newcol

  return(subset(dat,select=ncol(dat)))
}

#for one group of columns
clim.2 <- squish(dat=clim, cols=1:6, newcol="alt")
head(clim.2)
> clim.2
       alt
BG001   61
CA001   24
CA008 1308
CA009  599
CA010  935
GR001    1

#To squish multiple groups of colummns across a dataframe.

#List of names to give the new concentrated data columns,
#shared between all columns to be squished.
#dat is the dataframe, split is the character to split
#column names on.

squishsplit <- function(dat, split="_"){
  varnames <- colnames(dat)
  splitvars <- strsplit(varnames, split)
  squishvars <- sapply(splitvars,"[[",i=1)
  return(squishvars)
}

#Breaks up new column names and feeds them to squish
#dat is the dataframe, squishvar is the list of names
#produced by squishsplit.

squishr <- function(dat, squishvar){
  return(squish(dat=dat, cols=grep(squishvar,names(dat)),newcol=squishvar))
}

vars <- squishsplit(clim)

#lapply to each unique value of vars, and cbind into a pretty dataframe
test1 <- do.call(cbind,lapply(unique(vars),squishr,dat=clim))

> head(test1)
       alt bio1 bio10 bio11 bio12 bio13 bio14 bio15 bio16 bio17 bio18 bio19 bio2 bio3 bio4 bio5 bio6 bio7 bio8 bio9
BG001   61  121   211    31   519    60    35    17   160   112   120   135   72   26 7087  260  -11  271   91   62
CA001   24  100   161    41  1035   153    30    51   449   110   110   406   86   39 4700  224    7  217   48  161
CA008 1308   39   141   -61   455    52    28    22   139    94   118   127  116   33 7893  236 -112  348  -54   -6
CA009  599   73   178   -33   501    59    31    21   158   101   123   145  124   34 8165  277  -82  359  -25  130
CA010  935   57   160   -45   561    67    34    21   180   114   136   166  115   33 8008  255  -93  348  -38  154
GR001    1  159   255    65   445    54    17    29   153    67    69   128  112   34 7388  336   10  326  121  250

#120 columns condensed down to 20! No NAs!
	#Functions to squish groups of columns together
	#Kathryn Turner Sept 20, 2013
	#With help from Andrew MacDonald @polesasunder

	#given data that have many NAs and non-overlapping values
	#across several columns, with similarly named columns within a group
	> head(clim)
	alt_07.tif alt_11.tif alt_12.tif alt_15.tif alt_16.tif alt_17.tif bio1_07.tif bio1_11.tif bio1_12.tif bio1_15.tif bio1_16.tif bio1_17.tif bio10_07.tif
	BG001 NA NA NA NA 61 NA NA NA NA NA 121 NA NA
	CA001 NA 24 NA NA NA NA NA 100 NA NA NA NA NA
	CA008 NA NA 1308 NA NA NA NA NA 39 NA NA NA NA
	CA009 NA NA 599 NA NA NA NA NA 73 NA NA NA NA
	CA010 NA NA 935 NA NA NA NA NA 57 NA NA NA NA
	GR001 NA NA NA NA 1 NA NA NA NA NA 159 NA NA

	#Here, all columns that start with "alt" should be squished together,
	#and all columns with "bio1" should be squished togther to eliminate NAs

	#To squish a single group of columns
	#dat is the dataframe, cols is the columns to be squished togther,
	#newcol is the name of the new column of concentrated values.
	#Returns dataframe with new column only.

	squish <- function(dat=dat, cols=cols, newcol="newcol"){

	dat$temp <- NA

	for (i in dat[,cols]){
	## where is temp NA?
	ss <- which(is.na(dat$temp))
	## what are the values of i there? put them in temp
	dat$temp[ss] <- i[ss]
	}

	names(dat)[names(dat)=="temp"] <- newcol

	return(subset(dat,select=ncol(dat)))
	}

	#for one group of columns
	clim.2 <- squish(dat=clim, cols=1:6, newcol="alt")
	head(clim.2)
	> clim.2
	alt
	BG001 61
	CA001 24
	CA008 1308
	CA009 599
	CA010 935
	GR001 1

	#To squish multiple groups of colummns across a dataframe.

	#List of names to give the new concentrated data columns,
	#shared between all columns to be squished.
	#dat is the dataframe, split is the character to split
	#column names on.

	squishsplit <- function(dat, split="_"){
	varnames <- colnames(dat)
	splitvars <- strsplit(varnames, split)
	squishvars <- sapply(splitvars,"[[",i=1)
	return(squishvars)
	}

	#Breaks up new column names and feeds them to squish
	#dat is the dataframe, squishvar is the list of names
	#produced by squishsplit.

	squishr <- function(dat, squishvar){
	return(squish(dat=dat, cols=grep(squishvar,names(dat)),newcol=squishvar))
	}

	vars <- squishsplit(clim)

	#lapply to each unique value of vars, and cbind into a pretty dataframe
	test1 <- do.call(cbind,lapply(unique(vars),squishr,dat=clim))

	> head(test1)
	alt bio1 bio10 bio11 bio12 bio13 bio14 bio15 bio16 bio17 bio18 bio19 bio2 bio3 bio4 bio5 bio6 bio7 bio8 bio9
	BG001 61 121 211 31 519 60 35 17 160 112 120 135 72 26 7087 260 -11 271 91 62
	CA001 24 100 161 41 1035 153 30 51 449 110 110 406 86 39 4700 224 7 217 48 161
	CA008 1308 39 141 -61 455 52 28 22 139 94 118 127 116 33 7893 236 -112 348 -54 -6
	CA009 599 73 178 -33 501 59 31 21 158 101 123 145 124 34 8165 277 -82 359 -25 130
	CA010 935 57 160 -45 561 67 34 21 180 114 136 166 115 33 8008 255 -93 348 -38 154
	GR001 1 159 255 65 445 54 17 29 153 67 69 128 112 34 7388 336 10 326 121 250

	#120 columns condensed down to 20! No NAs!