egouldo/KfoldCASEFILE.Rd

## KfoldCASEFILE.Rd
## FUNCTION - KfoldCASEFILE
# splits an R object of a case file into 5 Folds (When K is set to 5)
# Outputs the object into five pairs (When K = 5) of training and test case files in .cas format
# The input case file object must contain a column called "IDnum"
KfoldCASEFILE<-function(CaseFile, seed,k, path1, path2){
  # Renumber the IDnum column by order IF the case file is missing a number in a sequence
  CaseFile<-IDnumOrdered(CaseFile)
  #1. Setting the folds
  #2. Join the folds to the casefile object by dplyr::innerjoin (by cols are index in folds and idnum in casefile object)
  set.seed(seed)
  Folds<-cvTools::cvFolds(nrow(transect.lookup.table), K = k, type = "random")
  Folds<-data.frame(Folds$subsets, Folds$which) %>%
    dplyr::rename(IDnum =Folds.subsets, Fold = Folds.which) %>%
    dplyr::inner_join(.,CaseFile)

  #Set up training and holdout vectors
  splits<-data_frame(c(1:4, 5),
                     c(1:3,5, 4),
                     c(1:2, 4:5, 3),
                     c(1,3:5,2),
                     c(2:5, 1))
  colnames(splits)<-paste0("split",c(1:5))
  splits$which<-c(rep("training",4),"holdout")
  # set output file paths
  training.casefile.out<- path1
  holdout.casefile.out<-path2
  #Create the case files, 2 for every column in splits except for "which"
  for(i in seq_len(k)){
    vec<-select(splits, i, which) #take the relevent column vectors (split1:5), and the which col
    split.name<-names(splits)[i]
    names(vec)[1]<-"Fold" #rename the first col to allow joining
    training.holdout<-dplyr::inner_join(Folds, vec) # join vec to Folds by Fold (automatically done)
    #Subset folds twice and store inside objects named after sets 1 and 2 respectively
    training<-filter(training.holdout, which == "training") %>%
      select(-Fold,-which)
    holdout<-filter(training.holdout, which == "holdout") %>%
      select(-Fold,-which)
    #Write both of the casefiles
    training.casefile.path<-paste(training.casefile.out,split.name, "training",".cas", sep="" )
    holdout.casefile.path<-paste(holdout.casefile.out,split.name, "holdout",".cas", sep="" )
    CaseFileDelimiter("\t")
    CaseFileMissingCode("*")
    write.table(training, file = training.casefile.path, append = FALSE, quote = FALSE, sep = CaseFileDelimiter(),
                eol = "\n", na = CaseFileMissingCode(), dec = ".", row.names = FALSE,
                col.names = TRUE, qmethod = c("escape", "double"),
                fileEncoding = "")
    write.table(holdout, file = holdout.casefile.path, append = FALSE, quote = FALSE, sep = CaseFileDelimiter(),
                eol = "\n", na = CaseFileMissingCode(), dec = ".", row.names = FALSE,
                col.names = TRUE, qmethod = c("escape", "double"),
                fileEncoding = "")
  }
}
	## FUNCTION - KfoldCASEFILE
	# splits an R object of a case file into 5 Folds (When K is set to 5)
	# Outputs the object into five pairs (When K = 5) of training and test case files in .cas format
	# The input case file object must contain a column called "IDnum"
	KfoldCASEFILE<-function(CaseFile, seed,k, path1, path2){
	# Renumber the IDnum column by order IF the case file is missing a number in a sequence
	CaseFile<-IDnumOrdered(CaseFile)
	#1. Setting the folds
	#2. Join the folds to the casefile object by dplyr::innerjoin (by cols are index in folds and idnum in casefile object)
	set.seed(seed)
	Folds<-cvTools::cvFolds(nrow(transect.lookup.table), K = k, type = "random")
	Folds<-data.frame(Folds$subsets, Folds$which) %>%
	dplyr::rename(IDnum =Folds.subsets, Fold = Folds.which) %>%
	dplyr::inner_join(.,CaseFile)

	#Set up training and holdout vectors
	splits<-data_frame(c(1:4, 5),
	c(1:3,5, 4),
	c(1:2, 4:5, 3),
	c(1,3:5,2),
	c(2:5, 1))
	colnames(splits)<-paste0("split",c(1:5))
	splits$which<-c(rep("training",4),"holdout")
	# set output file paths
	training.casefile.out<- path1
	holdout.casefile.out<-path2
	#Create the case files, 2 for every column in splits except for "which"
	for(i in seq_len(k)){
	vec<-select(splits, i, which) #take the relevent column vectors (split1:5), and the which col
	split.name<-names(splits)[i]
	names(vec)[1]<-"Fold" #rename the first col to allow joining
	training.holdout<-dplyr::inner_join(Folds, vec) # join vec to Folds by Fold (automatically done)
	#Subset folds twice and store inside objects named after sets 1 and 2 respectively
	training<-filter(training.holdout, which == "training") %>%
	select(-Fold,-which)
	holdout<-filter(training.holdout, which == "holdout") %>%
	select(-Fold,-which)
	#Write both of the casefiles
	training.casefile.path<-paste(training.casefile.out,split.name, "training",".cas", sep="" )
	holdout.casefile.path<-paste(holdout.casefile.out,split.name, "holdout",".cas", sep="" )
	CaseFileDelimiter("\t")
	CaseFileMissingCode("*")
	write.table(training, file = training.casefile.path, append = FALSE, quote = FALSE, sep = CaseFileDelimiter(),
	eol = "\n", na = CaseFileMissingCode(), dec = ".", row.names = FALSE,
	col.names = TRUE, qmethod = c("escape", "double"),
	fileEncoding = "")
	write.table(holdout, file = holdout.casefile.path, append = FALSE, quote = FALSE, sep = CaseFileDelimiter(),
	eol = "\n", na = CaseFileMissingCode(), dec = ".", row.names = FALSE,
	col.names = TRUE, qmethod = c("escape", "double"),
	fileEncoding = "")
	}
	}