Created
July 23, 2015 15:11
-
-
Save egouldo/7ad39a8bb62798d2e2bc to your computer and use it in GitHub Desktop.
This function is used to randomly split a Netica casefile into K splits for K-fold cross validation. K is specified by the user. Seed is also set by the user for reproducibility. Path1 and Path2 are character file paths to the directory of the write.paths for the training and holdout case files, respectively. File path arguments should end with …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## FUNCTION - KfoldCASEFILE | |
# splits an R object of a case file into 5 Folds (When K is set to 5) | |
# Outputs the object into five pairs (When K = 5) of training and test case files in .cas format | |
# The input case file object must contain a column called "IDnum" | |
KfoldCASEFILE<-function(CaseFile, seed,k, path1, path2){ | |
# Renumber the IDnum column by order IF the case file is missing a number in a sequence | |
CaseFile<-IDnumOrdered(CaseFile) | |
#1. Setting the folds | |
#2. Join the folds to the casefile object by dplyr::innerjoin (by cols are index in folds and idnum in casefile object) | |
set.seed(seed) | |
Folds<-cvTools::cvFolds(nrow(transect.lookup.table), K = k, type = "random") | |
Folds<-data.frame(Folds$subsets, Folds$which) %>% | |
dplyr::rename(IDnum =Folds.subsets, Fold = Folds.which) %>% | |
dplyr::inner_join(.,CaseFile) | |
#Set up training and holdout vectors | |
splits<-data_frame(c(1:4, 5), | |
c(1:3,5, 4), | |
c(1:2, 4:5, 3), | |
c(1,3:5,2), | |
c(2:5, 1)) | |
colnames(splits)<-paste0("split",c(1:5)) | |
splits$which<-c(rep("training",4),"holdout") | |
# set output file paths | |
training.casefile.out<- path1 | |
holdout.casefile.out<-path2 | |
#Create the case files, 2 for every column in splits except for "which" | |
for(i in seq_len(k)){ | |
vec<-select(splits, i, which) #take the relevent column vectors (split1:5), and the which col | |
split.name<-names(splits)[i] | |
names(vec)[1]<-"Fold" #rename the first col to allow joining | |
training.holdout<-dplyr::inner_join(Folds, vec) # join vec to Folds by Fold (automatically done) | |
#Subset folds twice and store inside objects named after sets 1 and 2 respectively | |
training<-filter(training.holdout, which == "training") %>% | |
select(-Fold,-which) | |
holdout<-filter(training.holdout, which == "holdout") %>% | |
select(-Fold,-which) | |
#Write both of the casefiles | |
training.casefile.path<-paste(training.casefile.out,split.name, "training",".cas", sep="" ) | |
holdout.casefile.path<-paste(holdout.casefile.out,split.name, "holdout",".cas", sep="" ) | |
CaseFileDelimiter("\t") | |
CaseFileMissingCode("*") | |
write.table(training, file = training.casefile.path, append = FALSE, quote = FALSE, sep = CaseFileDelimiter(), | |
eol = "\n", na = CaseFileMissingCode(), dec = ".", row.names = FALSE, | |
col.names = TRUE, qmethod = c("escape", "double"), | |
fileEncoding = "") | |
write.table(holdout, file = holdout.casefile.path, append = FALSE, quote = FALSE, sep = CaseFileDelimiter(), | |
eol = "\n", na = CaseFileMissingCode(), dec = ".", row.names = FALSE, | |
col.names = TRUE, qmethod = c("escape", "double"), | |
fileEncoding = "") | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment