Skip to content

Instantly share code, notes, and snippets.

@joelkr
Created June 22, 2014 23:11
Show Gist options
  • Save joelkr/503844e2214de02ab91e to your computer and use it in GitHub Desktop.
Save joelkr/503844e2214de02ab91e to your computer and use it in GitHub Desktop.
R data munging file for the UC Irvine Cardiac Arrhythmia data set
# Load the raw arrhythmia data. The file has no header row, and "?" entries
# are read in as NA.
cardiac <- read.csv("arrhythmia.data", header = FALSE, na.strings = "?")
# Name the 280 columns. Columns whose name ends in "_Nom" are the ones the
# numeric subset later in this script leaves out.
# Every name must be unique: "AVL01" was previously misspelled "AVRL1", and the
# second Q/R/S wave group (after "JJ_Wave") used to repeat the names
# "Q_Wave", "R_Wave", "S_Wave" from the first group, which made those three
# columns unreachable by name.
# NOTE(review): the "_Amp" suffix assumes the second group holds wave
# amplitudes, as the UCI attribute list suggests -- confirm.
colnames(cardiac)[1:280] <- c("Age","Gender_Nom","Height","Weight","QRS_Dur",
"P-R_Int","Q-T_Int","T_Int","P_Int","QRS","T","P","QRST","J","Heart_Rate",
"Q_Wave","R_Wave","S_Wave","R_Prime","S_Prime","Int_Def","Rag_R_Nom",
"Diph_R_Nom","Rag_P_Nom","Diph_P_Nom","Rag_T_Nom","Diph_T_Nom",
"DII00","DII01","DII02","DII03","DII04","DII05","DII06","DII07","DII08","DII09","DII10","DII11",
"DIII00","DIII01","DIII02","DIII03","DIII04","DIII05","DIII06","DIII07","DIII08","DIII09","DIII10","DIII11",
"AVR00","AVR01","AVR02","AVR03","AVR04","AVR05","AVR06","AVR07","AVR08","AVR09","AVR10","AVR11",
"AVL00","AVL01","AVL02","AVL03","AVL04","AVL05","AVL06","AVL07","AVL08","AVL09","AVL10","AVL11",
"AVF00","AVF01","AVF02","AVF03","AVF04","AVF05","AVF06","AVF07","AVF08","AVF09","AVF10","AVF11",
"V100","V101","V102","V103","V104","V105","V106","V107","V108","V109","V110","V111",
"V200","V201","V202","V203","V204","V205","V206","V207","V208","V209","V210","V211",
"V300","V301","V302","V303","V304","V305","V306","V307","V308","V309","V310","V311",
"V400","V401","V402","V403","V404","V405","V406","V407","V408","V409","V410","V411",
"V500","V501","V502","V503","V504","V505","V506","V507","V508","V509","V510","V511",
"V600","V601","V602","V603","V604","V605","V606","V607","V608","V609","V610","V611",
"JJ_Wave","Q_Wave_Amp","R_Wave_Amp","S_Wave_Amp","R_Prime_Wave","S_Prime_Wave","P_Wave","T_Wave",
"QRSA","QRSTA",
"DII170","DII171","DII172","DII173","DII174","DII175","DII176","DII177","DII178","DII179",
"DIII180","DIII181","DIII182","DIII183","DIII184","DIII185","DIII186","DIII187","DIII188","DIII189",
"AVR190","AVR191","AVR192","AVR193","AVR194","AVR195","AVR196","AVR197","AVR198","AVR199",
"AVL200","AVL201","AVL202","AVL203","AVL204","AVL205","AVL206","AVL207","AVL208","AVL209",
"AVF210","AVF211","AVF212","AVF213","AVF214","AVF215","AVF216","AVF217","AVF218","AVF219",
"V1220","V1221","V1222","V1223","V1224","V1225","V1226","V1227","V1228","V1229",
"V2230","V2231","V2232","V2233","V2234","V2235","V2236","V2237","V2238","V2239",
"V3240","V3241","V3242","V3243","V3244","V3245","V3246","V3247","V3248","V3249",
"V4250","V4251","V4252","V4253","V4254","V4255","V4256","V4257","V4258","V4259",
"V5260","V5261","V5262","V5263","V5264","V5265","V5266","V5267","V5268","V5269",
"V6270","V6271","V6272","V6273","V6274","V6275","V6276","V6277","V6278","V6279",
"Class_Nom"
)
# Numeric subset: every column except those whose name ends in "_Nom".
# The pattern is an anchored regular expression (the old "*Nom" was a glob,
# and a leading "*" is not a well-defined regex). drop = FALSE keeps the
# result a data frame even if only one column were to remain.
cardiac_num <- cardiac[, grep("_Nom$", colnames(cardiac), invert = TRUE),
                       drop = FALSE]
# Some columns have a max and min of zero, so they are probably useless.
# It is possible that some groups of columns are actually time plots of
# data that pass through zero. Need to plot before dropping them.
#cardiac_num <- cardiac_num[, apply(cardiac_num, 2, function(x) ! all(x == 0))]
# Dropping every row that contains an NA reduces the numeric data from 452
# to 68 cases, so we must find another way to handle missing data.
#cardiac_num <- na.omit(cardiac_num)
# Split the rows into a training set (two thirds, rounded down) and a test
# set (the remaining rows). The seed is fixed so that the same split is drawn
# on every run.
set.seed(1234)
# seq_len() is safe for a zero-row table, where 1:nrow() would give c(1, 0).
index <- seq_len(nrow(cardiac))
# sample.int() draws the same row numbers that sample(index, size) does for a
# multi-row table, without sample()'s special treatment of a length-one vector.
trainindex <- sample.int(length(index), trunc(2 * length(index) / 3))
cardiac_train <- cardiac[trainindex, ]
# setdiff() instead of a negative index: cardiac[-trainindex, ] would select
# no rows at all if trainindex were empty.
cardiac_test <- cardiac[setdiff(index, trainindex), ]
# Save the unmodified data (as CSV and as compressed RData) and the train and
# test splits (as CSV). Row names are left out of every CSV file.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
write.table(cardiac, "cardiac.csv", sep = ",", row.names = FALSE)
save(cardiac, file = "cardiac.RData", compress = TRUE)
write.table(cardiac_train, "cardiac_train.csv", sep = ",", row.names = FALSE)
write.table(cardiac_test, "cardiac_test.csv", sep = ",", row.names = FALSE)
# Separate train and test numeric data
cardiac_num_train <- cardiac_num[trainindex, ]
cardiac_num_test <- cardiac_num[-trainindex, ]
# Next we construct, for each set, a matrix of 0's and 1's with 1's at points
# where we have data and 0's where we have NA's.
# For the method in Coursera Machine Learning, we will need both
# train and test sets to have a matrix like this.
# The masks must be built BEFORE the NA's are overwritten below. Previously
# the training mask was rebuilt after the training NA's had already been
# replaced by 0, which left Rtrain filled entirely with 1's.
# r is scratch storage; it is removed again at the end of the script.
r <- as.numeric(!is.na(cardiac_num_train))
Rtrain <- matrix(r, nrow = nrow(cardiac_num_train),
                 ncol = ncol(cardiac_num_train))
r <- as.numeric(!is.na(cardiac_num_test))
Rtest <- matrix(r, nrow = nrow(cardiac_num_test),
                ncol = ncol(cardiac_num_test))
# Replace NA's with 0 for numeric data, now that the masks record where the
# missing values were.
cardiac_num_train[is.na(cardiac_num_train)] <- 0
cardiac_num_test[is.na(cardiac_num_test)] <- 0
# Write the numeric train and test tables as CSV, without row names.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
write.table(cardiac_num_train, "cardiac_num_train.csv", sep = ",",
            row.names = FALSE)
write.table(cardiac_num_test, "cardiac_num_test.csv", sep = ",",
            row.names = FALSE)
# Save the numeric sets and their 0/1 matrices in R format
save(cardiac_num_train, cardiac_num_test, Rtrain, Rtest,
     file = "cardiacSetup.rda")
# Remove the full data, the test data, the split indices and the scratch
# vector from the environment. cardiac_train, cardiac_num_train, Rtrain and
# Rtest are kept.
rm(cardiac, cardiac_num, cardiac_test, cardiac_num_test, trainindex, index, r)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment