Problem 1
Using the iris dataset as input, write a function that computes and returns the mean, std.dev and group size of Sepal.Length for each Species split by Petal.Length > CRITICAL_LENGTH where CRITICAL_LENGTH is the second input variable of you function. The output should be a data.frame and the column names should be meaningful. You must use split and apply (or sapply/lappy). No for-loops allowed. Use as few temporary variable as possible.
p1f.a <- function(x, crit) {
out = do.call(rbind,lapply(split(x, x$Species), function(xx) {
do.call(rbind,lapply(split(xx, factor(xx$Petal.Length>crit,levels=c(F,T))),function(xxx) {
data.frame(
mean(xxx$Sepal.Length),
sd(xxx$Sepal.Length)
)
}))
}))
out = cbind(do.call(rbind,strsplit(rownames(out),"[.]")), out)
colnames(out) <- c("Species",paste("Petal.Length",crit,sep=">"),"Sepal.Length.Mean","Sepal.Length.SD")
rownames(out) <- NULL
out
}
p1f.a(iris,1.3)
## Species Petal.Length>1.3 Sepal.Length.Mean Sepal.Length.SD
## 1 setosa FALSE 4.872727 0.5081159
## 2 setosa TRUE 5.043590 0.2927140
## 3 versicolor FALSE NaN NA
## 4 versicolor TRUE 5.936000 0.5161711
## 5 virginica FALSE NaN NA
## 6 virginica TRUE 6.588000 0.6358796
p1f.b <- function(x, crit) {
x$tmpFactor = paste(x$Species,x$Petal.Length>crit,sep=".")
x$tmpFactor = factor(x$tmpFactor,levels=unlist(sapply(unique(x$Species),function(xx,z){lapply(xx,paste,z,sep=".")},c(F,T))))
out = do.call(rbind,lapply(split(x,x$tmpFactor),function(xx) {
data.frame(
mean(xx$Sepal.Length),
sd(xx$Sepal.Length)
)
}))
out = cbind(do.call(rbind,strsplit(rownames(out),"[.]")), out)
colnames(out) <- c("Species",paste("Petal.Length",crit,sep=">"),"Sepal.Length.Mean","Sepal.Length.SD")
rownames(out) <- NULL
out
}
p1f.b(iris,1.3)
## Species Petal.Length>1.3 Sepal.Length.Mean Sepal.Length.SD
## 1 setosa FALSE 4.872727 0.5081159
## 2 setosa TRUE 5.043590 0.2927140
## 3 versicolor FALSE NaN NA
## 4 versicolor TRUE 5.936000 0.5161711
## 5 virginica FALSE NaN NA
## 6 virginica TRUE 6.588000 0.6358796
Problem 2
Rewrite your function from Problem 1 using aggregate instead of split and apply. Use as few temporary variable as possible.
p2f <- function(x, crit) {
out = aggregate(Sepal.Length~Species+factor(Petal.Length>crit),x,function(xx){
cbind(mean(xx),sd(xx))
})
out = cbind(out[,1:2],out[,3])
colnames(out) <- c("Species",paste("Petal.Length",crit,sep=">"),"Sepal.Length.Mean","Sepal.Length.SD")
tmp = data.frame(rep(unique(as.character(x$Species)),each=2),rep(c(F,T),length(unique(x$Species))))
colnames(tmp) <- colnames(out)[1:2]
merge(out,tmp,all=T)
}
p2f(iris,1.3)
## Species Petal.Length>1.3 Sepal.Length.Mean Sepal.Length.SD
## 1 setosa FALSE 4.872727 0.5081159
## 2 setosa TRUE 5.043590 0.2927140
## 3 versicolor FALSE NA NA
## 4 versicolor TRUE 5.936000 0.5161711
## 5 virginica FALSE NA NA
## 6 virginica TRUE 6.588000 0.6358796
Problem 3
Rewrite your function from Problem 1 using ddply instead of split and apply. Use as few temporary variable as possible.
p3f <- function(x, crit) {
require(plyr)
out = ddply(x,.(Species,Petal.Length>crit),function(xx) {
data.frame(mean(xx$Sepal.Length),sd(xx$Sepal.Length))
},.drop=F)
colnames(out) <- c("Species",paste("Petal.Length",crit,sep=">"),"Sepal.Length.Mean","Sepal.Length.SD")
out
}
p3f(iris,1.3)
## Loading required package: plyr
## Species Petal.Length>1.3 Sepal.Length.Mean Sepal.Length.SD
## 1 setosa FALSE 4.872727 0.5081159
## 2 setosa TRUE 5.043590 0.2927140
## 3 versicolor FALSE NaN NA
## 4 versicolor TRUE 5.936000 0.5161711
## 5 virginica FALSE NaN NA
## 6 virginica TRUE 6.588000 0.6358796
Problem 4
Using the mtcars dataset as input, write a function that returns the mtcars dataset with new columns added. The row value for each new column should contain average mpg for all cars that share the same number of transmission type, cylindars, gears, and carbs as the car in current row.
p4f <- function(x) {
origcols = colnames(x)
require(plyr)
l_ply(c("cyl","am","gear","carb"), function(xx) {
zz = ddply(x, xx, function(y) {
mean(y$mpg)
})
colnames(zz) = c(xx,paste(xx,"class","average","mpg",sep="."))
x <<- merge(zz,x)
})
newcols = colnames(x)[!(colnames(x) %in% origcols)]
x[,c(origcols,newcols)]
}
p4f(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 2 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 3 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 5 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 6 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 7 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 8 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 9 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 10 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 11 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 12 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 13 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 14 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 15 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 16 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 17 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 18 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 19 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 20 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 21 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 22 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 23 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 24 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 25 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 26 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 27 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 28 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 29 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 30 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 31 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 32 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## carb.class.average.mpg gear.class.average.mpg am.class.average.mpg
## 1 25.34286 16.10667 17.14737
## 2 25.34286 16.10667 17.14737
## 3 25.34286 24.53333 24.39231
## 4 25.34286 16.10667 17.14737
## 5 25.34286 24.53333 24.39231
## 6 25.34286 24.53333 24.39231
## 7 25.34286 24.53333 24.39231
## 8 22.40000 16.10667 17.14737
## 9 22.40000 16.10667 17.14737
## 10 22.40000 16.10667 17.14737
## 11 22.40000 24.53333 24.39231
## 12 22.40000 24.53333 17.14737
## 13 22.40000 16.10667 17.14737
## 14 22.40000 24.53333 17.14737
## 15 22.40000 21.38000 24.39231
## 16 22.40000 24.53333 24.39231
## 17 22.40000 21.38000 24.39231
## 18 16.30000 16.10667 17.14737
## 19 16.30000 16.10667 17.14737
## 20 16.30000 16.10667 17.14737
## 21 15.79000 16.10667 17.14737
## 22 15.79000 16.10667 17.14737
## 23 15.79000 16.10667 17.14737
## 24 15.79000 24.53333 17.14737
## 25 15.79000 16.10667 17.14737
## 26 15.79000 16.10667 17.14737
## 27 15.79000 24.53333 24.39231
## 28 15.79000 24.53333 24.39231
## 29 15.79000 24.53333 17.14737
## 30 15.79000 21.38000 24.39231
## 31 19.70000 21.38000 24.39231
## 32 15.00000 21.38000 24.39231
## cyl.class.average.mpg
## 1 26.66364
## 2 19.74286
## 3 26.66364
## 4 19.74286
## 5 26.66364
## 6 26.66364
## 7 26.66364
## 8 15.10000
## 9 15.10000
## 10 15.10000
## 11 26.66364
## 12 26.66364
## 13 15.10000
## 14 26.66364
## 15 26.66364
## 16 26.66364
## 17 26.66364
## 18 15.10000
## 19 15.10000
## 20 15.10000
## 21 15.10000
## 22 15.10000
## 23 15.10000
## 24 19.74286
## 25 15.10000
## 26 15.10000
## 27 19.74286
## 28 19.74286
## 29 19.74286
## 30 15.10000
## 31 19.74286
## 32 15.10000