RyanHope/split_apply_combine_answers.md

## split_apply_combine_answers.md

      
    Raw
  

              split_apply_combine_answers.md
            
          
    Assignments

Problem 1
Using the iris dataset as input, write a function that computes and
returns the mean, std.dev and group size of Sepal.Length for each
Species split by Petal.Length > CRITICAL_LENGTH where
CRITICAL_LENGTH is the second input variable of you function. The
output should be a data.frame and the column names should be meaningful.
You must use split and apply (or sapply/lappy). No for-loops allowed.
Use as few temporary variable as possible.
p1f.a <- function(x, crit) {
  out = do.call(rbind,lapply(split(x, x$Species), function(xx) {
    do.call(rbind,lapply(split(xx, factor(xx$Petal.Length>crit,levels=c(F,T))),function(xxx) {
      data.frame(
        mean(xxx$Sepal.Length),
        sd(xxx$Sepal.Length)
      )
    }))
  }))
  out = cbind(do.call(rbind,strsplit(rownames(out),"[.]")), out)
  colnames(out) <- c("Species",paste("Petal.Length",crit,sep=">"),"Sepal.Length.Mean","Sepal.Length.SD")
  rownames(out) <- NULL
  out
}
p1f.a(iris,1.3)

##      Species Petal.Length>1.3 Sepal.Length.Mean Sepal.Length.SD
## 1     setosa            FALSE          4.872727       0.5081159
## 2     setosa             TRUE          5.043590       0.2927140
## 3 versicolor            FALSE               NaN              NA
## 4 versicolor             TRUE          5.936000       0.5161711
## 5  virginica            FALSE               NaN              NA
## 6  virginica             TRUE          6.588000       0.6358796

p1f.b <- function(x, crit) {
  x$tmpFactor = paste(x$Species,x$Petal.Length>crit,sep=".")
  x$tmpFactor = factor(x$tmpFactor,levels=unlist(sapply(unique(x$Species),function(xx,z){lapply(xx,paste,z,sep=".")},c(F,T))))
  out = do.call(rbind,lapply(split(x,x$tmpFactor),function(xx) {
    data.frame(
      mean(xx$Sepal.Length),
      sd(xx$Sepal.Length)
    )
  }))
  out = cbind(do.call(rbind,strsplit(rownames(out),"[.]")), out)
  colnames(out) <- c("Species",paste("Petal.Length",crit,sep=">"),"Sepal.Length.Mean","Sepal.Length.SD")
  rownames(out) <- NULL
  out
}
p1f.b(iris,1.3)

##      Species Petal.Length>1.3 Sepal.Length.Mean Sepal.Length.SD
## 1     setosa            FALSE          4.872727       0.5081159
## 2     setosa             TRUE          5.043590       0.2927140
## 3 versicolor            FALSE               NaN              NA
## 4 versicolor             TRUE          5.936000       0.5161711
## 5  virginica            FALSE               NaN              NA
## 6  virginica             TRUE          6.588000       0.6358796

Problem 2
Rewrite your function from Problem 1 using aggregate instead of
split and apply. Use as few temporary variable as possible.
p2f <- function(x, crit) {
  out = aggregate(Sepal.Length~Species+factor(Petal.Length>crit),x,function(xx){
    cbind(mean(xx),sd(xx))
  })
  out = cbind(out[,1:2],out[,3])
  colnames(out) <- c("Species",paste("Petal.Length",crit,sep=">"),"Sepal.Length.Mean","Sepal.Length.SD")
  tmp = data.frame(rep(unique(as.character(x$Species)),each=2),rep(c(F,T),length(unique(x$Species))))
  colnames(tmp) <- colnames(out)[1:2]
  merge(out,tmp,all=T)
}
p2f(iris,1.3)

##      Species Petal.Length>1.3 Sepal.Length.Mean Sepal.Length.SD
## 1     setosa            FALSE          4.872727       0.5081159
## 2     setosa             TRUE          5.043590       0.2927140
## 3 versicolor            FALSE                NA              NA
## 4 versicolor             TRUE          5.936000       0.5161711
## 5  virginica            FALSE                NA              NA
## 6  virginica             TRUE          6.588000       0.6358796

Problem 3
Rewrite your function from Problem 1 using ddply instead of split
and apply. Use as few temporary variable as possible.
p3f <- function(x, crit) {
  require(plyr)
  out = ddply(x,.(Species,Petal.Length>crit),function(xx) {
    data.frame(mean(xx$Sepal.Length),sd(xx$Sepal.Length))
  },.drop=F)
  colnames(out) <- c("Species",paste("Petal.Length",crit,sep=">"),"Sepal.Length.Mean","Sepal.Length.SD")
  out
}
p3f(iris,1.3)

## Loading required package: plyr

##      Species Petal.Length>1.3 Sepal.Length.Mean Sepal.Length.SD
## 1     setosa            FALSE          4.872727       0.5081159
## 2     setosa             TRUE          5.043590       0.2927140
## 3 versicolor            FALSE               NaN              NA
## 4 versicolor             TRUE          5.936000       0.5161711
## 5  virginica            FALSE               NaN              NA
## 6  virginica             TRUE          6.588000       0.6358796

Problem 4
Using the mtcars dataset as input, write a function that returns the
mtcars dataset with new columns added. The row value for each new
column should contain average mpg for all cars that share the same
number of transmission type, cylindars, gears, and carbs as the car in
current row.
p4f <- function(x) {
  origcols = colnames(x)
  require(plyr)
  l_ply(c("cyl","am","gear","carb"), function(xx) {
    zz = ddply(x, xx, function(y) {
      mean(y$mpg)
    })
    colnames(zz) = c(xx,paste(xx,"class","average","mpg",sep="."))
    x <<- merge(zz,x)
  })
  newcols = colnames(x)[!(colnames(x) %in% origcols)]
  x[,c(origcols,newcols)]
}
p4f(mtcars)

##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 2  18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 3  32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 4  21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 5  27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 6  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 7  33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 8  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 9  15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## 10 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## 11 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 12 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 13 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## 14 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 15 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 16 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
## 17 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 18 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## 19 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## 20 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 21 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## 22 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## 23 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## 24 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## 25 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## 26 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## 27 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 28 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 29 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 30 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 31 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## 32 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
##    carb.class.average.mpg gear.class.average.mpg am.class.average.mpg
## 1                25.34286               16.10667             17.14737
## 2                25.34286               16.10667             17.14737
## 3                25.34286               24.53333             24.39231
## 4                25.34286               16.10667             17.14737
## 5                25.34286               24.53333             24.39231
## 6                25.34286               24.53333             24.39231
## 7                25.34286               24.53333             24.39231
## 8                22.40000               16.10667             17.14737
## 9                22.40000               16.10667             17.14737
## 10               22.40000               16.10667             17.14737
## 11               22.40000               24.53333             24.39231
## 12               22.40000               24.53333             17.14737
## 13               22.40000               16.10667             17.14737
## 14               22.40000               24.53333             17.14737
## 15               22.40000               21.38000             24.39231
## 16               22.40000               24.53333             24.39231
## 17               22.40000               21.38000             24.39231
## 18               16.30000               16.10667             17.14737
## 19               16.30000               16.10667             17.14737
## 20               16.30000               16.10667             17.14737
## 21               15.79000               16.10667             17.14737
## 22               15.79000               16.10667             17.14737
## 23               15.79000               16.10667             17.14737
## 24               15.79000               24.53333             17.14737
## 25               15.79000               16.10667             17.14737
## 26               15.79000               16.10667             17.14737
## 27               15.79000               24.53333             24.39231
## 28               15.79000               24.53333             24.39231
## 29               15.79000               24.53333             17.14737
## 30               15.79000               21.38000             24.39231
## 31               19.70000               21.38000             24.39231
## 32               15.00000               21.38000             24.39231
##    cyl.class.average.mpg
## 1               26.66364
## 2               19.74286
## 3               26.66364
## 4               19.74286
## 5               26.66364
## 6               26.66364
## 7               26.66364
## 8               15.10000
## 9               15.10000
## 10              15.10000
## 11              26.66364
## 12              26.66364
## 13              15.10000
## 14              26.66364
## 15              26.66364
## 16              26.66364
## 17              26.66364
## 18              15.10000
## 19              15.10000
## 20              15.10000
## 21              15.10000
## 22              15.10000
## 23              15.10000
## 24              19.74286
## 25              15.10000
## 26              15.10000
## 27              19.74286
## 28              19.74286
## 29              19.74286
## 30              15.10000
## 31              19.74286
## 32              15.10000