Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Cross-tabulation

Responses to my tweet: "Help me use dplyr to do this less awkwardly? #rstats"

# The "awkward" starting point: within each cyl group, count the rows where
# vs is 0 and where vs is 1, using one hand-written summary column per level.
mtcars %>%
  group_by(cyl) %>%
  summarize(vs0 = sum(vs == 0), vs1 = sum(vs == 1))
## Source: local data frame [3 x 3]
## 
##   cyl vs0 vs1
## 1   4   1  10
## 2   6   3   4
## 3   8  14   0

Later clarified: "imagine vs had 7 levels instead of 2 … how to do for a general factor … general cross-tabulation"

The #rstats twitterati delivered some great answers!

library(dplyr)
library(tidyr)        ## spread()
library(reshape2)     ## dcast(), melt()
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, last

@noamross

# Count rows for every (cyl, vs) pair, then spread the vs values out into
# columns. A pair that never occurs has no row to spread, so it shows up as NA
# in the recorded output.
mtcars %>%
  group_by(cyl, vs) %>%
  summarize(count = n()) %>%
  spread(vs, count)
## Source: local data frame [3 x 3]
## 
##   cyl  0  1
## 1   4  1 10
## 2   6  3  4
## 3   8 14 NA

@tjmahr: "skip summarise(count = n()) and use tally() or count()"

# Same cross-tabulation, with tally() standing in for summarize(count = n());
# the count column is referred to as `n` in the spread() call.
mtcars %>%
  group_by(cyl, vs) %>%
  tally() %>%
  spread(vs, n)
## Source: local data frame [3 x 3]
## 
##   cyl  0  1
## 1   4  1 10
## 2   6  3  4
## 3   8 14 NA

@dev_dmu

# Shortest dplyr form: count(cyl, vs) replaces the group_by() + tally() pair,
# and spread() turns the vs values into columns of counts.
mtcars %>%
  count(cyl, vs) %>%
  spread(vs, n)
## Source: local data frame [3 x 3]
## 
##   cyl  0  1
## 1   4  1 10
## 2   6  3  4
## 3   8 14 NA

@tylerrinker

# reshape2 only: melt the cyl/vs columns to long form keeping cyl as the id,
# then cast with one column per melted value. No aggregation function is
# given, so dcast() reports that it defaults to length, i.e. it counts rows.
dcast(melt(mtcars[, c("cyl", "vs")], id = "cyl"), cyl ~ value)
## Aggregation function missing: defaulting to length
##   cyl  0  1
## 1   4  1 10
## 2   6  3  4
## 3   8 14  0

@daroczig dplyr, reshape2

# dplyr does the counting, reshape2 does the reshaping. The summary column is
# left unnamed, so it is called `n()`; dcast() picks it up as the value column
# and says so in its message.
mtcars %>%
  group_by(cyl, vs) %>%
  summarize(n()) %>%
  dcast(cyl ~ vs)
## Using n() as value column: use value.var to override.
##   cyl  0  1
## 1   4  1 10
## 2   6  3  4
## 3   8 14 NA

@daroczig data.table, reshape2

# data.table does the counting: convert mtcars to a data.table, take .N (the
# row count) for each (cyl, vs) group, then cast so vs values become columns.
mtcarsDT <- mtcars %>% data.table()
dcast(mtcarsDT[, .N, by = list(cyl, vs)], cyl ~ vs)
## Using N as value column: use value.var to override.
##   cyl  0  1
## 1   4  1 10
## 2   6  3  4
## 3   8 14 NA

@MattDowle data.table, reshape2, %>%

# Same computation as the previous snippet, written with .(cyl, vs) where that
# one used by = list(cyl, vs), and with the cast step piped instead of nested.
# Relies on mtcarsDT created in the previous snippet.
mtcarsDT[, .N, .(cyl, vs)] %>% dcast(cyl ~ vs)
## Using N as value column: use value.var to override.
##   cyl  0  1
## 1   4  1 10
## 2   6  3  4
## 3   8 14 NA

@dev_dmu bonus content: proportions!

mtcars %>%
  count(cyl, vs) %>%
  # Group by cyl explicitly: the proportions are meant to sum to 1 within each
  # cyl, and current dplyr returns count() output with the input's grouping
  # (none here) rather than grouped by cyl.
  group_by(cyl) %>%
  #mutate(prop = n/sum(n))
  mutate(prop = prop.table(n))
## Source: local data frame [5 x 4]
## Groups: cyl
## 
##   cyl vs  n       prop
## 1   4  0  1 0.09090909
## 2   4  1 10 0.90909091
## 3   6  0  3 0.42857143
## 4   6  1  4 0.57142857
## 5   8  0 14 1.00000000
mtcars %>%
  count(cyl, vs) %>%
  # Group by cyl explicitly so prop.table() normalises within each cyl instead
  # of over the whole table; current dplyr returns count() output ungrouped
  # when the input is ungrouped.
  group_by(cyl) %>%
  mutate(prop = prop.table(n)) %>%
  select(-n) %>%
  spread(vs, prop)
## Source: local data frame [3 x 3]
## 
##   cyl          0         1
## 1   4 0.09090909 0.9090909
## 2   6 0.42857143 0.5714286
## 3   8 1.00000000        NA

title: "2015-05-15_cross-tabulation.R" author: "jenny" date: "Fri May 15 22:15:58 2015"

#' ---
#' output:
#'   html_document:
#'     keep_md: TRUE
#' ---

#+ include = FALSE
# Attach dplyr in a hidden chunk so `%>%` is available to the opening example;
# the visible library() calls appear further down.
library(dplyr)

#' Responses to [my
#' tweet](https://twitter.com/JennyBryan/status/599379158452416512):
#' "Help me use dplyr to do this less awkwardly? #rstats"
mtcars %>%
  group_by(cyl) %>%
  summarize(vs0 = sum(vs == 0), vs1 = sum(vs == 1))

#' Later clarified: "imagine `vs` had 7 levels instead of 2 … how to do for a
#' general factor … general cross-tabulation"
#'
#' The #rstats twitterati delivered some great answers!
library(dplyr)
library(tidyr)        ## spread()
library(reshape2)     ## dcast(), melt()
library(data.table)

#' @noamross
mtcars %>%
  group_by(cyl, vs) %>%
  summarize(count = n()) %>%
  spread(vs, count)

#' @tjmahr: "skip summarise(count = n()) and use tally() or count()"
mtcars %>%
  group_by(cyl, vs) %>%
  tally() %>%
  spread(vs, n)

#' @dev_dmu
mtcars %>%
  count(cyl, vs) %>%
  spread(vs, n)

# From here on dcast()/melt() are called as reshape2::dcast()/reshape2::melt():
# data.table is attached after reshape2 and exports functions with the same
# names, so the unqualified calls would not reliably reach reshape2.

#' @tylerrinker
reshape2::dcast(
  reshape2::melt(mtcars[, c("cyl", "vs")], id.vars = "cyl"),
  cyl ~ value
)

#' @daroczig dplyr, reshape2
mtcars %>%
  group_by(cyl, vs) %>%
  summarize(n()) %>%
  reshape2::dcast(cyl ~ vs)

#' @daroczig data.table, reshape2
mtcarsDT <- mtcars %>% data.table()
reshape2::dcast(mtcarsDT[, .N, by = list(cyl, vs)], cyl ~ vs)

#' @MattDowle data.table, reshape2, %>%
mtcarsDT[, .N, .(cyl, vs)] %>% reshape2::dcast(cyl ~ vs)

#' @dev_dmu bonus content: proportions!
# The proportions are meant to sum to 1 within each cyl. count() returns its
# result with the input's grouping (none here), so group by cyl explicitly
# before normalising instead of relying on an implicit grouping.
mtcars %>%
  count(cyl, vs) %>%
  group_by(cyl) %>%
  #mutate(prop = n/sum(n))
  mutate(prop = prop.table(n))

mtcars %>%
  count(cyl, vs) %>%
  group_by(cyl) %>%
  mutate(prop = prop.table(n)) %>%
  select(-n) %>%
  spread(vs, prop)
@jennybc
Owner
jennybc commented May 19, 2015

BTW if you prefer 0 to NA above, for the combinations of cyl and vs that do not occur, use the fill = argument to spread():

> mtcars %>%
+     count(cyl, vs) %>%
+     spread(vs, n, fill = 0)
Source: local data frame [3 x 3]

  cyl  0  1
1   4  1 10
2   6  3  4
3   8 14  0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment