|
# grouping variable used for aggregation |
|
length(unique(sponsorships$uid)) |
|
|
|
[1] 91505 |
|
|
|
# used for %in% search during aggregation |
|
au = unique(sponsors$url) |
|
length(au) |
|
|
|
[1] 1499 |
|
|
|
# base |
|
|
|
# Build one summary row per `uid` using base R only.
#
# Arguments:
#   sponsorships  data frame with columns `uid`, `name`, `status` and `url`.
#   known_urls    vector of URLs counted as "known"; defaults to the
#                 script-level `au`, which the original body read implicitly.
#
# Returns a data frame with columns, in this order:
#   uid            grouping key
#   auteurs        comma-separated names of rows with status "author"
#   cosignataires  comma-separated names of rows with status "cosponsor"
#   n_missing      number of rows of that uid whose `url` is NA
#   n_known        number of rows of that uid whose `url` is in `known_urls`
#
# A uid without any "author" row gets NA in both name columns, because
# cosponsors are attached to the author table before the counts are joined.
assemble <- function(sponsorships, known_urls = au) {
  # Collapse `name` per uid; aggregate() stops on zero rows, so return an
  # empty frame of the same shape instead.
  collapse_names <- function(rows) {
    if (nrow(rows) == 0L) {
      return(data.frame(
        uid = rows$uid,
        name = character(0),
        stringsAsFactors = FALSE
      ))
    }
    aggregate(name ~ uid, data = rows, FUN = paste0, collapse = ",")
  }

  authors <- collapse_names(subset(sponsorships, status == "author"))
  names(authors)[names(authors) == "name"] <- "auteurs"

  cosponsors <- collapse_names(subset(sponsorships, status == "cosponsor"))
  names(cosponsors)[names(cosponsors) == "name"] <- "cosignataires"

  people <- merge(authors, cosponsors, by = "uid", all.x = TRUE)

  # The formula interface of aggregate() defaults to na.action = na.omit,
  # which removes NA urls before FUN sees them; na.pass keeps them so they
  # can actually be counted (and so uids with only NA urls are not lost).
  n_missing <- aggregate(
    url ~ uid,
    data = sponsorships,
    FUN = function(x) sum(is.na(x)),
    na.action = na.pass
  )
  names(n_missing)[names(n_missing) == "url"] <- "n_missing"

  # Count matches (one integer per uid); NA urls never count as known.
  n_known <- aggregate(
    url ~ uid,
    data = sponsorships,
    FUN = function(x) sum(!is.na(x) & x %in% known_urls),
    na.action = na.pass
  )
  names(n_known)[names(n_known) == "url"] <- "n_known"

  counts <- merge(n_missing, n_known, by = "uid")
  out <- merge(counts, people, by = "uid", all.x = TRUE)

  out[, c("uid", "auteurs", "cosignataires", "n_missing", "n_known")]
}
|
|
|
system.time(assemble(sponsorships)) |
|
|
|
user system elapsed |
|
12.397 1.218 13.828 |
|
|
|
# plyr |
|
|
|
library(plyr) |
|
au = unique(sponsors$url) |
|
|
|
# Time the plyr variant: ddply() splits the rows by uid and summarise()
# produces one row per group, stored in `auteurs`.
# Only "author" and "cosponsor" rows reach the summary, so the equality tests
# below select exactly the two sets of names.
system.time(
  auteurs <- ddply(
    .data = subset(sponsorships, status %in% c("author", "cosponsor")),
    .variables = "uid",
    .fun = summarise,
    auteurs = paste0(name[status == "author"], collapse = ","),
    cosignataires = paste0(name[status == "cosponsor"], collapse = ","),
    n_missing = sum(is.na(url)),
    n_known = sum(url %in% au)
  )
)
|
|
|
user system elapsed |
|
166.870 94.133 264.323 |
|
|
|
# dplyr |
|
|
|
library(dplyr) |
|
|
|
# Time the dplyr variant. Written as plain nested calls, read innermost
# first: filter() keeps author/cosponsor rows, group_by() groups them by uid,
# and summarise() emits one row per uid.
system.time(
  summarise(
    group_by(
      filter(sponsorships, status %in% c("author", "cosponsor")),
      uid
    ),
    auteurs = paste0(name[status == "author"], collapse = ","),
    cosignataires = paste0(name[status == "cosponsor"], collapse = ","),
    n_missing = sum(is.na(url)),
    n_known = sum(url %in% au)
  )
)
|
|
|
user system elapsed |
|
7.191 0.941 8.164 |
|
|
|
# versions |
|
sessionInfo() |
|
R version 3.0.2 (2013-09-25) |
|
Platform: x86_64-apple-darwin10.8.0 (64-bit) |
|
|
|
locale: |
|
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 |
|
|
|
attached base packages: |
|
[1] graphics grDevices datasets stats grid utils methods base |
|
|
|
other attached packages: |
|
[1] data.table_1.9.2 dplyr_0.1.2 XML_3.95-0.2 stringr_0.6.2 RPostgreSQL_0.4 DBI_0.2-7 network_1.9.0 |
|
[8] lubridate_1.3.3 knitr_1.5 ggmap_2.3 GGally_0.4.5 reshape_0.8.4 plyr_1.8.1 downloader_0.3 |
|
[15] devtools_1.4.1 ggplot2_0.9.3.1 |
|
|
|
loaded via a namespace (and not attached): |
|
[1] assertthat_0.1 colorspace_1.2-4 dichromat_2.0-0 digest_0.6.4 evaluate_0.5.1 formatR_0.10 |
|
[7] gtable_0.1.2 httr_0.2 labeling_0.2 mapproj_1.2-2 maps_2.3-6 MASS_7.3-29 |
|
[13] memoise_0.1 munsell_0.4.2 parallel_3.0.2 png_0.1-7 proto_0.3-10 RColorBrewer_1.0-5 |
|
[19] Rcpp_0.11.0 RCurl_1.95-4.1 reshape2_1.2.2 RgoogleMaps_1.2.0.5 rjson_0.2.13 RJSONIO_1.0-3 |
|
[25] scales_0.2.3 tools_3.0.2 whisker_0.3-2 |