Created
December 10, 2020 22:35
-
-
Save mkiang/e5f46e1579894970eb30fa1ae72d7f3d to your computer and use it in GitHub Desktop.
Collaboration network 2020 version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Visualizing authorship networks: code for this blog post: | |
## https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots | |
## Imports ---- | |
library(scholar) # remotes::install_github("jkeirstead/scholar") | |
library(tidyverse) | |
library(fs) | |
library(here) | |
## Constants ---- | |
## Note that to avoid the API rate limiter, we use a sleep timer and draw | |
## from a Uniform ~ [MIN_TIME, MAX_TIME] in minutes. If you have a *lot* of | |
## papers, this will take an unreasonably long time. You can tempt fate by | |
## lowering these bounds, but you do so at your own risk. At the default | |
## settings, this downloads the author and citation data for each paper at | |
## ~15 minutes each per paper (~30 minutes total if you need to download | |
## the author list again). | |
MIN_TIME <- 60 * 5 | |
MAX_TIME <- 60 * 35 | |
NO_CACHE <- FALSE | |
my_id <- "eD9_J3wAAAAJ" | |
## Set up folders ---- | |
fs::dir_create(here::here("plots")) | |
fs::dir_create(here::here("data")) | |
fs::dir_create(here::here("temp_files")) | |
## Helper functions ---- | |
## pub_id can have special characters. Just remove them. | |
sanitize_text <- function(string) { | |
gsub("[^[:alnum:]]", "", string) | |
} | |
## Wrapper for a sleep timer. | |
random_sleep <- function(min_time, max_time) { | |
Sys.sleep(stats::runif(1, min_time, max_time)) | |
} | |
## Extract a sanitized pubid from a filename | |
extract_sanitized_pubid <- function(file_name) { | |
base_f <- basename(file_name) | |
substr(base_f, 6, nchar(base_f) - 4) | |
} | |
## Wrapper for gsub() to fix author names | |
fix_author_name <- function(df, string1, string2) { | |
df %>% | |
dplyr::mutate(all_authors = gsub(string1, string2, all_authors)) | |
} | |
## Get a list of publications ---- | |
pubs_df <- scholar::get_publications(my_id) | |
## Loop through every pub and get (1) citation history and (2) full | |
## author list, skipping over pubs we have already pulled. Note that | |
## this takes a long time to avoid hitting the rate limit. | |
for (p in unique(pubs_df$pubid)) { | |
print(sprintf("Starting (%s): %s", Sys.time(), p)) | |
cite_file <- | |
here::here("temp_files", sprintf("cite_%s.RDS", sanitize_text(p))) | |
auth_file <- | |
here::here("temp_files", sprintf("auth_%s.RDS", sanitize_text(p))) | |
if (!fs::file_exists(cite_file) | NO_CACHE) { | |
cite_df <- scholar::get_article_cite_history(my_id, p) | |
saveRDS(cite_df, cite_file) | |
random_sleep(MIN_TIME, MAX_TIME) | |
} | |
if (!fs::file_exists(auth_file) | NO_CACHE) { | |
auth_df <- scholar::get_complete_authors(my_id, p) | |
saveRDS(auth_df, auth_file) | |
random_sleep(MIN_TIME, MAX_TIME) | |
} | |
print(sprintf("Completed (%s): %s", Sys.time(), p)) | |
} | |
## Reshape publication dataframe ---- | |
## We want to reshape the pubs dataframe so that every line is a paper-year | |
## so we can look at the number of citations per year. | |
pubs_expanded <- pubs_df %>% | |
dplyr::mutate( | |
journal = as.character(journal), | |
title = as.character(title), | |
pubid = as.character(pubid) | |
) %>% | |
dplyr::select(title, journal, pubid, year) %>% | |
dplyr::mutate(pub_type = "paper") %>% | |
dplyr::group_by(title, journal, pubid, pub_type) %>% | |
tidyr::expand(year = year:2020) %>% | |
dplyr::ungroup() | |
## Add author data to dataframe ---- | |
## Make a 2-col dataframe of (sanitized) pubid and author list | |
authors_df <- purrr::map_df( | |
.x = fs::dir_ls("./temp_files", regexp = "auth_"), | |
.f = ~ dplyr::tibble(sanitized_pubid = extract_sanitized_pubid(.x), | |
all_authors = readRDS(.x)) | |
) | |
## Fix author names | |
authors_df <- authors_df %>% | |
fix_author_name("Mathew Kiang", "Mathew V Kiang") %>% | |
fix_author_name("é|\xe9", "e") %>% | |
fix_author_name("W Moeller", "M Moeller") %>% | |
fix_author_name("Jennifer Hayes", "Jennifer E Hayes") %>% | |
fix_author_name("Jarvis Chen", "Jarvis T Chen") %>% | |
fix_author_name("Jukka Pekka Onnela", "Jukka-Pekka Onnela") %>% | |
fix_author_name("Monica Alexander", "Monica J Alexander") %>% | |
fix_author_name("Lyen Huang", "Lyen C Huang") %>% | |
fix_author_name("Lizbeth Edmondson", "Lizabeth Edmondson") %>% | |
fix_author_name("Caroline Buckee", "Caroline O Buckee") %>% | |
fix_author_name("Kenth Eng\xf8-Monsen", "Kenth Engo-Monsen") %>% | |
fix_author_name("Hern\xe1ndez", "Hernandez") | |
## Separate out into individual authors | |
authors_df <- authors_df %>% | |
tidyr::separate( | |
all_authors, | |
into = sprintf("author_%02i", 1:30), | |
remove = FALSE, | |
sep = ", ", | |
fill = "right" | |
) | |
## Remove all the completely empty columns | |
authors_df <- authors_df[, colSums(!is.na(authors_df)) > 0] | |
## Add citation history to dataframe ---- | |
## Join full dataframe with author dataframe | |
pubs_expanded <- pubs_expanded %>% | |
dplyr::mutate(sanitized_pubid = sanitize_text(pubid)) %>% | |
dplyr::left_join(authors_df, by = "sanitized_pubid") %>% | |
dplyr::select(-sanitized_pubid) | |
## Add in individual year citations | |
pubs_expanded <- pubs_expanded %>% | |
dplyr::left_join( | |
purrr::map_df( | |
.x = fs::dir_ls("./temp_files", regexp = "cite_"), | |
.f = ~ readRDS(.x) | |
) %>% | |
dplyr::rename(year_cites = cites), | |
by = c("year", "pubid") | |
) | |
## Fill in NAs and get cumulative citations | |
pubs_expanded <- pubs_expanded %>% | |
dplyr::mutate(year_cites = ifelse(is.na(year_cites), 0, year_cites)) %>% | |
dplyr::arrange(title, year) %>% | |
dplyr::group_by(title, pubid) %>% | |
dplyr::mutate(cume_cites = cumsum(year_cites)) %>% | |
dplyr::ungroup() %>% | |
left_join(pubs_df %>% | |
select(pubid, total_cites = cites)) | |
## Reshape into long ---- | |
## Now we want to reshape it so it is paper-year-author so that each row | |
## can be thought of as a node in a bipartite network of paper-authors and | |
## we can slice the network by years | |
## | |
## NOTE: igraph expects first two columns to define an edge list so we | |
## rearrange columns at the end | |
pubs_long <- pubs_expanded %>% | |
dplyr::select(-all_authors) %>% | |
tidyr::gather(author_position, author, dplyr::starts_with("author_")) %>% | |
dplyr::filter(!is.na(author)) %>% | |
dplyr::arrange(year, title, author_position) %>% | |
dplyr::select(from = title, to = author, dplyr::everything()) %>% | |
dplyr::mutate(author_position = as.integer(substr(author_position, 8, 9))) | |
## Save ---- | |
saveRDS(pubs_expanded, here::here("data", "pubs_wide.RDS")) | |
saveRDS(pubs_long, here::here("data", "pubs_long.RDS")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Visualizing authorship networks: code for this blog post: | |
## https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots | |
## Imports ---- | |
library(tidyverse) | |
library(fs) | |
library(here) | |
## Folders ---- | |
dir_create(here("temp_files", "stills")) | |
## Data ---- | |
pubs_long_orig <- readRDS(here("data", "pubs_long.RDS")) %>% | |
filter(!( | |
journal %in% c("APHA's", | |
"SocArXiv", | |
"European Population Conference", | |
"medRxiv", | |
"PAA")) | |
) | |
## Create stills ---- | |
pubs_long <- pubs_long_orig %>% | |
left_join( | |
pubs_long_orig %>% | |
select(pubid, first_year = year) %>% | |
distinct() %>% | |
group_by(pubid) %>% | |
filter(first_year == min(first_year)) %>% | |
ungroup() | |
) %>% | |
mutate(age = (year + 1) - first_year) %>% | |
filter(to != "Mathew V Kiang") %>% | |
add_graph_position(seed_x = 5, layout_x = "fr") | |
## Test the final network layout | |
plot_network(pubs_long, add_labels = FALSE, dim_old = TRUE) | |
p_layout <- " | |
AAB | |
AAC | |
DEF | |
" | |
## Plot for all years | |
for (year_x in 2010:2020) { | |
p_network <- plot_network(pubs_long, year_x = year_x) | |
# p_papers <- plot_unique_papers(pubs_long, year_x = year_x) | |
p_papers <- plot_new_papers(pubs_long, year_x = year_x) | |
# p_authors <- plot_unique_authors(pubs_long, year_x = year_x) | |
p_authors <- plot_new_authors(pubs_long, year_x = year_x) | |
# p_cites_by_paper <- plot_each_paper(pubs_long, year_x = year_x) | |
# p_cites_by_paper <- plot_yearly_distro_cites(pubs_long, year_x = year_x) | |
p_cites_by_paper <- plot_each_paper_age(pubs_long, year_x = year_x) | |
p_cume_cites <- plot_cume_cites(pubs_long, year_x = year_x) | |
p_yearly_cites <- plot_yearly_cites(pubs_long, year_x = year_x) | |
p_save <- p_network + p_papers + p_authors + | |
p_cites_by_paper + p_cume_cites + p_yearly_cites + | |
plot_layout(design = p_layout) | |
ggsave( | |
here("temp_files", "stills", sprintf("year_%s.jpg", year_x)), | |
p_save, | |
width = 5, | |
height = 5, | |
dpi = 300, | |
scale = 1.5 | |
) | |
p_network <- plot_network(pubs_long, year_x = year_x, add_labels = TRUE) | |
p_save <- p_network + p_papers + p_authors + | |
p_cites_by_paper + p_cume_cites + p_yearly_cites + | |
plot_layout(design = p_layout) | |
ggsave( | |
here("temp_files", "stills", "labeled", sprintf("labeled_year_%s.jpg", year_x)), | |
p_save, | |
width = 5, | |
height = 5, | |
dpi = 300, | |
scale = 1.5 | |
) | |
} | |
## Combine stills ---- | |
## This assumes you have FFmpeg and ImageMagick installed already | |
system("convert -resize 100% -delay 100 -loop 1 ./temp_files/stills/*.jpg network_evo.gif") | |
system("convert -resize 100% -delay 100 -loop 1 ./temp_files/stills/labeled/*.jpg network_evo_labeled.gif") | |
system("convert -resize 50% -delay 100 -loop 1 ./temp_files/stills/*.jpg network_evo_half_size.gif") | |
system("convert -resize 50% -delay 100 -loop 1 ./temp_files/stills/labeled/*.jpg network_evo_labeled_half_size.gif") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## See: https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots | |
library(here) | |
source(here("code", "utils.R")) | |
source(here("code", "mk_nytimes.R")) | |
## Data ---- | |
pubs_long_orig <- readRDS(here("data", "pubs_long.RDS")) %>% | |
filter(!( | |
journal %in% c( | |
"APHA's", | |
"SocArXiv", | |
"European Population Conference", | |
"medRxiv", | |
"PAA" | |
) | |
)) | |
pubs_long <- pubs_long_orig %>% | |
left_join( | |
pubs_long_orig %>% | |
select(pubid, first_year = year) %>% | |
distinct() %>% | |
group_by(pubid) %>% | |
filter(first_year == min(first_year)) %>% | |
ungroup() | |
) %>% | |
mutate(age = (year + 1) - first_year) %>% | |
filter(to != "Mathew V Kiang") | |
## Collaboration timeline | |
collabs <- pubs_long %>% | |
group_by(to, year) %>% | |
summarize(n_pubs = sum(year == first_year)) %>% | |
left_join( | |
pubs_long %>% | |
group_by(to) %>% | |
summarize( | |
min_year = min(first_year), | |
max_year = max(first_year), | |
n_pubs = n_distinct(pubid) | |
) %>% | |
arrange(min_year, desc(n_pubs)) %>% | |
ungroup() %>% | |
select(-n_pubs) %>% | |
mutate(y_val = 1:n()), | |
by = "to" | |
) %>% | |
filter(n_pubs > 0) | |
collabs_timeline <- ggplot() + | |
geom_point( | |
data = collabs %>% | |
group_by(to) %>% | |
slice(1), | |
aes( | |
x = min_year, | |
y = y_val, | |
group = to, | |
size = n_pubs | |
), | |
alpha = .5, | |
shape = 21, | |
fill = "black" | |
) + | |
geom_point( | |
data = collabs %>% | |
group_by(to) %>% | |
slice(1), | |
aes( | |
x = max_year, | |
y = y_val, | |
group = to, | |
size = n_pubs | |
), | |
alpha = .5, | |
shape = 21, | |
fill = "black" | |
) + | |
geom_segment( | |
data = collabs %>% | |
group_by(to) %>% | |
slice(1), | |
aes( | |
x = max_year, | |
xend = min_year, | |
y = y_val, | |
yend = y_val, | |
group = to | |
), | |
alpha = .5 | |
) + | |
geom_point( | |
data = collabs %>% | |
filter(year != min_year, | |
year != max_year), | |
aes( | |
x = year, | |
y = y_val, | |
group = to, | |
size = n_pubs | |
), | |
position = position_jitter( | |
width = .1, | |
height = 0, | |
seed = 1 | |
), | |
shape = 21 | |
) + | |
scale_x_continuous(NULL, | |
breaks = seq(2010, 2020, 2), | |
expand = c(.025, 0)) + | |
scale_y_continuous("Collaborator", expand = c(.025, 0)) + | |
mk_nytimes(axis.text.y = element_blank(), | |
panel.grid.major.y = element_blank()) + | |
scale_size_continuous("Collaborations") | |
ggsave("./colab_timeline.jpg", | |
collabs_timeline, | |
width = 6, | |
height = 4.5) | |
ggsave( | |
"./colab_timeline_labeled.jpg", | |
collabs_timeline + | |
geom_label_repel( | |
data = collabs %>% | |
group_by(to) %>% | |
filter(year == max(year)), | |
aes(x = year, y = y_val, label = to) | |
), | |
width = 20, | |
height = 12 | |
) | |
## Upset plot | |
intersections_df <- pubs_long %>% | |
filter(year == 2020) %>% | |
select(author = to, paper = pubid) %>% | |
mutate(coauthor = 1) %>% | |
spread(author, coauthor, fill = 0) %>% | |
mutate_if(is.numeric, as.integer) | |
jpeg( | |
width = 6 * 300, | |
height = 4.5 * 300, | |
filename = "./upset_plot.jpg", | |
quality = 100, | |
res = 300 | |
) | |
upset( | |
as.data.frame(intersections_df), | |
nsets = 10, | |
order.by = "freq", | |
decreasing = TRUE, | |
mainbar.y.label = "Intersections of collaborators", | |
sets.x.label = "Collaborations", | |
mb.ratio = c(.65, .35) | |
) | |
dev.off() | |
## Collaborator "efficiency" | |
eff_df <- pubs_long %>% | |
filter(year == 2020) %>% | |
group_by(to) %>% | |
summarize( | |
most_recent_collab = 2020 - max(first_year), | |
n_papers = n_distinct(from), | |
avg_cites = mean(total_cites), | |
total_cites = sum(total_cites) | |
) | |
collab_eff <- ggplot() + | |
geom_point( | |
data = eff_df %>% | |
filter(n_papers > 1), | |
aes( | |
x = n_papers, | |
y = total_cites, | |
fill = avg_cites, | |
size = most_recent_collab | |
), | |
alpha = .75, | |
shape = 21 | |
) + | |
geom_point( | |
data = eff_df %>% | |
filter(n_papers == 1), | |
aes(x = n_papers, | |
y = total_cites, | |
size = most_recent_collab), | |
fill = "black", | |
alpha = .25, | |
shape = 21 | |
) + | |
scale_fill_viridis_b( | |
"Citations per paper (among those with >1 collaboration)", | |
# trans = "log1p", | |
option = "C", | |
breaks = seq(0, 160, 40), | |
guide = guide_colorbar( | |
title.position = "top", | |
barwidth = unit(10, "cm"), | |
barheight = unit(.25, "cm") | |
) | |
) + | |
scale_size_continuous( | |
"Years since last collaboration", | |
range = c(6, .05), | |
breaks = c(0, 3, 7, 10), | |
guide = guide_legend(title.position = "top") | |
) + | |
scale_x_continuous( | |
"Number of papers together", | |
expand = c(.025, 0), | |
breaks = 1:12, | |
labels = c(1, "", "", 4, "", "", 7, "", "", 10, "", "") | |
) + | |
scale_y_continuous("Total citations", | |
expand = c(.025, 0)) + | |
mk_nytimes(legend.position = "bottom") | |
ggsave("./collaborator_efficiency.jpg", | |
collab_eff, | |
width = 8, height = 5, scale = 1) | |
ggsave("./collaborator_efficiency_labeled.jpg", | |
collab_eff + | |
geom_text_repel( | |
data = x %>% | |
filter(n_papers > 1), | |
aes(x = n_papers, | |
y = total_cites, | |
label = to), | |
force = 10, | |
max.iter = 10000, | |
segment.alpha = .6, | |
size = 3 | |
), | |
width = 8, height = 5, scale = 1.5) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
#' A theme I often use -- NYTimes variation | |
#' | |
#' @param ... parameters to pass to theme() | |
#' | |
#' @return None | |
#' @export | |
#' @import ggplot2 | |
mk_nytimes <- function(...) { | |
## http://minimaxir.com/2015/02/ggplot-tutorial/ | |
## paste0('https://timogrossenbacher.ch/2016/12/', | |
## 'beautiful-thematic-maps-with-ggplot2-only/') | |
## https://github.com/hrbrmstr/hrbrthemes/blob/master/R/theme-ipsum.r | |
## Colos — stick with the ggplot2() greys | |
c_bg <- "white" | |
c_grid <- "grey80" | |
c_btext <- "grey5" | |
c_mtext <- "grey30" | |
# Begin construction of chart | |
ggplot2::theme_bw(base_size = 12, base_family = "Arial Narrow") + | |
# Region | |
ggplot2::theme( | |
panel.background = ggplot2::element_rect(fill = c_bg, color = c_bg), | |
plot.background = ggplot2::element_rect(fill = c_bg, color = c_bg), | |
panel.border = ggplot2::element_rect(color = c_bg) | |
) + | |
# Grid | |
ggplot2::theme( | |
panel.grid.major = ggplot2::element_line(color = c_grid, size = .25), | |
panel.grid.minor = ggplot2::element_blank(), | |
axis.ticks = ggplot2::element_blank() | |
) + | |
# Legend | |
ggplot2::theme( | |
legend.position = c(0, 1), | |
legend.justification = c(0, 1), | |
legend.key = ggplot2::element_rect(fill = NA, color = NA), | |
legend.background = ggplot2::element_rect(fill = "transparent", color = NA), | |
legend.text = ggplot2::element_text(color = c_mtext) | |
) + | |
# Titles, labels, etc. | |
ggplot2::theme( | |
plot.title = ggplot2::element_text( | |
color = c_btext, | |
vjust = 1.25, | |
face = "bold", | |
size = 18 | |
), | |
axis.text = ggplot2::element_text(size = 10, color = c_mtext), | |
axis.title.x = ggplot2::element_text( | |
size = 12, | |
color = c_mtext, | |
hjust = 1 | |
), | |
axis.title.y = ggplot2::element_text( | |
size = 12, | |
color = c_mtext, | |
hjust = 1 | |
) | |
) + | |
# Facets | |
ggplot2::theme( | |
strip.background = ggplot2::element_rect(fill = c_bg, color = c_btext), | |
strip.text = ggplot2::element_text(size = 10, color = c_btext) | |
) + | |
# Plot margins | |
ggplot2::theme(plot.margin = ggplot2::unit(c(0.35, 0.2, 0.3, 0.35), "cm")) + | |
# Additionals | |
ggplot2::theme(...) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(igraph) | |
library(ggraph) | |
library(patchwork) | |
library(tidyverse) | |
library(ggrepel) | |
library(UpSetR) | |
## Helper functions ---- | |
return_graph_layout <- | |
function(pubs_long, | |
year_x = 2020, | |
layout_x = "fr", | |
seed_x = 123, | |
...) { | |
sub_graph <- | |
igraph::graph_from_data_frame(pubs_long %>% | |
filter(year == year_x), | |
directed = FALSE) | |
V(sub_graph)$type <- bipartite_mapping(sub_graph)$type | |
set.seed(seed_x) | |
sub_graph <- create_layout(sub_graph, layout = layout_x, niter = 5000) | |
return(sub_graph %>% | |
select(name, x, y)) | |
} | |
add_graph_position <- function(pubs_long, | |
year_x = 2020, | |
layout_x = "fr", | |
seed_x = 123) { | |
pos_df <- return_graph_layout(pubs_long, year_x, layout_x, seed_x = seed_x) | |
pubs_long %>% | |
left_join(pos_df %>% | |
select( | |
from = name, | |
from_x = x, | |
from_y = y | |
)) %>% | |
left_join(pos_df %>% | |
select( | |
to = name, | |
to_x = x, | |
to_y = y | |
)) | |
} | |
plot_network <- function(pubs_long, year_x = 2020, add_labels = FALSE, dim_old = TRUE) { | |
x_range <- range(c(pubs_long$to_x, pubs_long$from_x)) | |
y_range <- range(c(pubs_long$to_y, pubs_long$from_y)) | |
if (dim_old) { | |
alpha_vals <- c(.05, 1) | |
} else { | |
alpha_vals <- c(1, 1) | |
} | |
sub_df <- pubs_long %>% | |
filter(year == year_x) %>% | |
ungroup() | |
p1 <- ggplot() + | |
## Edges | |
geom_curve( | |
data = sub_df, | |
aes( | |
x = from_x, | |
y = from_y, | |
xend = to_x, | |
yend = to_y, | |
alpha = (1/(age^5)) * .5 | |
), | |
curvature = 0 | |
) + | |
## Papers | |
geom_point( | |
data = sub_df, | |
aes(x = from_x, y = from_y, size = cume_cites, alpha = 1/(age^3)), | |
shape = 21, | |
color = "white", | |
fill = "#377eb8" | |
) + | |
## Collaborators | |
geom_point( | |
data = sub_df, | |
aes(x = to_x, y = to_y, alpha = 1/(age^3)), | |
size = 1.75, | |
shape = 21, | |
color = "white", | |
fill = "#e41a1c" | |
) | |
if (add_labels) { | |
p1 <- p1 + | |
## Labels | |
geom_label_repel( | |
data = pubs_long %>% | |
group_by(pubid) %>% | |
filter(year == min(year)) %>% | |
ungroup() %>% | |
filter(year == year_x) %>% | |
select(-starts_with("to"), -author_position) %>% | |
distinct(), | |
size = 1.5, | |
alpha = .6, | |
aes(x = from_x, y = from_y, | |
label = stringr::str_wrap(from, 40)), | |
color = "black" | |
) | |
} | |
p1 + | |
scale_x_continuous(NULL, | |
limits = c(floor(x_range[1]), ceiling(x_range[2]))) + | |
scale_y_continuous(NULL, | |
limits = c(floor(y_range[1]), ceiling(y_range[2]))) + | |
scale_size_binned( | |
"Paper citations", | |
range = c(.5, 8), | |
limits = c(0, max(pubs_long$cume_cites)), | |
breaks = c(5, 25, 50, 100), | |
guide = guide_bins(title.position = "top", | |
direction = "horizontal", | |
label.theme = element_text(size = 6)) | |
) + | |
scale_alpha_continuous(NULL, | |
range = alpha_vals, | |
guide = "none", | |
limits = c(0, 1)) + | |
theme_void() + | |
theme(legend.position = c(1, 0), | |
legend.justification = c(1, 0), | |
legend.title = element_text(size = 8)) + | |
labs(title = year_x) | |
} | |
plot_new_authors <- function(pubs_long, year_x = 2020, dim_old = TRUE) { | |
if (dim_old) { | |
alpha_vals <- c(.1, 1) | |
} else { | |
alpha_vals <- c(1, 1) | |
} | |
auths <- tibble() | |
for (y in sort(unique(pubs_long$year))) { | |
all_authors_before <- pubs_long %>% | |
filter(year < y) %>% | |
pull(to) %>% | |
unique() | |
current_authors <- pubs_long %>% | |
filter(year <= y) %>% | |
pull(to) %>% | |
unique() | |
auths <- bind_rows( | |
auths, | |
tibble( | |
year = y, | |
new_authors = NROW(setdiff(current_authors, all_authors_before)) | |
) | |
) | |
} | |
ggplot() + | |
geom_col( | |
data = auths %>% | |
filter(year <= year_x) %>% | |
mutate(age = year_x - year + 1), | |
aes(x = year, y = new_authors, alpha = 1 / (age)), | |
color = NA, | |
fill = "#e41a1c", | |
size = .8 | |
) + | |
geom_hline(yintercept = seq(0, 20, 5), | |
color = "white") + | |
geom_hline(yintercept = mean(auths$new_authors), | |
linetype = "dotted", | |
size = .5, | |
alpha = .5) + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2009.5, 2020.5), | |
breaks = c(2010, 2015, 2020) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
breaks = seq(0, 20, 5), | |
limits = c(0, | |
max(auths$new_authors + 1))) + | |
scale_alpha_continuous(NULL, | |
range = alpha_vals, | |
guide = "none", | |
limits = c(0, 1)) + | |
labs(title = "New collaborators") + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_unique_authors <- function(pubs_long, year_x = 2020) { | |
auths <- pubs_long %>% | |
group_by(year) %>% | |
summarize(unique_auths = n_distinct(to)) %>% | |
ungroup() | |
ggplot() + | |
geom_line( | |
data = auths, | |
aes(x = year, y = unique_auths), | |
color = "black", | |
alpha = .2 | |
) + | |
geom_point( | |
data = auths %>% | |
filter(year == year_x), | |
aes(x = year, y = unique_auths), | |
color = "white", | |
alpha = 1, | |
size = 2 | |
) + | |
geom_line( | |
data = auths %>% | |
filter(year <= year_x), | |
aes(x = year, y = unique_auths), | |
color = "#e41a1c", | |
alpha = .8, | |
size = .8 | |
) + | |
geom_point( | |
data = auths %>% | |
filter(year == year_x), | |
aes(x = year, y = unique_auths), | |
color = "#e41a1c", | |
alpha = 1, | |
size = 3 | |
) + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2009.5, 2020.5), | |
breaks = c(2010, 2015, 2020) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
limits = c(min(auths$unique_auths) - 1, | |
max(auths$unique_auths + 1))) + | |
labs(title = "Unique collaborators") + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_author_scatter <- function(pubs_long, year_x) { | |
auths <- pubs_long %>% | |
group_by(to, year) %>% | |
summarize(n_papers = n_distinct(pubid), | |
avg_cites = mean(cume_cites), | |
total_cites = sum(cume_cites)) %>% | |
group_by(to) %>% | |
mutate(age = year - min(year) + 1) | |
sub_df <- auths %>% | |
filter(year <= year_x) | |
ggplot(sub_df, | |
aes( | |
x = n_papers, | |
y = avg_cites, | |
size = total_cites, | |
alpha = 1 / age ^ 2 | |
)) + | |
geom_point() + | |
theme_classic() + | |
scale_x_continuous( | |
"Number of collaborations", | |
expand = c(0, 0), | |
limits = c(min(x$n_papers) - .25, | |
max(x$n_papers) + .25) | |
) + | |
scale_y_continuous("Citations per collaboration", | |
expand = c(.035, 0), | |
limits = c(0, max(x$avg_cites))) + | |
scale_alpha_continuous(range = c(.05, 1), | |
guide = "none") + | |
scale_size_area("Total citations", | |
limits = range(x$total_cites)) + | |
labs(title = "Citations by collaborator") + | |
theme(plot.title = element_text(size = 9), | |
axis.title = element_text(size = 8), | |
legend.position = c(1, 1), | |
legend.justification = c(1, 1), | |
legend.title = element_text(size = 8)) | |
} | |
plot_new_papers <- function(pubs_long, year_x = 2020, dim_old = TRUE) { | |
if (dim_old) { | |
alpha_vals <- c(.1, 1) | |
} else { | |
alpha_vals <- c(1, 1) | |
} | |
auths <- pubs_long %>% | |
group_by(pubid) %>% | |
filter(year == min(year)) %>% | |
group_by(year) %>% | |
summarize(new_project = n_distinct(from)) | |
ggplot() + | |
geom_col( | |
data = auths %>% | |
filter(year <= year_x) %>% | |
mutate(age = year_x - year + 1), | |
aes( | |
x = year, | |
y = new_project, | |
alpha = 1 / (age) | |
), | |
color = NA, | |
fill = "#377eb8", | |
size = .8 | |
) + | |
geom_hline(yintercept = seq(0, 8, 2), | |
color = "white") + | |
geom_hline( | |
yintercept = mean(auths$new_project), | |
linetype = "dotted", | |
size = .5, | |
alpha = .5 | |
) + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2009.5, 2020.5), | |
breaks = c(2010, 2015, 2020) | |
) + | |
scale_alpha_continuous(NULL, | |
range = alpha_vals, | |
guide = "none", | |
limits = c(0, 1)) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
breaks = seq(0, 6, 2), | |
limits = c(0, | |
max(auths$new_project + .5))) + | |
labs(title = "New manuscripts") + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_unique_papers <- function(pubs_long, year_x = 2020) { | |
auths <- pubs_long %>% | |
group_by(year) %>% | |
summarize(unique_papers = n_distinct(from)) %>% | |
ungroup() | |
ggplot() + | |
geom_line( | |
data = auths, | |
aes(x = year, y = unique_papers), | |
color = "black", | |
alpha = .2 | |
) + | |
geom_point( | |
data = auths %>% | |
filter(year == year_x), | |
aes(x = year, y = unique_papers), | |
color = "white", | |
alpha = 1, | |
size = 2 | |
) + | |
geom_line( | |
data = auths %>% | |
filter(year <= year_x), | |
aes(x = year, y = unique_papers), | |
color = "#377eb8", | |
alpha = .8, | |
size = .8 | |
) + | |
geom_point( | |
data = auths %>% | |
filter(year == year_x), | |
aes(x = year, y = unique_papers), | |
color = "#377eb8", | |
alpha = 1, | |
size = 3 | |
) + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2009.5, 2020.5), | |
breaks = c(2010, 2015, 2020) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
limits = c(min(auths$unique_papers) - 1, | |
max(auths$unique_papers + 1))) + | |
labs(title = "Unique papers/projects") + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_yearly_cites <- function(pubs_long, year_x = 2020) { | |
cites_df <- pubs_long %>% | |
select(year, year_cites, paper = from) %>% | |
distinct() %>% | |
group_by(year) %>% | |
summarize(year_cites = sum(year_cites)) %>% | |
ungroup() | |
ggplot() + | |
geom_col( | |
data = cites_df %>% | |
filter(year <= year_x), | |
aes(x = year, y = year_cites), | |
color = NA, | |
alpha = .8, | |
size = .8 | |
) + | |
geom_hline(yintercept = seq(0, 400, 100), | |
color = "white") + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2009.5, 2020.5), | |
breaks = c(2010, 2015, 2020) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
breaks = seq(0, 400, 100), | |
limits = c(min(cites_df$year_cites) - 1, | |
max(cites_df$year_cites + 1))) + | |
labs(title = "New citations") + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_cume_cites <- function(pubs_long, year_x = 2020) { | |
cites_df <- pubs_long %>% | |
select(year, cume_cites, paper = from) %>% | |
distinct() %>% | |
group_by(year) %>% | |
summarize(cume_cites = sum(cume_cites)) %>% | |
ungroup() | |
ggplot() + | |
geom_line( | |
data = cites_df, | |
aes(x = year, y = cume_cites), | |
color = "black", | |
alpha = .2 | |
) + | |
geom_point( | |
data = cites_df %>% | |
filter(year == year_x), | |
aes(x = year, y = cume_cites), | |
color = "white", | |
alpha = 1, | |
size = 2 | |
) + | |
geom_line( | |
data = cites_df %>% | |
filter(year <= year_x), | |
aes(x = year, y = cume_cites), | |
color = "black", | |
alpha = .8, | |
size = .8 | |
) + | |
geom_point( | |
data = cites_df %>% | |
filter(year == year_x), | |
aes(x = year, y = cume_cites), | |
color = "black", | |
alpha = 1, | |
size = 3 | |
) + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2009.5, 2020.5), | |
breaks = c(2010, 2015, 2020) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
breaks = seq(0, 1500, 300), | |
limits = c(min(cites_df$cume_cites) - 1.5, | |
max(cites_df$cume_cites + 15))) + | |
labs(title = "Cumulative citations") + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_each_paper <- function(pubs_long, year_x = 2020) { | |
sub_df <- pubs_long %>% | |
filter(year <= year_x) | |
ggplot() + | |
geom_line( | |
data = pubs_long, | |
aes(x = year, y = cume_cites, group = from), | |
color = "black", | |
alpha = .025 | |
) + | |
geom_point( | |
data = sub_df %>% | |
group_by(from) %>% | |
filter(year %in% range(year)), | |
aes(x = year, y = cume_cites, group = from), | |
alpha = .2, | |
size = .5 | |
) + | |
geom_line(data = sub_df, | |
aes(x = year, | |
y = cume_cites, | |
group = from), | |
alpha = .35) + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2009.5, 2020.5), | |
breaks = c(2010, 2015, 2020) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
limits = c( | |
min(pubs_long$cume_cites) - 1.5, | |
max(pubs_long$cume_cites + 1.5) | |
)) + | |
labs(title = "Cumulative citations by paper") + | |
theme_classic() + | |
theme(plot.title = element_text(size = 9), | |
legend.position = "none") | |
} | |
plot_each_paper_age <- function(pubs_long, year_x = 2020) { | |
pubs_long <- pubs_long %>% | |
group_by(pubid) %>% | |
mutate(years_since_pub = year - min(year)) %>% | |
ungroup() %>% | |
select(years_since_pub, pubid, from, cume_cites, age, year) %>% | |
distinct() | |
sub_df <- pubs_long %>% | |
filter(year <= year_x) | |
ggplot() + | |
geom_line( | |
data = pubs_long, | |
aes(x = years_since_pub, | |
y = cume_cites, | |
group = from), | |
color = "black", | |
alpha = .01, | |
size = .1 | |
) + | |
geom_point( | |
data = sub_df %>% | |
group_by(from) %>% | |
filter(year %in% range(year)), | |
aes(x = years_since_pub, y = cume_cites, group = from), | |
alpha = .3, | |
size = .3 | |
) + | |
geom_line( | |
data = sub_df, | |
aes(x = years_since_pub, | |
y = cume_cites, | |
group = from), | |
alpha = .25 | |
) + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = range(pubs_long$years_since_pub), | |
breaks = seq(0, 12, 2) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
# limits = c(0, NA) | |
limits = c( | |
min(pubs_long$cume_cites) - 1.5, | |
max(pubs_long$cume_cites + 1.5) | |
), | |
breaks = seq(0, 300, 50) | |
) + | |
labs(title = "Cumulative paper citations by age") + | |
theme_classic() + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_yearly_distro_cites <- function(pubs_long, year_x = 2020, dim_old = TRUE) { | |
if (dim_old) { | |
alpha_vals <- c(.1, 1) | |
} else { | |
alpha_vals <- c(1, 1) | |
} | |
sub_df <- pubs_long %>% | |
select(year, year_cites, pubid, age) %>% | |
distinct() | |
ggplot() + | |
geom_line(data = sub_df %>% | |
filter(year <= year_x) %>% | |
mutate(age = year_x - year + 1), | |
aes(x = year_cites, group = year, alpha = 1 / (age^5)), | |
color = "black", | |
size = .8, | |
stat="density", | |
) + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, 0), | |
trans = "log1p", | |
breaks = c(0, 1, 5, 10, 25, 50, 100), | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
limits = c(0, 1)) + | |
scale_alpha_continuous(NULL, | |
range = alpha_vals, | |
guide = "none") + | |
labs(title = "Citations by paper") + | |
theme(plot.title = element_text(size = 9), | |
axis.text.y = element_blank()) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment