Skip to content

Instantly share code, notes, and snippets.

@mkiang
Created December 10, 2020 22:35
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mkiang/e5f46e1579894970eb30fa1ae72d7f3d to your computer and use it in GitHub Desktop.
Save mkiang/e5f46e1579894970eb30fa1ae72d7f3d to your computer and use it in GitHub Desktop.
Collaboration network 2020 version
## Visualizing authorship networks: code for this blog post:
## https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots
## Imports ----
library(scholar) # remotes::install_github("jkeirstead/scholar")
library(tidyverse)
library(fs)
library(here)
## Constants ----
## Note that to avoid the API rate limiter, we use a sleep timer and draw
## from a Uniform ~ [MIN_TIME, MAX_TIME] in minutes. If you have a *lot* of
## papers, this will take an unreasonably long time. You can tempt fate by
## lowering these bounds, but you do so at your own risk. At the default
## settings, this downloads the author and citation data for each paper at
## ~15 minutes each per paper (~30 minutes total if you need to download
## the author list again).
MIN_TIME <- 60 * 5
MAX_TIME <- 60 * 35
NO_CACHE <- FALSE
my_id <- "eD9_J3wAAAAJ"
## Set up folders ----
fs::dir_create(here::here("plots"))
fs::dir_create(here::here("data"))
fs::dir_create(here::here("temp_files"))
## Helper functions ----
## pub_id can have special characters. Just remove them.
sanitize_text <- function(string) {
gsub("[^[:alnum:]]", "", string)
}
## Wrapper for a sleep timer.
random_sleep <- function(min_time, max_time) {
Sys.sleep(stats::runif(1, min_time, max_time))
}
## Extract a sanitized pubid from a filename
extract_sanitized_pubid <- function(file_name) {
base_f <- basename(file_name)
substr(base_f, 6, nchar(base_f) - 4)
}
## Wrapper for gsub() to fix author names
fix_author_name <- function(df, string1, string2) {
df %>%
dplyr::mutate(all_authors = gsub(string1, string2, all_authors))
}
## Get a list of publications ----
pubs_df <- scholar::get_publications(my_id)
## Loop through every pub and get (1) citation history and (2) full
## author list, skipping over pubs we have already pulled. Note that
## this takes a long time to avoid hitting the rate limit.
for (p in unique(pubs_df$pubid)) {
print(sprintf("Starting (%s): %s", Sys.time(), p))
cite_file <-
here::here("temp_files", sprintf("cite_%s.RDS", sanitize_text(p)))
auth_file <-
here::here("temp_files", sprintf("auth_%s.RDS", sanitize_text(p)))
if (!fs::file_exists(cite_file) | NO_CACHE) {
cite_df <- scholar::get_article_cite_history(my_id, p)
saveRDS(cite_df, cite_file)
random_sleep(MIN_TIME, MAX_TIME)
}
if (!fs::file_exists(auth_file) | NO_CACHE) {
auth_df <- scholar::get_complete_authors(my_id, p)
saveRDS(auth_df, auth_file)
random_sleep(MIN_TIME, MAX_TIME)
}
print(sprintf("Completed (%s): %s", Sys.time(), p))
}
## Reshape publication dataframe ----
## We want to reshape the pubs dataframe so that every line is a paper-year
## so we can look at the number of citations per year.
pubs_expanded <- pubs_df %>%
dplyr::mutate(
journal = as.character(journal),
title = as.character(title),
pubid = as.character(pubid)
) %>%
dplyr::select(title, journal, pubid, year) %>%
dplyr::mutate(pub_type = "paper") %>%
dplyr::group_by(title, journal, pubid, pub_type) %>%
tidyr::expand(year = year:2020) %>%
dplyr::ungroup()
## Add author data to dataframe ----
## Make a 2-col dataframe of (sanitized) pubid and author list
authors_df <- purrr::map_df(
.x = fs::dir_ls("./temp_files", regexp = "auth_"),
.f = ~ dplyr::tibble(sanitized_pubid = extract_sanitized_pubid(.x),
all_authors = readRDS(.x))
)
## Fix author names
authors_df <- authors_df %>%
fix_author_name("Mathew Kiang", "Mathew V Kiang") %>%
fix_author_name("é|\xe9", "e") %>%
fix_author_name("W Moeller", "M Moeller") %>%
fix_author_name("Jennifer Hayes", "Jennifer E Hayes") %>%
fix_author_name("Jarvis Chen", "Jarvis T Chen") %>%
fix_author_name("Jukka Pekka Onnela", "Jukka-Pekka Onnela") %>%
fix_author_name("Monica Alexander", "Monica J Alexander") %>%
fix_author_name("Lyen Huang", "Lyen C Huang") %>%
fix_author_name("Lizbeth Edmondson", "Lizabeth Edmondson") %>%
fix_author_name("Caroline Buckee", "Caroline O Buckee") %>%
fix_author_name("Kenth Eng\xf8-Monsen", "Kenth Engo-Monsen") %>%
fix_author_name("Hern\xe1ndez", "Hernandez")
## Separate out into individual authors
authors_df <- authors_df %>%
tidyr::separate(
all_authors,
into = sprintf("author_%02i", 1:30),
remove = FALSE,
sep = ", ",
fill = "right"
)
## Remove all the completely empty columns
authors_df <- authors_df[, colSums(!is.na(authors_df)) > 0]
## Add citation history to dataframe ----
## Join full dataframe with author dataframe
pubs_expanded <- pubs_expanded %>%
dplyr::mutate(sanitized_pubid = sanitize_text(pubid)) %>%
dplyr::left_join(authors_df, by = "sanitized_pubid") %>%
dplyr::select(-sanitized_pubid)
## Add in individual year citations
pubs_expanded <- pubs_expanded %>%
dplyr::left_join(
purrr::map_df(
.x = fs::dir_ls("./temp_files", regexp = "cite_"),
.f = ~ readRDS(.x)
) %>%
dplyr::rename(year_cites = cites),
by = c("year", "pubid")
)
## Fill in NAs and get cumulative citations
pubs_expanded <- pubs_expanded %>%
dplyr::mutate(year_cites = ifelse(is.na(year_cites), 0, year_cites)) %>%
dplyr::arrange(title, year) %>%
dplyr::group_by(title, pubid) %>%
dplyr::mutate(cume_cites = cumsum(year_cites)) %>%
dplyr::ungroup() %>%
left_join(pubs_df %>%
select(pubid, total_cites = cites))
## Reshape into long ----
## Now we want to reshape it so it is paper-year-author so that each row
## can be thought of as a node in a bipartite network of paper-authors and
## we can slice the network by years
##
## NOTE: igraph expects first two columns to define an edge list so we
## rearrange columns at the end
pubs_long <- pubs_expanded %>%
dplyr::select(-all_authors) %>%
tidyr::gather(author_position, author, dplyr::starts_with("author_")) %>%
dplyr::filter(!is.na(author)) %>%
dplyr::arrange(year, title, author_position) %>%
dplyr::select(from = title, to = author, dplyr::everything()) %>%
dplyr::mutate(author_position = as.integer(substr(author_position, 8, 9)))
## Save ----
saveRDS(pubs_expanded, here::here("data", "pubs_wide.RDS"))
saveRDS(pubs_long, here::here("data", "pubs_long.RDS"))
## Visualizing authorship networks: code for this blog post:
## https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots
## Imports ----
library(tidyverse)
library(fs)
library(here)
## Folders ----
dir_create(here("temp_files", "stills"))
## Data ----
pubs_long_orig <- readRDS(here("data", "pubs_long.RDS")) %>%
filter(!(
journal %in% c("APHA's",
"SocArXiv",
"European Population Conference",
"medRxiv",
"PAA"))
)
## Create stills ----
pubs_long <- pubs_long_orig %>%
left_join(
pubs_long_orig %>%
select(pubid, first_year = year) %>%
distinct() %>%
group_by(pubid) %>%
filter(first_year == min(first_year)) %>%
ungroup()
) %>%
mutate(age = (year + 1) - first_year) %>%
filter(to != "Mathew V Kiang") %>%
add_graph_position(seed_x = 5, layout_x = "fr")
## Test the final network layout
plot_network(pubs_long, add_labels = FALSE, dim_old = TRUE)
p_layout <- "
AAB
AAC
DEF
"
## Plot for all years
for (year_x in 2010:2020) {
p_network <- plot_network(pubs_long, year_x = year_x)
# p_papers <- plot_unique_papers(pubs_long, year_x = year_x)
p_papers <- plot_new_papers(pubs_long, year_x = year_x)
# p_authors <- plot_unique_authors(pubs_long, year_x = year_x)
p_authors <- plot_new_authors(pubs_long, year_x = year_x)
# p_cites_by_paper <- plot_each_paper(pubs_long, year_x = year_x)
# p_cites_by_paper <- plot_yearly_distro_cites(pubs_long, year_x = year_x)
p_cites_by_paper <- plot_each_paper_age(pubs_long, year_x = year_x)
p_cume_cites <- plot_cume_cites(pubs_long, year_x = year_x)
p_yearly_cites <- plot_yearly_cites(pubs_long, year_x = year_x)
p_save <- p_network + p_papers + p_authors +
p_cites_by_paper + p_cume_cites + p_yearly_cites +
plot_layout(design = p_layout)
ggsave(
here("temp_files", "stills", sprintf("year_%s.jpg", year_x)),
p_save,
width = 5,
height = 5,
dpi = 300,
scale = 1.5
)
p_network <- plot_network(pubs_long, year_x = year_x, add_labels = TRUE)
p_save <- p_network + p_papers + p_authors +
p_cites_by_paper + p_cume_cites + p_yearly_cites +
plot_layout(design = p_layout)
ggsave(
here("temp_files", "stills", "labeled", sprintf("labeled_year_%s.jpg", year_x)),
p_save,
width = 5,
height = 5,
dpi = 300,
scale = 1.5
)
}
## Combine stills ----
## This assumes you have FFmpeg and ImageMagick installed already
system("convert -resize 100% -delay 100 -loop 1 ./temp_files/stills/*.jpg network_evo.gif")
system("convert -resize 100% -delay 100 -loop 1 ./temp_files/stills/labeled/*.jpg network_evo_labeled.gif")
system("convert -resize 50% -delay 100 -loop 1 ./temp_files/stills/*.jpg network_evo_half_size.gif")
system("convert -resize 50% -delay 100 -loop 1 ./temp_files/stills/labeled/*.jpg network_evo_labeled_half_size.gif")
## See: https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots
library(here)
source(here("code", "utils.R"))
source(here("code", "mk_nytimes.R"))
## Data ----
pubs_long_orig <- readRDS(here("data", "pubs_long.RDS")) %>%
filter(!(
journal %in% c(
"APHA's",
"SocArXiv",
"European Population Conference",
"medRxiv",
"PAA"
)
))
pubs_long <- pubs_long_orig %>%
left_join(
pubs_long_orig %>%
select(pubid, first_year = year) %>%
distinct() %>%
group_by(pubid) %>%
filter(first_year == min(first_year)) %>%
ungroup()
) %>%
mutate(age = (year + 1) - first_year) %>%
filter(to != "Mathew V Kiang")
## Collaboration timeline
collabs <- pubs_long %>%
group_by(to, year) %>%
summarize(n_pubs = sum(year == first_year)) %>%
left_join(
pubs_long %>%
group_by(to) %>%
summarize(
min_year = min(first_year),
max_year = max(first_year),
n_pubs = n_distinct(pubid)
) %>%
arrange(min_year, desc(n_pubs)) %>%
ungroup() %>%
select(-n_pubs) %>%
mutate(y_val = 1:n()),
by = "to"
) %>%
filter(n_pubs > 0)
collabs_timeline <- ggplot() +
geom_point(
data = collabs %>%
group_by(to) %>%
slice(1),
aes(
x = min_year,
y = y_val,
group = to,
size = n_pubs
),
alpha = .5,
shape = 21,
fill = "black"
) +
geom_point(
data = collabs %>%
group_by(to) %>%
slice(1),
aes(
x = max_year,
y = y_val,
group = to,
size = n_pubs
),
alpha = .5,
shape = 21,
fill = "black"
) +
geom_segment(
data = collabs %>%
group_by(to) %>%
slice(1),
aes(
x = max_year,
xend = min_year,
y = y_val,
yend = y_val,
group = to
),
alpha = .5
) +
geom_point(
data = collabs %>%
filter(year != min_year,
year != max_year),
aes(
x = year,
y = y_val,
group = to,
size = n_pubs
),
position = position_jitter(
width = .1,
height = 0,
seed = 1
),
shape = 21
) +
scale_x_continuous(NULL,
breaks = seq(2010, 2020, 2),
expand = c(.025, 0)) +
scale_y_continuous("Collaborator", expand = c(.025, 0)) +
mk_nytimes(axis.text.y = element_blank(),
panel.grid.major.y = element_blank()) +
scale_size_continuous("Collaborations")
ggsave("./colab_timeline.jpg",
collabs_timeline,
width = 6,
height = 4.5)
ggsave(
"./colab_timeline_labeled.jpg",
collabs_timeline +
geom_label_repel(
data = collabs %>%
group_by(to) %>%
filter(year == max(year)),
aes(x = year, y = y_val, label = to)
),
width = 20,
height = 12
)
## Upset plot
intersections_df <- pubs_long %>%
filter(year == 2020) %>%
select(author = to, paper = pubid) %>%
mutate(coauthor = 1) %>%
spread(author, coauthor, fill = 0) %>%
mutate_if(is.numeric, as.integer)
jpeg(
width = 6 * 300,
height = 4.5 * 300,
filename = "./upset_plot.jpg",
quality = 100,
res = 300
)
upset(
as.data.frame(intersections_df),
nsets = 10,
order.by = "freq",
decreasing = TRUE,
mainbar.y.label = "Intersections of collaborators",
sets.x.label = "Collaborations",
mb.ratio = c(.65, .35)
)
dev.off()
## Collaborator "efficiency"
eff_df <- pubs_long %>%
filter(year == 2020) %>%
group_by(to) %>%
summarize(
most_recent_collab = 2020 - max(first_year),
n_papers = n_distinct(from),
avg_cites = mean(total_cites),
total_cites = sum(total_cites)
)
collab_eff <- ggplot() +
geom_point(
data = eff_df %>%
filter(n_papers > 1),
aes(
x = n_papers,
y = total_cites,
fill = avg_cites,
size = most_recent_collab
),
alpha = .75,
shape = 21
) +
geom_point(
data = eff_df %>%
filter(n_papers == 1),
aes(x = n_papers,
y = total_cites,
size = most_recent_collab),
fill = "black",
alpha = .25,
shape = 21
) +
scale_fill_viridis_b(
"Citations per paper (among those with >1 collaboration)",
# trans = "log1p",
option = "C",
breaks = seq(0, 160, 40),
guide = guide_colorbar(
title.position = "top",
barwidth = unit(10, "cm"),
barheight = unit(.25, "cm")
)
) +
scale_size_continuous(
"Years since last collaboration",
range = c(6, .05),
breaks = c(0, 3, 7, 10),
guide = guide_legend(title.position = "top")
) +
scale_x_continuous(
"Number of papers together",
expand = c(.025, 0),
breaks = 1:12,
labels = c(1, "", "", 4, "", "", 7, "", "", 10, "", "")
) +
scale_y_continuous("Total citations",
expand = c(.025, 0)) +
mk_nytimes(legend.position = "bottom")
ggsave("./collaborator_efficiency.jpg",
collab_eff,
width = 8, height = 5, scale = 1)
ggsave("./collaborator_efficiency_labeled.jpg",
collab_eff +
geom_text_repel(
data = x %>%
filter(n_papers > 1),
aes(x = n_papers,
y = total_cites,
label = to),
force = 10,
max.iter = 10000,
segment.alpha = .6,
size = 3
),
width = 8, height = 5, scale = 1.5)
library(ggplot2)
#' A theme I often use -- NYTimes variation
#'
#' @param ... parameters to pass to theme()
#'
#' @return None
#' @export
#' @import ggplot2
mk_nytimes <- function(...) {
## http://minimaxir.com/2015/02/ggplot-tutorial/
## paste0('https://timogrossenbacher.ch/2016/12/',
## 'beautiful-thematic-maps-with-ggplot2-only/')
## https://github.com/hrbrmstr/hrbrthemes/blob/master/R/theme-ipsum.r
## Colos — stick with the ggplot2() greys
c_bg <- "white"
c_grid <- "grey80"
c_btext <- "grey5"
c_mtext <- "grey30"
# Begin construction of chart
ggplot2::theme_bw(base_size = 12, base_family = "Arial Narrow") +
# Region
ggplot2::theme(
panel.background = ggplot2::element_rect(fill = c_bg, color = c_bg),
plot.background = ggplot2::element_rect(fill = c_bg, color = c_bg),
panel.border = ggplot2::element_rect(color = c_bg)
) +
# Grid
ggplot2::theme(
panel.grid.major = ggplot2::element_line(color = c_grid, size = .25),
panel.grid.minor = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank()
) +
# Legend
ggplot2::theme(
legend.position = c(0, 1),
legend.justification = c(0, 1),
legend.key = ggplot2::element_rect(fill = NA, color = NA),
legend.background = ggplot2::element_rect(fill = "transparent", color = NA),
legend.text = ggplot2::element_text(color = c_mtext)
) +
# Titles, labels, etc.
ggplot2::theme(
plot.title = ggplot2::element_text(
color = c_btext,
vjust = 1.25,
face = "bold",
size = 18
),
axis.text = ggplot2::element_text(size = 10, color = c_mtext),
axis.title.x = ggplot2::element_text(
size = 12,
color = c_mtext,
hjust = 1
),
axis.title.y = ggplot2::element_text(
size = 12,
color = c_mtext,
hjust = 1
)
) +
# Facets
ggplot2::theme(
strip.background = ggplot2::element_rect(fill = c_bg, color = c_btext),
strip.text = ggplot2::element_text(size = 10, color = c_btext)
) +
# Plot margins
ggplot2::theme(plot.margin = ggplot2::unit(c(0.35, 0.2, 0.3, 0.35), "cm")) +
# Additionals
ggplot2::theme(...)
}
library(igraph)
library(ggraph)
library(patchwork)
library(tidyverse)
library(ggrepel)
library(UpSetR)
## Helper functions ----
return_graph_layout <-
function(pubs_long,
year_x = 2020,
layout_x = "fr",
seed_x = 123,
...) {
sub_graph <-
igraph::graph_from_data_frame(pubs_long %>%
filter(year == year_x),
directed = FALSE)
V(sub_graph)$type <- bipartite_mapping(sub_graph)$type
set.seed(seed_x)
sub_graph <- create_layout(sub_graph, layout = layout_x, niter = 5000)
return(sub_graph %>%
select(name, x, y))
}
add_graph_position <- function(pubs_long,
year_x = 2020,
layout_x = "fr",
seed_x = 123) {
pos_df <- return_graph_layout(pubs_long, year_x, layout_x, seed_x = seed_x)
pubs_long %>%
left_join(pos_df %>%
select(
from = name,
from_x = x,
from_y = y
)) %>%
left_join(pos_df %>%
select(
to = name,
to_x = x,
to_y = y
))
}
plot_network <- function(pubs_long, year_x = 2020, add_labels = FALSE, dim_old = TRUE) {
x_range <- range(c(pubs_long$to_x, pubs_long$from_x))
y_range <- range(c(pubs_long$to_y, pubs_long$from_y))
if (dim_old) {
alpha_vals <- c(.05, 1)
} else {
alpha_vals <- c(1, 1)
}
sub_df <- pubs_long %>%
filter(year == year_x) %>%
ungroup()
p1 <- ggplot() +
## Edges
geom_curve(
data = sub_df,
aes(
x = from_x,
y = from_y,
xend = to_x,
yend = to_y,
alpha = (1/(age^5)) * .5
),
curvature = 0
) +
## Papers
geom_point(
data = sub_df,
aes(x = from_x, y = from_y, size = cume_cites, alpha = 1/(age^3)),
shape = 21,
color = "white",
fill = "#377eb8"
) +
## Collaborators
geom_point(
data = sub_df,
aes(x = to_x, y = to_y, alpha = 1/(age^3)),
size = 1.75,
shape = 21,
color = "white",
fill = "#e41a1c"
)
if (add_labels) {
p1 <- p1 +
## Labels
geom_label_repel(
data = pubs_long %>%
group_by(pubid) %>%
filter(year == min(year)) %>%
ungroup() %>%
filter(year == year_x) %>%
select(-starts_with("to"), -author_position) %>%
distinct(),
size = 1.5,
alpha = .6,
aes(x = from_x, y = from_y,
label = stringr::str_wrap(from, 40)),
color = "black"
)
}
p1 +
scale_x_continuous(NULL,
limits = c(floor(x_range[1]), ceiling(x_range[2]))) +
scale_y_continuous(NULL,
limits = c(floor(y_range[1]), ceiling(y_range[2]))) +
scale_size_binned(
"Paper citations",
range = c(.5, 8),
limits = c(0, max(pubs_long$cume_cites)),
breaks = c(5, 25, 50, 100),
guide = guide_bins(title.position = "top",
direction = "horizontal",
label.theme = element_text(size = 6))
) +
scale_alpha_continuous(NULL,
range = alpha_vals,
guide = "none",
limits = c(0, 1)) +
theme_void() +
theme(legend.position = c(1, 0),
legend.justification = c(1, 0),
legend.title = element_text(size = 8)) +
labs(title = year_x)
}
plot_new_authors <- function(pubs_long, year_x = 2020, dim_old = TRUE) {
if (dim_old) {
alpha_vals <- c(.1, 1)
} else {
alpha_vals <- c(1, 1)
}
auths <- tibble()
for (y in sort(unique(pubs_long$year))) {
all_authors_before <- pubs_long %>%
filter(year < y) %>%
pull(to) %>%
unique()
current_authors <- pubs_long %>%
filter(year <= y) %>%
pull(to) %>%
unique()
auths <- bind_rows(
auths,
tibble(
year = y,
new_authors = NROW(setdiff(current_authors, all_authors_before))
)
)
}
ggplot() +
geom_col(
data = auths %>%
filter(year <= year_x) %>%
mutate(age = year_x - year + 1),
aes(x = year, y = new_authors, alpha = 1 / (age)),
color = NA,
fill = "#e41a1c",
size = .8
) +
geom_hline(yintercept = seq(0, 20, 5),
color = "white") +
geom_hline(yintercept = mean(auths$new_authors),
linetype = "dotted",
size = .5,
alpha = .5) +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2009.5, 2020.5),
breaks = c(2010, 2015, 2020)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
breaks = seq(0, 20, 5),
limits = c(0,
max(auths$new_authors + 1))) +
scale_alpha_continuous(NULL,
range = alpha_vals,
guide = "none",
limits = c(0, 1)) +
labs(title = "New collaborators") +
theme(plot.title = element_text(size = 9))
}
plot_unique_authors <- function(pubs_long, year_x = 2020) {
auths <- pubs_long %>%
group_by(year) %>%
summarize(unique_auths = n_distinct(to)) %>%
ungroup()
ggplot() +
geom_line(
data = auths,
aes(x = year, y = unique_auths),
color = "black",
alpha = .2
) +
geom_point(
data = auths %>%
filter(year == year_x),
aes(x = year, y = unique_auths),
color = "white",
alpha = 1,
size = 2
) +
geom_line(
data = auths %>%
filter(year <= year_x),
aes(x = year, y = unique_auths),
color = "#e41a1c",
alpha = .8,
size = .8
) +
geom_point(
data = auths %>%
filter(year == year_x),
aes(x = year, y = unique_auths),
color = "#e41a1c",
alpha = 1,
size = 3
) +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2009.5, 2020.5),
breaks = c(2010, 2015, 2020)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
limits = c(min(auths$unique_auths) - 1,
max(auths$unique_auths + 1))) +
labs(title = "Unique collaborators") +
theme(plot.title = element_text(size = 9))
}
plot_author_scatter <- function(pubs_long, year_x) {
auths <- pubs_long %>%
group_by(to, year) %>%
summarize(n_papers = n_distinct(pubid),
avg_cites = mean(cume_cites),
total_cites = sum(cume_cites)) %>%
group_by(to) %>%
mutate(age = year - min(year) + 1)
sub_df <- auths %>%
filter(year <= year_x)
ggplot(sub_df,
aes(
x = n_papers,
y = avg_cites,
size = total_cites,
alpha = 1 / age ^ 2
)) +
geom_point() +
theme_classic() +
scale_x_continuous(
"Number of collaborations",
expand = c(0, 0),
limits = c(min(x$n_papers) - .25,
max(x$n_papers) + .25)
) +
scale_y_continuous("Citations per collaboration",
expand = c(.035, 0),
limits = c(0, max(x$avg_cites))) +
scale_alpha_continuous(range = c(.05, 1),
guide = "none") +
scale_size_area("Total citations",
limits = range(x$total_cites)) +
labs(title = "Citations by collaborator") +
theme(plot.title = element_text(size = 9),
axis.title = element_text(size = 8),
legend.position = c(1, 1),
legend.justification = c(1, 1),
legend.title = element_text(size = 8))
}
plot_new_papers <- function(pubs_long, year_x = 2020, dim_old = TRUE) {
if (dim_old) {
alpha_vals <- c(.1, 1)
} else {
alpha_vals <- c(1, 1)
}
auths <- pubs_long %>%
group_by(pubid) %>%
filter(year == min(year)) %>%
group_by(year) %>%
summarize(new_project = n_distinct(from))
ggplot() +
geom_col(
data = auths %>%
filter(year <= year_x) %>%
mutate(age = year_x - year + 1),
aes(
x = year,
y = new_project,
alpha = 1 / (age)
),
color = NA,
fill = "#377eb8",
size = .8
) +
geom_hline(yintercept = seq(0, 8, 2),
color = "white") +
geom_hline(
yintercept = mean(auths$new_project),
linetype = "dotted",
size = .5,
alpha = .5
) +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2009.5, 2020.5),
breaks = c(2010, 2015, 2020)
) +
scale_alpha_continuous(NULL,
range = alpha_vals,
guide = "none",
limits = c(0, 1)) +
scale_y_continuous(NULL,
expand = c(0, 0),
breaks = seq(0, 6, 2),
limits = c(0,
max(auths$new_project + .5))) +
labs(title = "New manuscripts") +
theme(plot.title = element_text(size = 9))
}
plot_unique_papers <- function(pubs_long, year_x = 2020) {
auths <- pubs_long %>%
group_by(year) %>%
summarize(unique_papers = n_distinct(from)) %>%
ungroup()
ggplot() +
geom_line(
data = auths,
aes(x = year, y = unique_papers),
color = "black",
alpha = .2
) +
geom_point(
data = auths %>%
filter(year == year_x),
aes(x = year, y = unique_papers),
color = "white",
alpha = 1,
size = 2
) +
geom_line(
data = auths %>%
filter(year <= year_x),
aes(x = year, y = unique_papers),
color = "#377eb8",
alpha = .8,
size = .8
) +
geom_point(
data = auths %>%
filter(year == year_x),
aes(x = year, y = unique_papers),
color = "#377eb8",
alpha = 1,
size = 3
) +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2009.5, 2020.5),
breaks = c(2010, 2015, 2020)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
limits = c(min(auths$unique_papers) - 1,
max(auths$unique_papers + 1))) +
labs(title = "Unique papers/projects") +
theme(plot.title = element_text(size = 9))
}
plot_yearly_cites <- function(pubs_long, year_x = 2020) {
cites_df <- pubs_long %>%
select(year, year_cites, paper = from) %>%
distinct() %>%
group_by(year) %>%
summarize(year_cites = sum(year_cites)) %>%
ungroup()
ggplot() +
geom_col(
data = cites_df %>%
filter(year <= year_x),
aes(x = year, y = year_cites),
color = NA,
alpha = .8,
size = .8
) +
geom_hline(yintercept = seq(0, 400, 100),
color = "white") +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2009.5, 2020.5),
breaks = c(2010, 2015, 2020)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
breaks = seq(0, 400, 100),
limits = c(min(cites_df$year_cites) - 1,
max(cites_df$year_cites + 1))) +
labs(title = "New citations") +
theme(plot.title = element_text(size = 9))
}
plot_cume_cites <- function(pubs_long, year_x = 2020) {
cites_df <- pubs_long %>%
select(year, cume_cites, paper = from) %>%
distinct() %>%
group_by(year) %>%
summarize(cume_cites = sum(cume_cites)) %>%
ungroup()
ggplot() +
geom_line(
data = cites_df,
aes(x = year, y = cume_cites),
color = "black",
alpha = .2
) +
geom_point(
data = cites_df %>%
filter(year == year_x),
aes(x = year, y = cume_cites),
color = "white",
alpha = 1,
size = 2
) +
geom_line(
data = cites_df %>%
filter(year <= year_x),
aes(x = year, y = cume_cites),
color = "black",
alpha = .8,
size = .8
) +
geom_point(
data = cites_df %>%
filter(year == year_x),
aes(x = year, y = cume_cites),
color = "black",
alpha = 1,
size = 3
) +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2009.5, 2020.5),
breaks = c(2010, 2015, 2020)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
breaks = seq(0, 1500, 300),
limits = c(min(cites_df$cume_cites) - 1.5,
max(cites_df$cume_cites + 15))) +
labs(title = "Cumulative citations") +
theme(plot.title = element_text(size = 9))
}
plot_each_paper <- function(pubs_long, year_x = 2020) {
sub_df <- pubs_long %>%
filter(year <= year_x)
ggplot() +
geom_line(
data = pubs_long,
aes(x = year, y = cume_cites, group = from),
color = "black",
alpha = .025
) +
geom_point(
data = sub_df %>%
group_by(from) %>%
filter(year %in% range(year)),
aes(x = year, y = cume_cites, group = from),
alpha = .2,
size = .5
) +
geom_line(data = sub_df,
aes(x = year,
y = cume_cites,
group = from),
alpha = .35) +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2009.5, 2020.5),
breaks = c(2010, 2015, 2020)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
limits = c(
min(pubs_long$cume_cites) - 1.5,
max(pubs_long$cume_cites + 1.5)
)) +
labs(title = "Cumulative citations by paper") +
theme_classic() +
theme(plot.title = element_text(size = 9),
legend.position = "none")
}
plot_each_paper_age <- function(pubs_long, year_x = 2020) {
pubs_long <- pubs_long %>%
group_by(pubid) %>%
mutate(years_since_pub = year - min(year)) %>%
ungroup() %>%
select(years_since_pub, pubid, from, cume_cites, age, year) %>%
distinct()
sub_df <- pubs_long %>%
filter(year <= year_x)
ggplot() +
geom_line(
data = pubs_long,
aes(x = years_since_pub,
y = cume_cites,
group = from),
color = "black",
alpha = .01,
size = .1
) +
geom_point(
data = sub_df %>%
group_by(from) %>%
filter(year %in% range(year)),
aes(x = years_since_pub, y = cume_cites, group = from),
alpha = .3,
size = .3
) +
geom_line(
data = sub_df,
aes(x = years_since_pub,
y = cume_cites,
group = from),
alpha = .25
) +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = range(pubs_long$years_since_pub),
breaks = seq(0, 12, 2)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
# limits = c(0, NA)
limits = c(
min(pubs_long$cume_cites) - 1.5,
max(pubs_long$cume_cites + 1.5)
),
breaks = seq(0, 300, 50)
) +
labs(title = "Cumulative paper citations by age") +
theme_classic() +
theme(plot.title = element_text(size = 9))
}
plot_yearly_distro_cites <- function(pubs_long, year_x = 2020, dim_old = TRUE) {
if (dim_old) {
alpha_vals <- c(.1, 1)
} else {
alpha_vals <- c(1, 1)
}
sub_df <- pubs_long %>%
select(year, year_cites, pubid, age) %>%
distinct()
ggplot() +
geom_line(data = sub_df %>%
filter(year <= year_x) %>%
mutate(age = year_x - year + 1),
aes(x = year_cites, group = year, alpha = 1 / (age^5)),
color = "black",
size = .8,
stat="density",
) +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, 0),
trans = "log1p",
breaks = c(0, 1, 5, 10, 25, 50, 100),
) +
scale_y_continuous(NULL,
expand = c(0, 0),
limits = c(0, 1)) +
scale_alpha_continuous(NULL,
range = alpha_vals,
guide = "none") +
labs(title = "Citations by paper") +
theme(plot.title = element_text(size = 9),
axis.text.y = element_blank())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment