Created
December 7, 2019 07:24
-
-
Save mkiang/8a815911c95c37574ac883261a992fbf to your computer and use it in GitHub Desktop.
Code corresponding to this blog post: https://mathewkiang.com/2019/12/07/collaboration-network-from-2010-to-2019/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Visualizing authorship networks: code for this blog post: | |
## https://mathewkiang.com/2019/12/07/collaboration-network-from-2010-to-2019/ | |
## Imports ---- | |
library(scholar) # remotes::install_github("jkeirstead/scholar") | |
library(tidyverse) | |
library(fs) | |
library(here) | |
## Constants ---- | |
## Note that to avoid the API rate limiter, we use a sleep timer and draw | |
## from a Uniform ~ [MIN_TIME, MAX_TIME] in minutes. If you have a *lot* of | |
## papers, this will take an unreasonably long time. You can tempt fate by | |
## lowering these bounds, but you do so at your own risk. At the default | |
## settings, this downloads the author and citation data for each paper at | |
## ~17 minutes per paper (35 minutes total). | |
MIN_TIME <- 60 * 5 | |
MAX_TIME <- 60 * 30 | |
PROJ_WEIGHT <- .8 | |
NODE_WEIGHT <- 1 | |
NO_CACHE <- FALSE | |
my_id <- "eD9_J3wAAAAJ" | |
## Set up folders ---- | |
fs::dir_create(here::here("plots")) | |
fs::dir_create(here::here("data")) | |
fs::dir_create(here::here("temp_files")) | |
## Helper functions ---- | |
## pub_id can have special characters. Just remove them. | |
sanitize_text <- function(string) { | |
gsub("[^[:alnum:]]", "", string) | |
} | |
## Wrapper for a sleep timer. | |
random_sleep <- function(min_time, max_time) { | |
Sys.sleep(stats::runif(1, min_time, max_time)) | |
} | |
## Extract a sanitized pubid from a filename | |
extract_sanitized_pubid <- function(file_name) { | |
base_f <- basename(file_name) | |
substr(base_f, 6, nchar(base_f) - 4) | |
} | |
## Wrapper for gsub() to fix author names | |
fix_author_name <- function(df, string1, string2) { | |
df %>% | |
dplyr::mutate(all_authors = gsub(string1, string2, all_authors)) | |
} | |
## Get a list of publications ---- | |
pubs_df <- scholar::get_publications(my_id) | |
## Loop through every pub and get (1) citation history and (2) full | |
## author list, skipping over pubs we have already pulled. Note that | |
## this takes a long time to avoid hitting the rate limit. | |
for (p in unique(pubs_df$pubid)) { | |
cite_file <- | |
here::here("temp_files", sprintf("cite_%s.RDS", sanitize_text(p))) | |
auth_file <- | |
here::here("temp_files", sprintf("auth_%s.RDS", sanitize_text(p))) | |
if (!fs::file_exists(cite_file) | NO_CACHE) { | |
cite_df <- scholar::get_article_cite_history(my_id, p) | |
saveRDS(cite_df, cite_file) | |
random_sleep(MIN_TIME, MAX_TIME) | |
} | |
if (!fs::file_exists(auth_file) | NO_CACHE) { | |
auth_df <- scholar::get_complete_authors(my_id, p) | |
saveRDS(auth_df, auth_file) | |
random_sleep(MIN_TIME, MAX_TIME) | |
} | |
# print(sprintf("Completed: %s", p)) | |
} | |
## Create non-publication dataframes for (1) projects and (2) authors ---- | |
## Add in non-publication projects (e.g., software, data) | |
nonpub_projects <- dplyr::tribble( | |
~ title, | |
~ journal, | |
~ pubid, | |
~ year, | |
~ pub_type, | |
"Narcan (R package)", | |
NA, | |
"narcan", | |
2017, | |
"software", | |
"Beiwe Data Sample", | |
NA, | |
"beiwe_data_sample", | |
2018, | |
"data", | |
"Meta-Beiwe (R Package)", | |
NA, | |
"metabeiwe", | |
2018, | |
"software" | |
) | |
## Each nonpub project needs an author list and matching pubid | |
nonpub_authors <- dplyr::tribble( | |
~ sanitized_pubid, | |
~ all_authors, | |
"narcan", | |
"Mathew V Kiang, Monica J Alexander", | |
"beiwe_data_sample", | |
"Mathew V Kiang, Jeanette Lorme, Jukka-Pekka Onnela", | |
"metabeiwe", | |
"Mathew V Kiang" | |
) | |
## Reshape publication dataframe ---- | |
## We want to reshape the pubs dataframe so that every line is a paper-year | |
## so we can look at the number of citations per year. | |
pubs_expanded <- pubs_df %>% | |
dplyr::mutate( | |
journal = as.character(journal), | |
title = as.character(title), | |
pubid = as.character(pubid) | |
) %>% | |
dplyr::select(title, journal, pubid, year) %>% | |
dplyr::mutate(pub_type = "paper") %>% | |
dplyr::bind_rows(nonpub_projects) %>% | |
dplyr::group_by(title, journal, pubid, pub_type) %>% | |
tidyr::expand(year = year:2019) %>% | |
dplyr::ungroup() | |
## Add author data to dataframe ---- | |
## Make a 2-col dataframe of (sanitized) pubid and author list | |
authors_df <- purrr::map_df( | |
.x = fs::dir_ls("./temp_files", regexp = "auth_"), | |
.f = ~ dplyr::tibble(sanitized_pubid = extract_sanitized_pubid(.x), | |
all_authors = readRDS(.x)) | |
) %>% | |
dplyr::bind_rows(nonpub_authors) | |
## Fix author names | |
authors_df <- authors_df %>% | |
fix_author_name("é|\xe9", "e") %>% | |
fix_author_name("W Moeller", "M Moeller") %>% | |
fix_author_name("Jennifer Hayes", "Jennifer E Hayes") %>% | |
fix_author_name("Jarvis Chen", "Jarvis T Chen") %>% | |
fix_author_name("Jukka Pekka Onnela", "Jukka-Pekka Onnela") %>% | |
fix_author_name("Monica Alexander", "Monica J Alexander") %>% | |
fix_author_name("Lyen Huang", "Lyen C Huang") %>% | |
fix_author_name("Lizbeth Edmondson", "Lizabeth Edmondson") | |
## Separate out into individual authors | |
authors_df <- authors_df %>% | |
tidyr::separate( | |
all_authors, | |
into = sprintf("author_%02i", 1:20), | |
remove = FALSE, | |
sep = ", ", | |
fill = "right" | |
) | |
## Remove all the completely empty columns | |
authors_df <- authors_df[, colSums(!is.na(authors_df)) > 0] | |
## Add citation history to dataframe ---- | |
## Join full dataframe with author dataframe | |
pubs_expanded <- pubs_expanded %>% | |
dplyr::mutate(sanitized_pubid = sanitize_text(pubid)) %>% | |
dplyr::left_join(authors_df, by = "sanitized_pubid") %>% | |
dplyr::select(-sanitized_pubid) | |
## Add in individual year citations | |
pubs_expanded <- pubs_expanded %>% | |
dplyr::left_join( | |
purrr::map_df( | |
.x = fs::dir_ls("./temp_files", regexp = "cite_"), | |
.f = ~ readRDS(.x) | |
) %>% | |
dplyr::rename(year_cites = cites), | |
by = c("year", "pubid") | |
) | |
## Fill in NAs and get cumulative citations | |
pubs_expanded <- pubs_expanded %>% | |
dplyr::mutate(year_cites = ifelse(is.na(year_cites), 0, year_cites)) %>% | |
dplyr::arrange(title, year) %>% | |
dplyr::group_by(title, pubid) %>% | |
dplyr::mutate(cume_cites = cumsum(year_cites)) %>% | |
dplyr::ungroup() | |
## Reshape into long ---- | |
## Now we want to reshape it so it is paper-year-author so that each row | |
## can be thought of as a node in a bipartite network of paper-authors and | |
## we can slice the network by years | |
## | |
## NOTE: igraph expects first two columns to define an edge list so we | |
## rearrange columns at the end | |
pubs_long <- pubs_expanded %>% | |
dplyr::select(-all_authors) %>% | |
tidyr::gather(author_position, author, dplyr::starts_with("author_")) %>% | |
dplyr::filter(!is.na(author)) %>% | |
dplyr::arrange(year, title, author_position) %>% | |
dplyr::select(from = title, to = author, dplyr::everything()) %>% | |
dplyr::mutate(author_position = as.integer(substr(author_position, 8, 9))) | |
## Save ---- | |
saveRDS(pubs_expanded, here::here("data", "pubs_wide.RDS")) | |
saveRDS(pubs_long, here::here("data", "pubs_long.RDS")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Visualizing authorship networks: code for this blog post: | |
## https://mathewkiang.com/2019/12/07/collaboration-network-from-2010-to-2019/ | |
## Imports ---- | |
library(igraph) | |
library(ggraph) | |
library(patchwork) | |
library(tidyverse) | |
library(fs) | |
library(here) | |
## Folders ---- | |
dir_create(here("temp_files", "stills")) | |
## Data ---- | |
pubs_long <- readRDS(here("data", "pubs_long.RDS")) | |
## Helper functions ---- | |
return_graph_layout <- | |
function(pubs_long, | |
year_x = 2019, | |
layout_x = "fr", | |
seed_x = 12345) { | |
sub_graph <- | |
igraph::graph_from_data_frame(pubs_long %>% | |
filter(year == year_x), | |
directed = FALSE) | |
set.seed(seed_x) | |
sub_graph <- | |
create_layout(sub_graph, layout = layout_x, maxiter = 100) | |
return(sub_graph %>% | |
select(name, x, y)) | |
} | |
add_graph_position <- function(pubs_long, | |
year_x = 2019, | |
layout_x = "fr", | |
seed_x = 12345) { | |
pos_df <- return_graph_layout(pubs_long, year_x, layout_x, seed_x) | |
pubs_long %>% | |
left_join(pos_df %>% | |
select( | |
from = name, | |
from_x = x, | |
from_y = y | |
)) %>% | |
left_join(pos_df %>% | |
select( | |
to = name, | |
to_x = x, | |
to_y = y | |
)) | |
} | |
plot_network <- function(pubs_long, year_x = 2019) { | |
x_range <- range(c(pubs_long$to_x, pubs_long$from_x)) | |
y_range <- range(c(pubs_long$to_y, pubs_long$from_y)) | |
sub_df <- pubs_long %>% | |
add_graph_position() %>% | |
filter(year == year_x) %>% | |
ungroup() | |
ggplot() + | |
## Edges | |
geom_curve( | |
data = sub_df, | |
aes( | |
x = from_x, | |
y = from_y, | |
xend = to_x, | |
yend = to_y | |
), | |
curvature = .1, | |
alpha = .9 | |
) + | |
## Papers | |
geom_point( | |
data = sub_df, | |
aes(x = from_x, y = from_y, size = cume_cites), | |
shape = 21, | |
color = "white", | |
fill = "#377eb8" | |
) + | |
## Collaborators | |
geom_point( | |
data = sub_df, | |
aes(x = to_x, y = to_y), | |
size = 2, | |
shape = 21, | |
color = "white", | |
fill = "#e41a1c" | |
) + | |
scale_x_continuous(NULL, | |
limits = c(floor(x_range[1]), ceiling(x_range[2]))) + | |
scale_y_continuous(NULL, | |
limits = c(floor(y_range[1]), ceiling(y_range[2]))) + | |
scale_size_continuous( | |
"Cumulative\ncitations", | |
range = c(2, 8), | |
limits = c(0, max(pubs_long$cume_cites) + 1), | |
breaks = c(0, 50, 100) | |
) + | |
theme_void() + | |
theme(legend.position = "bottom", | |
legend.title = element_text(size = 8)) + | |
labs(title = year_x) | |
} | |
plot_unique_authors <- function(pubs_long, year_x = 2019) { | |
auths <- pubs_long %>% | |
group_by(year) %>% | |
summarize(unique_auths = n_distinct(to)) %>% | |
ungroup() | |
ggplot() + | |
geom_line( | |
data = auths, | |
aes(x = year, y = unique_auths), | |
color = "black", | |
alpha = .2 | |
) + | |
geom_point( | |
data = auths %>% | |
filter(year == year_x), | |
aes(x = year, y = unique_auths), | |
color = "white", | |
alpha = 1, | |
size = 2 | |
) + | |
geom_line( | |
data = auths %>% | |
filter(year <= year_x), | |
aes(x = year, y = unique_auths), | |
color = "#e41a1c", | |
alpha = .8, | |
size = .8 | |
) + | |
geom_point( | |
data = auths %>% | |
filter(year == year_x), | |
aes(x = year, y = unique_auths), | |
color = "#e41a1c", | |
alpha = 1, | |
size = 3 | |
) + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2010, 2019), | |
breaks = c(2010, 2014, 2018) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
limits = c(min(auths$unique_auths) - 1, | |
max(auths$unique_auths + 1))) + | |
labs(title = "Unique collaborators") + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_unique_papers <- function(pubs_long, year_x = 2019) { | |
auths <- pubs_long %>% | |
group_by(year) %>% | |
summarize(unique_papers = n_distinct(from)) %>% | |
ungroup() | |
ggplot() + | |
geom_line( | |
data = auths, | |
aes(x = year, y = unique_papers), | |
color = "black", | |
alpha = .2 | |
) + | |
geom_point( | |
data = auths %>% | |
filter(year == year_x), | |
aes(x = year, y = unique_papers), | |
color = "white", | |
alpha = 1, | |
size = 2 | |
) + | |
geom_line( | |
data = auths %>% | |
filter(year <= year_x), | |
aes(x = year, y = unique_papers), | |
color = "#377eb8", | |
alpha = .8, | |
size = .8 | |
) + | |
geom_point( | |
data = auths %>% | |
filter(year == year_x), | |
aes(x = year, y = unique_papers), | |
color = "#377eb8", | |
alpha = 1, | |
size = 3 | |
) + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2010, 2019), | |
breaks = c(2010, 2014, 2018) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
limits = c(min(auths$unique_papers) - 1, | |
max(auths$unique_papers + 1))) + | |
labs(title = "Unique papers/projects") + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_yearly_cites <- function(pubs_long, year_x = 2019) { | |
cites_df <- pubs_long %>% | |
select(year, year_cites, paper = from) %>% | |
distinct() %>% | |
group_by(year) %>% | |
summarize(year_cites = sum(year_cites)) %>% | |
ungroup() | |
ggplot() + | |
geom_line( | |
data = cites_df, | |
aes(x = year, y = year_cites), | |
color = "black", | |
alpha = .2 | |
) + | |
geom_point( | |
data = cites_df %>% | |
filter(year == year_x), | |
aes(x = year, y = year_cites), | |
color = "white", | |
alpha = 1, | |
size = 2 | |
) + | |
geom_line( | |
data = cites_df %>% | |
filter(year <= year_x), | |
aes(x = year, y = year_cites), | |
color = "black", | |
alpha = .8, | |
size = .8 | |
) + | |
geom_point( | |
data = cites_df %>% | |
filter(year == year_x), | |
aes(x = year, y = year_cites), | |
color = "black", | |
alpha = 1, | |
size = 3 | |
) + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2010, 2019), | |
breaks = c(2010, 2014, 2018) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
limits = c(min(cites_df$year_cites) - 1, | |
max(cites_df$year_cites + 1))) + | |
labs(title = "Yearly citations") + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_cume_cites <- function(pubs_long, year_x = 2019) { | |
cites_df <- pubs_long %>% | |
select(year, cume_cites, paper = from) %>% | |
distinct() %>% | |
group_by(year) %>% | |
summarize(cume_cites = sum(cume_cites)) %>% | |
ungroup() | |
ggplot() + | |
geom_line( | |
data = cites_df, | |
aes(x = year, y = cume_cites), | |
color = "black", | |
alpha = .2 | |
) + | |
geom_point( | |
data = cites_df %>% | |
filter(year == year_x), | |
aes(x = year, y = cume_cites), | |
color = "white", | |
alpha = 1, | |
size = 2 | |
) + | |
geom_line( | |
data = cites_df %>% | |
filter(year <= year_x), | |
aes(x = year, y = cume_cites), | |
color = "black", | |
alpha = .8, | |
size = .8 | |
) + | |
geom_point( | |
data = cites_df %>% | |
filter(year == year_x), | |
aes(x = year, y = cume_cites), | |
color = "black", | |
alpha = 1, | |
size = 3 | |
) + | |
theme_classic() + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2010, 2019), | |
breaks = c(2010, 2014, 2018) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
limits = c(min(cites_df$cume_cites) - 1, | |
max(cites_df$cume_cites + 1))) + | |
labs(title = "Cumulative citations") + | |
theme(plot.title = element_text(size = 9)) | |
} | |
plot_each_paper <- function(pubs_long, year_x = 2019) { | |
sub_df <- pubs_long %>% | |
filter(year <= year_x) | |
ggplot() + | |
geom_line( | |
data = pubs_long, | |
aes(x = year, y = cume_cites, group = from), | |
color = "black", | |
alpha = .2 | |
) + | |
geom_point( | |
data = sub_df %>% | |
group_by(from) %>% | |
filter(year %in% range(year)), | |
aes(x = year, y = cume_cites, group = from), | |
alpha = .8 | |
) + | |
geom_line(data = sub_df, | |
aes(x = year, | |
y = cume_cites, | |
group = from), | |
alpha = .8) + | |
scale_x_continuous( | |
NULL, | |
expand = c(0, .25), | |
limits = c(2010, 2019), | |
breaks = c(2010, 2014, 2018) | |
) + | |
scale_y_continuous(NULL, | |
expand = c(0, 0), | |
limits = c( | |
min(pubs_long$cume_cites) - 1, | |
max(pubs_long$cume_cites + 1) | |
)) + | |
labs(title = "Citations by paper") + | |
theme_classic() + | |
theme(plot.title = element_text(size = 9)) | |
} | |
## Create stills ---- | |
pubs_long <- pubs_long %>% | |
filter(to != "Mathew V Kiang") %>% | |
add_graph_position() | |
p_layout <- " | |
AAB | |
AAC | |
DEF | |
" | |
for (year_x in 2010:2019) { | |
p_network <- plot_network(pubs_long, year_x = year_x) | |
p_papers <- plot_unique_papers(pubs_long, year_x = year_x) | |
p_authors <- plot_unique_authors(pubs_long, year_x = year_x) | |
p_cites_by_paper <- plot_each_paper(pubs_long, year_x = year_x) | |
p_cume_cites <- plot_cume_cites(pubs_long, year_x = year_x) | |
p_yearly_cites <- plot_yearly_cites(pubs_long, year_x = year_x) | |
p_save <- p_network + p_papers + p_authors + | |
p_yearly_cites + p_cites_by_paper + | |
p_cume_cites + plot_layout(design = p_layout) | |
ggsave( | |
here("temp_files", "stills", sprintf("year_%s.jpg", year_x)), | |
p_save, | |
width = 5, | |
height = 5, | |
dpi = 300, | |
scale = 1.5 | |
) | |
} | |
## Combine stills ---- | |
## This assumes you have FFmpeg and ImageMagick installed already | |
system("convert -resize 50% -delay 100 -loop 1 ./temp_files/stills/*.jpg network_evo.gif") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment