Skip to content

Instantly share code, notes, and snippets.

@mkiang
Created December 7, 2019 07:24
Show Gist options
  • Save mkiang/8a815911c95c37574ac883261a992fbf to your computer and use it in GitHub Desktop.
Save mkiang/8a815911c95c37574ac883261a992fbf to your computer and use it in GitHub Desktop.
## Visualizing authorship networks: code for this blog post:
## https://mathewkiang.com/2019/12/07/collaboration-network-from-2010-to-2019/
## Imports ----
library(scholar) # remotes::install_github("jkeirstead/scholar")
library(tidyverse)
library(fs)
library(here)
## Constants ----
## Note that to avoid the API rate limiter, we use a sleep timer and draw
## from a Uniform ~ [MIN_TIME, MAX_TIME] in minutes. If you have a *lot* of
## papers, this will take an unreasonably long time. You can tempt fate by
## lowering these bounds, but you do so at your own risk. At the default
## settings, this downloads the author and citation data for each paper at
## ~17 minutes per paper (35 minutes total).
MIN_TIME <- 60 * 5
MAX_TIME <- 60 * 30
PROJ_WEIGHT <- .8
NODE_WEIGHT <- 1
NO_CACHE <- FALSE
my_id <- "eD9_J3wAAAAJ"
## Set up folders ----
fs::dir_create(here::here("plots"))
fs::dir_create(here::here("data"))
fs::dir_create(here::here("temp_files"))
## Helper functions ----
## pub_id can have special characters. Just remove them.
sanitize_text <- function(string) {
gsub("[^[:alnum:]]", "", string)
}
## Wrapper for a sleep timer.
random_sleep <- function(min_time, max_time) {
Sys.sleep(stats::runif(1, min_time, max_time))
}
## Extract a sanitized pubid from a filename
extract_sanitized_pubid <- function(file_name) {
base_f <- basename(file_name)
substr(base_f, 6, nchar(base_f) - 4)
}
## Wrapper for gsub() to fix author names
fix_author_name <- function(df, string1, string2) {
df %>%
dplyr::mutate(all_authors = gsub(string1, string2, all_authors))
}
## Get a list of publications ----
pubs_df <- scholar::get_publications(my_id)
## Loop through every pub and get (1) citation history and (2) full
## author list, skipping over pubs we have already pulled. Note that
## this takes a long time to avoid hitting the rate limit.
for (p in unique(pubs_df$pubid)) {
cite_file <-
here::here("temp_files", sprintf("cite_%s.RDS", sanitize_text(p)))
auth_file <-
here::here("temp_files", sprintf("auth_%s.RDS", sanitize_text(p)))
if (!fs::file_exists(cite_file) | NO_CACHE) {
cite_df <- scholar::get_article_cite_history(my_id, p)
saveRDS(cite_df, cite_file)
random_sleep(MIN_TIME, MAX_TIME)
}
if (!fs::file_exists(auth_file) | NO_CACHE) {
auth_df <- scholar::get_complete_authors(my_id, p)
saveRDS(auth_df, auth_file)
random_sleep(MIN_TIME, MAX_TIME)
}
# print(sprintf("Completed: %s", p))
}
## Create non-publication dataframes for (1) projects and (2) authors ----
## Add in non-publication projects (e.g., software, data)
nonpub_projects <- dplyr::tribble(
~ title,
~ journal,
~ pubid,
~ year,
~ pub_type,
"Narcan (R package)",
NA,
"narcan",
2017,
"software",
"Beiwe Data Sample",
NA,
"beiwe_data_sample",
2018,
"data",
"Meta-Beiwe (R Package)",
NA,
"metabeiwe",
2018,
"software"
)
## Each nonpub project needs an author list and matching pubid
nonpub_authors <- dplyr::tribble(
~ sanitized_pubid,
~ all_authors,
"narcan",
"Mathew V Kiang, Monica J Alexander",
"beiwe_data_sample",
"Mathew V Kiang, Jeanette Lorme, Jukka-Pekka Onnela",
"metabeiwe",
"Mathew V Kiang"
)
## Reshape publication dataframe ----
## We want to reshape the pubs dataframe so that every line is a paper-year
## so we can look at the number of citations per year.
pubs_expanded <- pubs_df %>%
dplyr::mutate(
journal = as.character(journal),
title = as.character(title),
pubid = as.character(pubid)
) %>%
dplyr::select(title, journal, pubid, year) %>%
dplyr::mutate(pub_type = "paper") %>%
dplyr::bind_rows(nonpub_projects) %>%
dplyr::group_by(title, journal, pubid, pub_type) %>%
tidyr::expand(year = year:2019) %>%
dplyr::ungroup()
## Add author data to dataframe ----
## Make a 2-col dataframe of (sanitized) pubid and author list
authors_df <- purrr::map_df(
.x = fs::dir_ls("./temp_files", regexp = "auth_"),
.f = ~ dplyr::tibble(sanitized_pubid = extract_sanitized_pubid(.x),
all_authors = readRDS(.x))
) %>%
dplyr::bind_rows(nonpub_authors)
## Fix author names
authors_df <- authors_df %>%
fix_author_name("é|\xe9", "e") %>%
fix_author_name("W Moeller", "M Moeller") %>%
fix_author_name("Jennifer Hayes", "Jennifer E Hayes") %>%
fix_author_name("Jarvis Chen", "Jarvis T Chen") %>%
fix_author_name("Jukka Pekka Onnela", "Jukka-Pekka Onnela") %>%
fix_author_name("Monica Alexander", "Monica J Alexander") %>%
fix_author_name("Lyen Huang", "Lyen C Huang") %>%
fix_author_name("Lizbeth Edmondson", "Lizabeth Edmondson")
## Separate out into individual authors
authors_df <- authors_df %>%
tidyr::separate(
all_authors,
into = sprintf("author_%02i", 1:20),
remove = FALSE,
sep = ", ",
fill = "right"
)
## Remove all the completely empty columns
authors_df <- authors_df[, colSums(!is.na(authors_df)) > 0]
## Add citation history to dataframe ----
## Join full dataframe with author dataframe
pubs_expanded <- pubs_expanded %>%
dplyr::mutate(sanitized_pubid = sanitize_text(pubid)) %>%
dplyr::left_join(authors_df, by = "sanitized_pubid") %>%
dplyr::select(-sanitized_pubid)
## Add in individual year citations
pubs_expanded <- pubs_expanded %>%
dplyr::left_join(
purrr::map_df(
.x = fs::dir_ls("./temp_files", regexp = "cite_"),
.f = ~ readRDS(.x)
) %>%
dplyr::rename(year_cites = cites),
by = c("year", "pubid")
)
## Fill in NAs and get cumulative citations
pubs_expanded <- pubs_expanded %>%
dplyr::mutate(year_cites = ifelse(is.na(year_cites), 0, year_cites)) %>%
dplyr::arrange(title, year) %>%
dplyr::group_by(title, pubid) %>%
dplyr::mutate(cume_cites = cumsum(year_cites)) %>%
dplyr::ungroup()
## Reshape into long ----
## Now we want to reshape it so it is paper-year-author so that each row
## can be thought of as a node in a bipartite network of paper-authors and
## we can slice the network by years
##
## NOTE: igraph expects first two columns to define an edge list so we
## rearrange columns at the end
pubs_long <- pubs_expanded %>%
dplyr::select(-all_authors) %>%
tidyr::gather(author_position, author, dplyr::starts_with("author_")) %>%
dplyr::filter(!is.na(author)) %>%
dplyr::arrange(year, title, author_position) %>%
dplyr::select(from = title, to = author, dplyr::everything()) %>%
dplyr::mutate(author_position = as.integer(substr(author_position, 8, 9)))
## Save ----
saveRDS(pubs_expanded, here::here("data", "pubs_wide.RDS"))
saveRDS(pubs_long, here::here("data", "pubs_long.RDS"))
## Visualizing authorship networks: code for this blog post:
## https://mathewkiang.com/2019/12/07/collaboration-network-from-2010-to-2019/
## Imports ----
library(igraph)
library(ggraph)
library(patchwork)
library(tidyverse)
library(fs)
library(here)
## Folders ----
dir_create(here("temp_files", "stills"))
## Data ----
pubs_long <- readRDS(here("data", "pubs_long.RDS"))
## Helper functions ----
return_graph_layout <-
function(pubs_long,
year_x = 2019,
layout_x = "fr",
seed_x = 12345) {
sub_graph <-
igraph::graph_from_data_frame(pubs_long %>%
filter(year == year_x),
directed = FALSE)
set.seed(seed_x)
sub_graph <-
create_layout(sub_graph, layout = layout_x, maxiter = 100)
return(sub_graph %>%
select(name, x, y))
}
add_graph_position <- function(pubs_long,
year_x = 2019,
layout_x = "fr",
seed_x = 12345) {
pos_df <- return_graph_layout(pubs_long, year_x, layout_x, seed_x)
pubs_long %>%
left_join(pos_df %>%
select(
from = name,
from_x = x,
from_y = y
)) %>%
left_join(pos_df %>%
select(
to = name,
to_x = x,
to_y = y
))
}
plot_network <- function(pubs_long, year_x = 2019) {
x_range <- range(c(pubs_long$to_x, pubs_long$from_x))
y_range <- range(c(pubs_long$to_y, pubs_long$from_y))
sub_df <- pubs_long %>%
add_graph_position() %>%
filter(year == year_x) %>%
ungroup()
ggplot() +
## Edges
geom_curve(
data = sub_df,
aes(
x = from_x,
y = from_y,
xend = to_x,
yend = to_y
),
curvature = .1,
alpha = .9
) +
## Papers
geom_point(
data = sub_df,
aes(x = from_x, y = from_y, size = cume_cites),
shape = 21,
color = "white",
fill = "#377eb8"
) +
## Collaborators
geom_point(
data = sub_df,
aes(x = to_x, y = to_y),
size = 2,
shape = 21,
color = "white",
fill = "#e41a1c"
) +
scale_x_continuous(NULL,
limits = c(floor(x_range[1]), ceiling(x_range[2]))) +
scale_y_continuous(NULL,
limits = c(floor(y_range[1]), ceiling(y_range[2]))) +
scale_size_continuous(
"Cumulative\ncitations",
range = c(2, 8),
limits = c(0, max(pubs_long$cume_cites) + 1),
breaks = c(0, 50, 100)
) +
theme_void() +
theme(legend.position = "bottom",
legend.title = element_text(size = 8)) +
labs(title = year_x)
}
plot_unique_authors <- function(pubs_long, year_x = 2019) {
auths <- pubs_long %>%
group_by(year) %>%
summarize(unique_auths = n_distinct(to)) %>%
ungroup()
ggplot() +
geom_line(
data = auths,
aes(x = year, y = unique_auths),
color = "black",
alpha = .2
) +
geom_point(
data = auths %>%
filter(year == year_x),
aes(x = year, y = unique_auths),
color = "white",
alpha = 1,
size = 2
) +
geom_line(
data = auths %>%
filter(year <= year_x),
aes(x = year, y = unique_auths),
color = "#e41a1c",
alpha = .8,
size = .8
) +
geom_point(
data = auths %>%
filter(year == year_x),
aes(x = year, y = unique_auths),
color = "#e41a1c",
alpha = 1,
size = 3
) +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2010, 2019),
breaks = c(2010, 2014, 2018)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
limits = c(min(auths$unique_auths) - 1,
max(auths$unique_auths + 1))) +
labs(title = "Unique collaborators") +
theme(plot.title = element_text(size = 9))
}
plot_unique_papers <- function(pubs_long, year_x = 2019) {
auths <- pubs_long %>%
group_by(year) %>%
summarize(unique_papers = n_distinct(from)) %>%
ungroup()
ggplot() +
geom_line(
data = auths,
aes(x = year, y = unique_papers),
color = "black",
alpha = .2
) +
geom_point(
data = auths %>%
filter(year == year_x),
aes(x = year, y = unique_papers),
color = "white",
alpha = 1,
size = 2
) +
geom_line(
data = auths %>%
filter(year <= year_x),
aes(x = year, y = unique_papers),
color = "#377eb8",
alpha = .8,
size = .8
) +
geom_point(
data = auths %>%
filter(year == year_x),
aes(x = year, y = unique_papers),
color = "#377eb8",
alpha = 1,
size = 3
) +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2010, 2019),
breaks = c(2010, 2014, 2018)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
limits = c(min(auths$unique_papers) - 1,
max(auths$unique_papers + 1))) +
labs(title = "Unique papers/projects") +
theme(plot.title = element_text(size = 9))
}
plot_yearly_cites <- function(pubs_long, year_x = 2019) {
cites_df <- pubs_long %>%
select(year, year_cites, paper = from) %>%
distinct() %>%
group_by(year) %>%
summarize(year_cites = sum(year_cites)) %>%
ungroup()
ggplot() +
geom_line(
data = cites_df,
aes(x = year, y = year_cites),
color = "black",
alpha = .2
) +
geom_point(
data = cites_df %>%
filter(year == year_x),
aes(x = year, y = year_cites),
color = "white",
alpha = 1,
size = 2
) +
geom_line(
data = cites_df %>%
filter(year <= year_x),
aes(x = year, y = year_cites),
color = "black",
alpha = .8,
size = .8
) +
geom_point(
data = cites_df %>%
filter(year == year_x),
aes(x = year, y = year_cites),
color = "black",
alpha = 1,
size = 3
) +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2010, 2019),
breaks = c(2010, 2014, 2018)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
limits = c(min(cites_df$year_cites) - 1,
max(cites_df$year_cites + 1))) +
labs(title = "Yearly citations") +
theme(plot.title = element_text(size = 9))
}
plot_cume_cites <- function(pubs_long, year_x = 2019) {
cites_df <- pubs_long %>%
select(year, cume_cites, paper = from) %>%
distinct() %>%
group_by(year) %>%
summarize(cume_cites = sum(cume_cites)) %>%
ungroup()
ggplot() +
geom_line(
data = cites_df,
aes(x = year, y = cume_cites),
color = "black",
alpha = .2
) +
geom_point(
data = cites_df %>%
filter(year == year_x),
aes(x = year, y = cume_cites),
color = "white",
alpha = 1,
size = 2
) +
geom_line(
data = cites_df %>%
filter(year <= year_x),
aes(x = year, y = cume_cites),
color = "black",
alpha = .8,
size = .8
) +
geom_point(
data = cites_df %>%
filter(year == year_x),
aes(x = year, y = cume_cites),
color = "black",
alpha = 1,
size = 3
) +
theme_classic() +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2010, 2019),
breaks = c(2010, 2014, 2018)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
limits = c(min(cites_df$cume_cites) - 1,
max(cites_df$cume_cites + 1))) +
labs(title = "Cumulative citations") +
theme(plot.title = element_text(size = 9))
}
plot_each_paper <- function(pubs_long, year_x = 2019) {
sub_df <- pubs_long %>%
filter(year <= year_x)
ggplot() +
geom_line(
data = pubs_long,
aes(x = year, y = cume_cites, group = from),
color = "black",
alpha = .2
) +
geom_point(
data = sub_df %>%
group_by(from) %>%
filter(year %in% range(year)),
aes(x = year, y = cume_cites, group = from),
alpha = .8
) +
geom_line(data = sub_df,
aes(x = year,
y = cume_cites,
group = from),
alpha = .8) +
scale_x_continuous(
NULL,
expand = c(0, .25),
limits = c(2010, 2019),
breaks = c(2010, 2014, 2018)
) +
scale_y_continuous(NULL,
expand = c(0, 0),
limits = c(
min(pubs_long$cume_cites) - 1,
max(pubs_long$cume_cites + 1)
)) +
labs(title = "Citations by paper") +
theme_classic() +
theme(plot.title = element_text(size = 9))
}
## Create stills ----
pubs_long <- pubs_long %>%
filter(to != "Mathew V Kiang") %>%
add_graph_position()
p_layout <- "
AAB
AAC
DEF
"
for (year_x in 2010:2019) {
p_network <- plot_network(pubs_long, year_x = year_x)
p_papers <- plot_unique_papers(pubs_long, year_x = year_x)
p_authors <- plot_unique_authors(pubs_long, year_x = year_x)
p_cites_by_paper <- plot_each_paper(pubs_long, year_x = year_x)
p_cume_cites <- plot_cume_cites(pubs_long, year_x = year_x)
p_yearly_cites <- plot_yearly_cites(pubs_long, year_x = year_x)
p_save <- p_network + p_papers + p_authors +
p_yearly_cites + p_cites_by_paper +
p_cume_cites + plot_layout(design = p_layout)
ggsave(
here("temp_files", "stills", sprintf("year_%s.jpg", year_x)),
p_save,
width = 5,
height = 5,
dpi = 300,
scale = 1.5
)
}
## Combine stills ----
## This assumes you have FFmpeg and ImageMagick installed already
system("convert -resize 50% -delay 100 -loop 1 ./temp_files/stills/*.jpg network_evo.gif")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment