## Code to create your collaboration network in R
## Visualizing authorship networks: code for this blog post:
## https://mathewkiang.com/2018/06/17/my-collaboration-network/
## Package dependencies.
##   scholar    -- Google Scholar pulls (get_publications() and the
##                 per-paper author lookup).
##   visNetwork -- builds and saves the interactive network.
##   tidyverse  -- dplyr / tidyr / tibble verbs and the %>% pipe.
library(scholar) # devtools::install_github("jkeirstead/scholar")
library(visNetwork)
library(tidyverse)
## Constants

## Lower and upper bound, in seconds, of the random pause taken between
## Google Scholar requests (5 and 30 minutes).
MIN_TIME <- 60 * 5
MAX_TIME <- 60 * 30

## Values used for the `value` column of the node list: PROJ_WEIGHT for
## papers, software, and data nodes; NODE_WEIGHT for co-author nodes.
PROJ_WEIGHT <- 0.8
NODE_WEIGHT <- 1

## Google Scholar profile ID whose publications are pulled.
my_scholar_id <- "eD9_J3wAAAAJ"
## If we've already pulled from Google Scholar, don't do it again. The sleep
## timer makes this an unreasonably long process, so the result is cached in
## ./scholar_pulls.rda and this whole section is skipped when that file exists.
if (!file.exists("./scholar_pulls.rda")) {
  ## Pull publication history of myself
  mvk_df <- get_publications(my_scholar_id)

  ## Drop the Opioid Trends preprint so we're not double-counting, and make
  ## sure the text columns are plain character vectors.
  mvk_df <- mvk_df %>%
    filter(pubid != "-yGd096yOn8C") %>%
    mutate_at(
      vars(one_of("title", "author", "journal", "number", "cid", "pubid")),
      as.character
    )

  ## Fix UTF-8 encoding -- once here and once later. Both the UTF-8 "é" and
  ## the single byte \xe9 are replaced with a plain "e".
  mvk_df <- mvk_df %>%
    mutate(author = gsub("é|\xe9", "e", author))

  ## The summary df does not have a full author list so we need that, one
  ## request per publication. A generous random pause (between MIN_TIME and
  ## MAX_TIME seconds) separates requests to stay below the rate limit.
  ## NOTE(review): newer scholar releases appear to export this lookup as
  ## get_complete_authors() -- confirm against the installed version.
  pubid <- mvk_df$pubid
  n_pubs <- length(pubid)
  authors <- character(n_pubs)
  for (i in seq_len(n_pubs)) {
    print(pubid[i])
    authors[i] <- getCompleteAuthors(my_scholar_id, pubid[i])

    ## Only wait when another request follows; pausing after the final
    ## pull would just delay the save below.
    if (i < n_pubs) {
      Sys.sleep(runif(1, min = MIN_TIME, max = MAX_TIME))
    }
  }

  ## Save it so we don't have to pull again.
  save(authors, pubid, mvk_df, file = "./scholar_pulls.rda")
}
## Now munging data
## Restores `authors`, `pubid`, and `mvk_df` from the cached pull.
load("./scholar_pulls.rda")

## One row per publication, with the full comma-separated author string
## cleaned up before it is split.
full_authors <- tibble(pubid, f_authors = authors) %>%
  ## Fix encoding
  mutate(f_authors = gsub("é|\xe9", "e", f_authors)) %>%
  ## Fix Rob's middle initial
  mutate(f_authors = gsub("W Moeller", "M Moeller", f_authors)) %>%
  ## Fix Jen's middle initial
  mutate(f_authors = gsub("Jennifer Hayes", "Jennifer E Hayes", f_authors))

## Split out every author into their own column. The number of author_NN
## columns comes from the longest author list in the data, so no author is
## dropped from a long byline (a fixed column count would truncate it and
## those authors would never get an edge).
n_author_cols <- max(
  1L,
  lengths(strsplit(full_authors$f_authors, ", ", fixed = TRUE))
)
full_authors <- full_authors %>%
  separate(f_authors,
           into = sprintf("author_%02i", seq_len(n_author_cols)),
           remove = FALSE, sep = ", ", fill = "right")

## Remove all the completely empty columns
full_authors <- full_authors[, colSums(!is.na(full_authors)) > 0]
## Merge back to original dataframe. Every publication row is kept and gains
## the f_authors / author_NN columns; the join key is spelled out so the
## match does not depend on whichever column names happen to be shared.
mvk_df <- mvk_df %>%
  left_join(full_authors, by = "pubid")
## Get a list of unique co-authors for the node list: split every author
## string on ", " and keep each name once.
distinct_authors <- unique(
  unlist(strsplit(full_authors$f_authors, ", ", fixed = TRUE))
)

## Drop myself. Missing names are dropped as well: comparing NA with `!=`
## yields NA, which would otherwise leave an NA entry that becomes a node,
## while the edge list already filters NA authors out.
distinct_authors <- distinct_authors[
  !is.na(distinct_authors) & distinct_authors != "Mathew V Kiang"
]
## Now let's make a node list. Every node has the same four columns:
##   title -- the paper title, co-author name, or project name
##   value -- PROJ_WEIGHT for papers/projects, NODE_WEIGHT for co-authors
##   group -- "paper", "coauthor", "software", or "data"
##   id    -- same as title; edges refer to nodes by this value

## One node per publication
paper_nodes <- mvk_df %>%
  select(title) %>%
  mutate(value = PROJ_WEIGHT,
         group = "paper",
         id = title)

## One node per distinct co-author
coauthor_nodes <- tibble(
  title = distinct_authors,
  value = NODE_WEIGHT,
  group = "coauthor",
  id = title
)

## Add some github side projects
project_nodes <- tibble(
  title = c("narcan", "metabeiwe", "beiwe_data_sample"),
  value = PROJ_WEIGHT,
  group = c("software", "software", "data"),
  id = title
)

## Stack in the same order as before: papers, co-authors, side projects.
nodes <- bind_rows(paper_nodes, coauthor_nodes, project_nodes)
## Now the edge list -- reshape our merged dataframe into an edgelist
## going from paper to co-author. Each author_NN column is stacked into a
## single `from` column; empty slots and my own name are dropped.
paper_edges <- mvk_df %>%
  select(title, starts_with("author_")) %>%
  gather(key = "author", value = "from", starts_with("author_")) %>%
  filter(!is.na(from), from != "Mathew V Kiang") %>%
  select(to = title, from)

## Add cases for the non-papers (side projects and their collaborators).
project_edges <- tibble(
  to = c("narcan", "beiwe_data_sample", "beiwe_data_sample"),
  from = c("Monica J Alexander", "Jukka-Pekka Onnela", "Jeanette Lorme")
)

## Paper edges first, then the manual ones, matching the original row order.
edges <- bind_rows(paper_edges, project_edges)
## Build the interactive network and write it to ./author_network.html.
visNetwork(nodes, edges) %>%
  ## Fixed seed so the layout is reproducible between runs
  visLayout(randomSeed = 123456) %>%
  ## Disable zooming and dragging of the view
  visInteraction(zoomView = FALSE, dragView = FALSE) %>%
  ## One colour per node group
  visGroups(groupname = "coauthor", color = "#3288bd") %>%
  visGroups(groupname = "paper", color = "#d53e4f") %>%
  visGroups(groupname = "data", color = "#abdda4") %>%
  visGroups(groupname = "software", color = "#fdae61") %>%
  ## Node `value` is scaled into this size range
  visNodes(scaling = list(min = 15, max = 25)) %>%
  visEdges(width = 3, color = "grey") %>%
  visSave("./author_network.html")
## Sign up for free to join this conversation on GitHub.
## Already have an account? Sign in to comment