Skip to content

Instantly share code, notes, and snippets.

@mkiang
Last active May 10, 2019 21:35
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mkiang/e12da9c9e5787b39bb6efa8e2098e5f4 to your computer and use it in GitHub Desktop.
Save mkiang/e12da9c9e5787b39bb6efa8e2098e5f4 to your computer and use it in GitHub Desktop.
Code to create your collaboration network in R
## Visualizing authorship networks: code for this blog post:
## https://mathewkiang.com/2018/06/17/my-collaboration-network/
library(scholar) # devtools::install_github("jkeirstead/scholar")
library(visNetwork)
library(tidyverse)
## Constants
## Lower / upper bound (in seconds) of the random pause taken between
## successive Google Scholar requests.
MIN_TIME <- 5 * 60
MAX_TIME <- 30 * 60
## `value` given to paper / software / data nodes and to co-author nodes,
## respectively, when the node list is built.
PROJ_WEIGHT <- 0.8
NODE_WEIGHT <- 1
## Google Scholar profile ID whose publications are pulled.
my_scholar_id <- "eD9_J3wAAAAJ"
## If we've already pulled from Google Scholar, don't do it again. The sleep
## timer makes this an unreasonably long process, so the results are cached
## in ./scholar_pulls.rda and only fetched when that file is missing.
if (!file.exists("./scholar_pulls.rda")) {
  ## Pull publication history of myself
  mvk_df <- get_publications(my_scholar_id)

  ## Drop the Opioid Trends preprint so we're not double-counting, and make
  ## sure the text columns are plain character vectors.
  mvk_df <- mvk_df %>%
    filter(pubid != "-yGd096yOn8C") %>%
    mutate_at(
      vars(one_of("title", "author", "journal", "number", "cid", "pubid")),
      as.character
    )

  ## Fix UTF-8 encoding -- once here and once later.
  mvk_df <- mvk_df %>%
    mutate(author = gsub("é|\xe9", "e", author))

  ## The summary df does not have a full author list so we need that. Set a
  ## generous sleep timer to stay below rate limit. It seems like one
  ## pull every 15 to 30 minutes is fairly safe.
  ##
  ## Results are collected in a preallocated list instead of growing a
  ## vector with c() on every iteration.
  ## NOTE(review): the author-list helper may be exported under a different
  ## name in other versions of the scholar package -- confirm against the
  ## installed version before running.
  pub_ids <- mvk_df$pubid
  n_pubs <- length(pub_ids)
  author_pulls <- vector("list", n_pubs)

  for (i in seq_len(n_pubs)) {
    print(pub_ids[i])
    author_pulls[i] <- list(getCompleteAuthors(my_scholar_id, pub_ids[i]))

    ## Only pause when another request follows; sleeping after the final
    ## pull just delays the script by up to MAX_TIME seconds.
    if (i < n_pubs) {
      Sys.sleep(runif(1, min = MIN_TIME, max = MAX_TIME))
    }
  }

  authors <- unlist(author_pulls)
  pubid <- pub_ids

  ## Save it so we don't have to pull again.
  save(authors, pubid, mvk_df, file = "./scholar_pulls.rda")
}
## Now munging data
## Restores `authors`, `pubid` and `mvk_df` from the cached pull.
load("./scholar_pulls.rda")

## Clean up the raw author strings (one comma-separated string per paper).
full_authors <- tibble(pubid, f_authors = authors) %>%
  ## Fix encoding
  mutate(f_authors = gsub("é|\xe9", "e", f_authors)) %>%
  ## Fix Rob's middle initial
  mutate(f_authors = gsub("W Moeller", "M Moeller", f_authors)) %>%
  ## Fix Jen's middle initial
  mutate(f_authors = gsub("Jennifer Hayes", "Jennifer E Hayes", f_authors))

## Size the author columns from the data instead of assuming at most 30
## authors per paper: with a fixed 30, separate() discards the 31st author
## onwards, so those co-authors would get a node but never an edge.
## The count is (number of ", " separators + 1) for the longest author list.
n_author_cols <- max(
  1L,
  stringr::str_count(full_authors$f_authors, stringr::fixed(", ")) + 1L,
  na.rm = TRUE
)

## Split out every author into their own column (author_01, author_02, ...),
## padding shorter author lists with NA on the right.
full_authors <- full_authors %>%
  separate(
    f_authors,
    into = sprintf("author_%02i", seq_len(n_author_cols)),
    remove = FALSE,
    sep = ", ",
    fill = "right"
  )

## Remove all the completely empty columns
full_authors <- full_authors[, colSums(!is.na(full_authors)) > 0]
## Merge back to original dataframe. The join key is named explicitly so the
## merge cannot silently pick up any other column the two tables happen to
## share, and so dplyr does not have to guess (and message about) the key.
mvk_df <- mvk_df %>%
  left_join(full_authors, by = "pubid")
## Unique co-author names for the node list: split each paper's author
## string on ", ", pool the pieces, de-duplicate, then drop myself.
author_pieces <- strsplit(full_authors$f_authors, ", ", fixed = TRUE)
distinct_authors <- unique(unlist(author_pieces))
distinct_authors <- distinct_authors[!distinct_authors %in% "Mathew V Kiang"]
## Node list: one row per paper, per co-author, and per side project.
## Every node uses its title as its id.

## Papers
paper_nodes <- mvk_df %>%
  transmute(
    title,
    value = PROJ_WEIGHT,
    group = "paper",
    id = title
  )

## Co-authors
coauthor_nodes <- tibble(
  title = distinct_authors,
  value = NODE_WEIGHT,
  group = "coauthor",
  id = title
)

## Some github side projects
side_project_nodes <- tibble(
  title = c("narcan", "metabeiwe", "beiwe_data_sample"),
  value = PROJ_WEIGHT,
  group = c("software", "software", "data"),
  id = title
)

nodes <- bind_rows(paper_nodes, coauthor_nodes, side_project_nodes)
## Edge list: stack the per-author columns of the merged dataframe into
## (paper, co-author) pairs, dropping empty slots and myself.
paper_edges <- mvk_df %>%
  select(to = title, starts_with("author_")) %>%
  gather(key = "author", value = "from", -to) %>%
  filter(!(is.na(from) | from == "Mathew V Kiang")) %>%
  select(to, from)

## Hand-entered edges for the non-paper projects
side_project_edges <- tibble(
  to = c("narcan", "beiwe_data_sample", "beiwe_data_sample"),
  from = c("Monica J Alexander", "Jukka-Pekka Onnela", "Jeanette Lorme")
)

edges <- bind_rows(paper_edges, side_project_edges)
## Colour used for each node group, applied in this order.
group_colors <- c(
  coauthor = "#3288bd",
  paper = "#d53e4f",
  data = "#abdda4",
  software = "#fdae61"
)

## Base widget with a fixed layout seed and zoom / drag disabled.
network <- visNetwork(nodes, edges) %>%
  visLayout(randomSeed = 123456) %>%
  visInteraction(zoomView = FALSE, dragView = FALSE)

for (group_name in names(group_colors)) {
  network <- visGroups(
    network,
    groupname = group_name,
    color = group_colors[[group_name]]
  )
}

## Node / edge styling, then write the widget to an HTML file.
network <- network %>%
  visNodes(scaling = list(min = 15, max = 25)) %>%
  visEdges(width = 3, color = "grey")

visSave(network, file = "./author_network.html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment