## mkiang/collab_network.R

## collab_network.R
## Visualizing authorship networks: code for this blog post:
## https://mathewkiang.com/2018/06/17/my-collaboration-network/
library(scholar) # devtools::install_github("jkeirstead/scholar")
library(visNetwork)
library(tidyverse)

## Constants
## Bounds (in seconds) of the random pause drawn with runif() and handed to
## Sys.sleep() between Google Scholar requests.
MIN_TIME      <- 5 * 60
MAX_TIME      <- 30 * 60
## `value` assigned to paper / software / data nodes.
PROJ_WEIGHT   <- 0.8
## `value` assigned to co-author nodes.
NODE_WEIGHT   <- 1
## Google Scholar profile id passed to the scholar package calls.
my_scholar_id <- "eD9_J3wAAAAJ"

## If we've already pulled from Google Scholar, don't do it again. The sleep
## timer makes this an unreasonably long process.
if (!file.exists("./scholar_pulls.rda")) {
    ## Pull my own publication history, drop the Opioid Trends preprint so
    ## we're not double-counting, and coerce the text-like columns to
    ## character.
    mvk_df <- get_publications(my_scholar_id) %>%
        filter(pubid != "-yGd096yOn8C") %>%
        mutate_at(vars(one_of("title", "author", "journal",
                              "number", "cid", "pubid")),
                  as.character)

    ## Fix UTF-8 encoding -- once here and once later. The pattern matches
    ## either a literal e-acute or the single byte 0xE9 and replaces both
    ## with a plain "e".
    mvk_df <- mvk_df %>%
        mutate(author = gsub("é|\xe9", "e", author))

    ## The summary df does not have a full author list so we need that. Set a
    ## generous sleep timer to stay below rate limit. It seems like one
    ## pull every 15 to 30 minutes is fairly safe.
    ##
    ## Results are collected in a preallocated list (one slot per
    ## publication) instead of growing a vector with c() on every pass.
    pubid        <- mvk_df$pubid
    author_pulls <- vector("list", length(pubid))
    for (i in seq_along(pubid)) {
        print(pubid[i])
        author_pulls[[i]] <- getCompleteAuthors(my_scholar_id, pubid[i])

        ## Only pause when another request follows; sleeping after the
        ## final pull would just waste 5 to 30 minutes.
        if (i < length(pubid)) {
            Sys.sleep(runif(1, min = MIN_TIME, max = MAX_TIME))
        }
    }
    authors <- unlist(author_pulls)

    ## Save it so we don't have to pull again.
    save(authors, pubid, mvk_df, file = "./scholar_pulls.rda")
}

## Now munging data
load("./scholar_pulls.rda")

## Clean the full author strings before splitting them apart.
full_authors <- tibble(pubid, f_authors = authors) %>%
    mutate(
        ## Fix encoding
        f_authors = gsub("é|\xe9", "e", f_authors),
        ## Fix Rob's middle initial
        f_authors = gsub("W Moeller", "M Moeller", f_authors),
        ## Fix Jen's middle initial
        f_authors = gsub("Jennifer Hayes", "Jennifer E Hayes", f_authors)
    )

## Size the split to the longest author list actually present rather than a
## fixed 30 columns, so authors beyond the 30th are no longer dropped (they
## would otherwise appear as nodes but never get an edge). max(1L, ...) keeps
## at least one column when there are no rows.
n_author_cols <- max(
    1L,
    lengths(strsplit(full_authors$f_authors, ", ", fixed = TRUE))
)
author_col_names <- sprintf("author_%02i", seq_len(n_author_cols))

## Split out every author into their own column (author_01, author_02, ...);
## shorter author lists are padded with NA on the right.
full_authors <- full_authors %>%
    separate(f_authors, into = author_col_names,
             remove = FALSE, sep = ", ", fill = "right")

## Remove all the completely empty columns
full_authors <- full_authors[, colSums(!is.na(full_authors)) > 0]

## Merge back to original dataframe. The key is named explicitly so the join
## does not silently depend on whichever column names the two tables happen
## to share (and does not emit the "Joining, by = ..." message).
mvk_df <- mvk_df %>%
    left_join(full_authors, by = "pubid")

## Unique co-authors for the node list: split every full author string on
## ", ", de-duplicate the names, then drop myself.
split_names      <- strsplit(full_authors$f_authors, ", ", fixed = TRUE)
distinct_authors <- unique(unlist(split_names))
is_me            <- distinct_authors == "Mathew V Kiang"
distinct_authors <- distinct_authors[!is_me]

## Node list. Every node uses its own title as its id.
## Papers: one row per publication title.
paper_nodes <- select(mvk_df, title)
paper_nodes <- mutate(paper_nodes,
                      value = PROJ_WEIGHT,
                      group = "paper",
                      id    = title)

## Co-authors: one row per distinct name.
coauthor_nodes <- tibble(
    title = distinct_authors,
    value = NODE_WEIGHT,
    group = "coauthor",
    id    = title
)

## Stack them, then append the github side projects (two software packages
## and one data set) in a single call.
nodes <- bind_rows(paper_nodes, coauthor_nodes) %>%
    add_row(title = c("narcan", "metabeiwe", "beiwe_data_sample"),
            value = PROJ_WEIGHT,
            group = c("software", "software", "data"),
            id    = title)

## Edge list: reshape the merged dataframe from one column per author
## position into one row per (paper, author) pair.
author_wide <- select(mvk_df, title, starts_with("author_"))
author_long <- gather(author_wide,
                      key = "author", value = "from",
                      starts_with("author_"))

## Keep only real co-authors (no padding NAs, not myself), name the columns
## the way the edge list expects, then append the edges for the non-papers.
edges <- author_long %>%
    filter(!is.na(from) & from != "Mathew V Kiang") %>%
    select(to = title, from) %>%
    add_row(to   = c("narcan",
                     "beiwe_data_sample",
                     "beiwe_data_sample"),
            from = c("Monica J Alexander",
                     "Jukka-Pekka Onnela",
                     "Jeanette Lorme"))

## Build the widget step by step and write it to a standalone HTML file.
## One colour per node group, applied in this order.
group_colors <- c(coauthor = "#3288bd",
                  paper    = "#d53e4f",
                  data     = "#abdda4",
                  software = "#fdae61")

network <- visNetwork(nodes, edges)
network <- visLayout(network, randomSeed = 123456)
network <- visInteraction(network, zoomView = FALSE, dragView = FALSE)
for (grp in names(group_colors)) {
    network <- visGroups(network,
                         groupname = grp,
                         color = group_colors[[grp]])
}
network <- visNodes(network, scaling = list(min = 15, max = 25))
network <- visEdges(network, width = 3, color = "grey")
visSave(network, "./author_network.html")
	## Visualizing authorship networks: code for this blog post:
	## https://mathewkiang.com/2018/06/17/my-collaboration-network/
	library(scholar) # devtools::install_github("jkeirstead/scholar")
	library(visNetwork)
	library(tidyverse)

	## Constants
	## Bounds (in seconds) of the random pause drawn with runif() and handed to
	## Sys.sleep() between Google Scholar requests.
	MIN_TIME      <- 5 * 60
	MAX_TIME      <- 30 * 60
	## `value` assigned to paper / software / data nodes.
	PROJ_WEIGHT   <- 0.8
	## `value` assigned to co-author nodes.
	NODE_WEIGHT   <- 1
	## Google Scholar profile id passed to the scholar package calls.
	my_scholar_id <- "eD9_J3wAAAAJ"

	## If we've already pulled from Google Scholar, don't do it again. The sleep
	## timer makes this an unreasonably long process.
	if (!file.exists("./scholar_pulls.rda")) {
	    ## Pull my own publication history, drop the Opioid Trends preprint so
	    ## we're not double-counting, and coerce the text-like columns to
	    ## character.
	    mvk_df <- get_publications(my_scholar_id) %>%
	        filter(pubid != "-yGd096yOn8C") %>%
	        mutate_at(vars(one_of("title", "author", "journal",
	                              "number", "cid", "pubid")),
	                  as.character)

	    ## Fix UTF-8 encoding -- once here and once later. The alternation bar
	    ## must be a bare "|": "\|" is not a valid escape in an R string and
	    ## stops the file from parsing.
	    mvk_df <- mvk_df %>%
	        mutate(author = gsub("é|\xe9", "e", author))

	    ## The summary df does not have a full author list so we need that. Set a
	    ## generous sleep timer to stay below rate limit. It seems like one
	    ## pull every 15 to 30 minutes is fairly safe.
	    ##
	    ## Results are collected in a preallocated list (one slot per
	    ## publication) instead of growing a vector with c() on every pass.
	    pubid        <- mvk_df$pubid
	    author_pulls <- vector("list", length(pubid))
	    for (i in seq_along(pubid)) {
	        print(pubid[i])
	        author_pulls[[i]] <- getCompleteAuthors(my_scholar_id, pubid[i])

	        ## Only pause when another request follows; sleeping after the
	        ## final pull would just waste 5 to 30 minutes.
	        if (i < length(pubid)) {
	            Sys.sleep(runif(1, min = MIN_TIME, max = MAX_TIME))
	        }
	    }
	    authors <- unlist(author_pulls)

	    ## Save it so we don't have to pull again.
	    save(authors, pubid, mvk_df, file = "./scholar_pulls.rda")
	}

	## Now munging data
	load("./scholar_pulls.rda")

	## Clean the full author strings before splitting them apart.
	full_authors <- tibble(pubid, f_authors = authors) %>%
	    mutate(
	        ## Fix encoding. The alternation bar must be a bare "|": "\|" is
	        ## not a valid escape in an R string and is a parse error.
	        f_authors = gsub("é|\xe9", "e", f_authors),
	        ## Fix Rob's middle initial
	        f_authors = gsub("W Moeller", "M Moeller", f_authors),
	        ## Fix Jen's middle initial
	        f_authors = gsub("Jennifer Hayes", "Jennifer E Hayes", f_authors)
	    )

	## Size the split to the longest author list actually present rather than a
	## fixed 30 columns, so authors beyond the 30th are no longer dropped (they
	## would otherwise appear as nodes but never get an edge). max(1L, ...) keeps
	## at least one column when there are no rows.
	n_author_cols <- max(
	    1L,
	    lengths(strsplit(full_authors$f_authors, ", ", fixed = TRUE))
	)
	author_col_names <- sprintf("author_%02i", seq_len(n_author_cols))

	## Split out every author into their own column (author_01, author_02, ...);
	## shorter author lists are padded with NA on the right.
	full_authors <- full_authors %>%
	    separate(f_authors, into = author_col_names,
	             remove = FALSE, sep = ", ", fill = "right")

	## Remove all the completely empty columns
	full_authors <- full_authors[, colSums(!is.na(full_authors)) > 0]

	## Merge back to original dataframe. The key is named explicitly so the join
	## does not silently depend on whichever column names the two tables happen
	## to share (and does not emit the "Joining, by = ..." message).
	mvk_df <- mvk_df %>%
	    left_join(full_authors, by = "pubid")

	## Unique co-authors for the node list: split every full author string on
	## ", ", de-duplicate the names, then drop myself.
	split_names      <- strsplit(full_authors$f_authors, ", ", fixed = TRUE)
	distinct_authors <- unique(unlist(split_names))
	is_me            <- distinct_authors == "Mathew V Kiang"
	distinct_authors <- distinct_authors[!is_me]

	## Node list. Every node uses its own title as its id.
	## Papers: one row per publication title.
	paper_nodes <- select(mvk_df, title)
	paper_nodes <- mutate(paper_nodes,
	                      value = PROJ_WEIGHT,
	                      group = "paper",
	                      id    = title)

	## Co-authors: one row per distinct name.
	coauthor_nodes <- tibble(
	    title = distinct_authors,
	    value = NODE_WEIGHT,
	    group = "coauthor",
	    id    = title
	)

	## Stack them, then append the github side projects (two software packages
	## and one data set) in a single call.
	nodes <- bind_rows(paper_nodes, coauthor_nodes) %>%
	    add_row(title = c("narcan", "metabeiwe", "beiwe_data_sample"),
	            value = PROJ_WEIGHT,
	            group = c("software", "software", "data"),
	            id    = title)

	## Edge list: reshape the merged dataframe from one column per author
	## position into one row per (paper, author) pair.
	author_wide <- select(mvk_df, title, starts_with("author_"))
	author_long <- gather(author_wide,
	                      key = "author", value = "from",
	                      starts_with("author_"))

	## Keep only real co-authors (no padding NAs, not myself), name the columns
	## the way the edge list expects, then append the edges for the non-papers.
	edges <- author_long %>%
	    filter(!is.na(from) & from != "Mathew V Kiang") %>%
	    select(to = title, from) %>%
	    add_row(to   = c("narcan",
	                     "beiwe_data_sample",
	                     "beiwe_data_sample"),
	            from = c("Monica J Alexander",
	                     "Jukka-Pekka Onnela",
	                     "Jeanette Lorme"))

	## Build the widget step by step and write it to a standalone HTML file.
	## One colour per node group, applied in this order.
	group_colors <- c(coauthor = "#3288bd",
	                  paper    = "#d53e4f",
	                  data     = "#abdda4",
	                  software = "#fdae61")

	network <- visNetwork(nodes, edges)
	network <- visLayout(network, randomSeed = 123456)
	network <- visInteraction(network, zoomView = FALSE, dragView = FALSE)
	for (grp in names(group_colors)) {
	    network <- visGroups(network,
	                         groupname = grp,
	                         color = group_colors[[grp]])
	}
	network <- visNodes(network, scaling = list(min = 15, max = 25))
	network <- visEdges(network, width = 3, color = "grey")
	visSave(network, "./author_network.html")