mkiang/01_pull_data.R

## 01_pull_data.R
## Visualizing authorship networks: code for this blog post:
## https://mathewkiang.com/2019/12/07/collaboration-network-from-2010-to-2019/

## Imports ----
library(scholar) # remotes::install_github("jkeirstead/scholar")
library(tidyverse)
library(fs)
library(here)

## Constants ----
## Note that to avoid the API rate limiter, we use a sleep timer and draw
## from a Uniform ~ [MIN_TIME, MAX_TIME] in minutes. If you have a *lot* of
## papers, this will take an unreasonably long time. You can tempt fate by
## lowering these bounds, but you do so at your own risk. At the default
## settings, this downloads the author and citation data for each paper at
## ~17 minutes per paper (35 minutes total).
MIN_TIME <- 60 * 5
MAX_TIME <- 60 * 30
PROJ_WEIGHT <- .8
NODE_WEIGHT <- 1
NO_CACHE <- FALSE
my_id <- "eD9_J3wAAAAJ"

## Set up folders ----
fs::dir_create(here::here("plots"))
fs::dir_create(here::here("data"))
fs::dir_create(here::here("temp_files"))

## Helper functions ----
## pub_id can have special characters. Just remove them.
sanitize_text <- function(string) {
    gsub("[^[:alnum:]]", "", string)
}

## Wrapper for a sleep timer.
random_sleep <- function(min_time, max_time) {
    Sys.sleep(stats::runif(1, min_time, max_time))
}

## Extract a sanitized pubid from a filename
extract_sanitized_pubid <- function(file_name) {
    base_f <- basename(file_name)
    substr(base_f, 6, nchar(base_f) - 4)
}

## Wrapper for gsub() to fix author names
fix_author_name <- function(df, string1, string2) {
    df %>%
        dplyr::mutate(all_authors = gsub(string1, string2, all_authors))
}


## Get a list of publications ----
pubs_df <- scholar::get_publications(my_id)

## Loop through every pub and get (1) citation history and (2) full
## author list, skipping over pubs we have already pulled. Note that
## this takes a long time to avoid hitting the rate limit.
for (p in unique(pubs_df$pubid)) {
    cite_file <-
        here::here("temp_files", sprintf("cite_%s.RDS", sanitize_text(p)))
    auth_file <-
        here::here("temp_files", sprintf("auth_%s.RDS", sanitize_text(p)))

    if (!fs::file_exists(cite_file) | NO_CACHE) {
        cite_df <- scholar::get_article_cite_history(my_id, p)
        saveRDS(cite_df, cite_file)
        random_sleep(MIN_TIME, MAX_TIME)
    }

    if (!fs::file_exists(auth_file) | NO_CACHE) {
        auth_df <- scholar::get_complete_authors(my_id, p)
        saveRDS(auth_df, auth_file)
        random_sleep(MIN_TIME, MAX_TIME)
    }

    # print(sprintf("Completed: %s", p))
}

## Create non-publication dataframes for (1) projects and (2) authors ----
## Add in non-publication projects (e.g., software, data)
nonpub_projects <- dplyr::tribble(
    ~ title,
    ~ journal,
    ~ pubid,
    ~ year,
    ~ pub_type,
    "Narcan (R package)",
    NA,
    "narcan",
    2017,
    "software",
    "Beiwe Data Sample",
    NA,
    "beiwe_data_sample",
    2018,
    "data",
    "Meta-Beiwe (R Package)",
    NA,
    "metabeiwe",
    2018,
    "software"
)

## Each nonpub project needs an author list and matching pubid
nonpub_authors <- dplyr::tribble(
    ~ sanitized_pubid,
    ~ all_authors,
    "narcan",
    "Mathew V Kiang, Monica J Alexander",
    "beiwe_data_sample",
    "Mathew V Kiang, Jeanette Lorme, Jukka-Pekka Onnela",
    "metabeiwe",
    "Mathew V Kiang"
)

## Reshape publication dataframe ----
## We want to reshape the pubs dataframe so that every line is a paper-year
## so we can look at the number of citations per year.
pubs_expanded <- pubs_df %>%
    dplyr::mutate(
        journal = as.character(journal),
        title = as.character(title),
        pubid = as.character(pubid)
    ) %>%
    dplyr::select(title, journal, pubid, year) %>%
    dplyr::mutate(pub_type = "paper") %>%
    dplyr::bind_rows(nonpub_projects) %>%
    dplyr::group_by(title, journal, pubid, pub_type) %>%
    tidyr::expand(year = year:2019) %>%
    dplyr::ungroup()

## Add author data to dataframe ----
## Make a 2-col dataframe of (sanitized) pubid and author list
authors_df <- purrr::map_df(
    .x = fs::dir_ls("./temp_files", regexp = "auth_"),
    .f = ~  dplyr::tibble(sanitized_pubid = extract_sanitized_pubid(.x),
                          all_authors = readRDS(.x))
) %>%
    dplyr::bind_rows(nonpub_authors)

## Fix author names
authors_df <- authors_df %>%
    fix_author_name("é|\xe9", "e") %>%
    fix_author_name("W Moeller", "M Moeller") %>%
    fix_author_name("Jennifer Hayes", "Jennifer E Hayes") %>%
    fix_author_name("Jarvis Chen", "Jarvis T Chen") %>%
    fix_author_name("Jukka Pekka Onnela", "Jukka-Pekka Onnela") %>%
    fix_author_name("Monica Alexander", "Monica J Alexander") %>%
    fix_author_name("Lyen Huang", "Lyen C Huang") %>%
    fix_author_name("Lizbeth Edmondson", "Lizabeth Edmondson")

## Separate out into individual authors
authors_df <- authors_df %>%
    tidyr::separate(
        all_authors,
        into = sprintf("author_%02i", 1:20),
        remove = FALSE,
        sep = ", ",
        fill = "right"
    )

## Remove all the completely empty columns
authors_df <- authors_df[, colSums(!is.na(authors_df)) > 0]

## Add citation history to dataframe ----
## Join full dataframe with author dataframe
pubs_expanded <- pubs_expanded %>%
    dplyr::mutate(sanitized_pubid = sanitize_text(pubid)) %>%
    dplyr::left_join(authors_df, by = "sanitized_pubid") %>%
    dplyr::select(-sanitized_pubid)

## Add in individual year citations
pubs_expanded <- pubs_expanded %>%
    dplyr::left_join(
        purrr::map_df(
            .x = fs::dir_ls("./temp_files", regexp = "cite_"),
            .f = ~ readRDS(.x)
        ) %>%
            dplyr::rename(year_cites = cites),
        by = c("year", "pubid")
    )

## Fill in NAs and get cumulative citations
pubs_expanded <- pubs_expanded %>%
    dplyr::mutate(year_cites = ifelse(is.na(year_cites), 0, year_cites)) %>%
    dplyr::arrange(title, year) %>%
    dplyr::group_by(title, pubid) %>%
    dplyr::mutate(cume_cites = cumsum(year_cites)) %>%
    dplyr::ungroup()

## Reshape into long ----
## Now we want to reshape it so it is paper-year-author so that each row
## can be thought of as a node in a bipartite network of paper-authors and
## we can slice the network by years
##
## NOTE: igraph expects first two columns to define an edge list so we
## rearrange columns at the end
pubs_long <- pubs_expanded %>%
    dplyr::select(-all_authors) %>%
    tidyr::gather(author_position, author, dplyr::starts_with("author_")) %>%
    dplyr::filter(!is.na(author)) %>%
    dplyr::arrange(year, title, author_position) %>%
    dplyr::select(from = title, to = author, dplyr::everything()) %>%
    dplyr::mutate(author_position = as.integer(substr(author_position, 8, 9)))

## Save ----
saveRDS(pubs_expanded, here::here("data", "pubs_wide.RDS"))
saveRDS(pubs_long, here::here("data", "pubs_long.RDS"))

## 02_plot_data.R
## Visualizing authorship networks: code for this blog post:
## https://mathewkiang.com/2019/12/07/collaboration-network-from-2010-to-2019/


## Imports ----
library(igraph)
library(ggraph)
library(patchwork)
library(tidyverse)
library(fs)
library(here)

## Folders ----
dir_create(here("temp_files", "stills"))

## Data ----
pubs_long <- readRDS(here("data", "pubs_long.RDS"))

## Helper functions ----
return_graph_layout <-
    function(pubs_long,
             year_x = 2019,
             layout_x = "fr",
             seed_x = 12345) {
        sub_graph <-
            igraph::graph_from_data_frame(pubs_long %>%
                                              filter(year == year_x),
                                          directed = FALSE)
        set.seed(seed_x)
        sub_graph <-
            create_layout(sub_graph, layout = layout_x, maxiter = 100)

        return(sub_graph %>%
                   select(name, x, y))
    }

add_graph_position <- function(pubs_long,
                               year_x = 2019,
                               layout_x = "fr",
                               seed_x = 12345) {
    pos_df <- return_graph_layout(pubs_long, year_x, layout_x, seed_x)

    pubs_long %>%
        left_join(pos_df %>%
                      select(
                          from = name,
                          from_x = x,
                          from_y = y
                      )) %>%
        left_join(pos_df %>%
                      select(
                          to = name,
                          to_x = x,
                          to_y = y
                      ))
}

plot_network <- function(pubs_long, year_x = 2019) {
    x_range <- range(c(pubs_long$to_x, pubs_long$from_x))
    y_range <- range(c(pubs_long$to_y, pubs_long$from_y))

    sub_df <- pubs_long %>%
        add_graph_position() %>%
        filter(year == year_x) %>%
        ungroup()

    ggplot() +
        ## Edges
        geom_curve(
            data = sub_df,
            aes(
                x = from_x,
                y = from_y,
                xend = to_x,
                yend = to_y
            ),
            curvature = .1,
            alpha = .9
        ) +
        ## Papers
        geom_point(
            data = sub_df,
            aes(x = from_x, y = from_y, size = cume_cites),
            shape = 21,
            color = "white",
            fill = "#377eb8"
        ) +
        ## Collaborators
        geom_point(
            data = sub_df,
            aes(x = to_x, y = to_y),
            size = 2,
            shape = 21,
            color = "white",
            fill = "#e41a1c"
        ) +
        scale_x_continuous(NULL,
                           limits = c(floor(x_range[1]), ceiling(x_range[2]))) +
        scale_y_continuous(NULL,
                           limits = c(floor(y_range[1]), ceiling(y_range[2]))) +
        scale_size_continuous(
            "Cumulative\ncitations",
            range = c(2, 8),
            limits = c(0, max(pubs_long$cume_cites) + 1),
            breaks = c(0, 50, 100)
        ) +
        theme_void() +
        theme(legend.position = "bottom",
              legend.title = element_text(size = 8)) +
        labs(title = year_x)
}

plot_unique_authors <- function(pubs_long, year_x = 2019) {
    auths <- pubs_long %>%
        group_by(year) %>%
        summarize(unique_auths = n_distinct(to)) %>%
        ungroup()

    ggplot() +
        geom_line(
            data = auths,
            aes(x = year, y = unique_auths),
            color = "black",
            alpha = .2
        ) +
        geom_point(
            data = auths %>%
                filter(year == year_x),
            aes(x = year, y = unique_auths),
            color = "white",
            alpha = 1,
            size = 2
        ) +
        geom_line(
            data = auths %>%
                filter(year <= year_x),
            aes(x = year, y = unique_auths),
            color = "#e41a1c",
            alpha = .8,
            size = .8
        ) +
        geom_point(
            data = auths %>%
                filter(year == year_x),
            aes(x = year, y = unique_auths),
            color = "#e41a1c",
            alpha = 1,
            size = 3
        ) +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2010, 2019),
            breaks = c(2010, 2014, 2018)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           limits = c(min(auths$unique_auths) - 1,
                                      max(auths$unique_auths + 1))) +
        labs(title = "Unique collaborators") +
        theme(plot.title = element_text(size = 9))
}

plot_unique_papers <- function(pubs_long, year_x = 2019) {
    auths <- pubs_long %>%
        group_by(year) %>%
        summarize(unique_papers = n_distinct(from)) %>%
        ungroup()

    ggplot() +
        geom_line(
            data = auths,
            aes(x = year, y = unique_papers),
            color = "black",
            alpha = .2
        ) +
        geom_point(
            data = auths %>%
                filter(year == year_x),
            aes(x = year, y = unique_papers),
            color = "white",
            alpha = 1,
            size = 2
        ) +
        geom_line(
            data = auths %>%
                filter(year <= year_x),
            aes(x = year, y = unique_papers),
            color = "#377eb8",
            alpha = .8,
            size = .8
        ) +
        geom_point(
            data = auths %>%
                filter(year == year_x),
            aes(x = year, y = unique_papers),
            color = "#377eb8",
            alpha = 1,
            size = 3
        ) +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2010, 2019),
            breaks = c(2010, 2014, 2018)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           limits = c(min(auths$unique_papers) - 1,
                                      max(auths$unique_papers + 1))) +
        labs(title = "Unique papers/projects") +
        theme(plot.title = element_text(size = 9))
}

plot_yearly_cites <- function(pubs_long, year_x = 2019) {
    cites_df <- pubs_long %>%
        select(year, year_cites, paper = from) %>%
        distinct() %>%
        group_by(year) %>%
        summarize(year_cites = sum(year_cites)) %>%
        ungroup()

    ggplot() +
        geom_line(
            data = cites_df,
            aes(x = year, y = year_cites),
            color = "black",
            alpha = .2
        ) +
        geom_point(
            data = cites_df %>%
                filter(year == year_x),
            aes(x = year, y = year_cites),
            color = "white",
            alpha = 1,
            size = 2
        ) +
        geom_line(
            data = cites_df %>%
                filter(year <= year_x),
            aes(x = year, y = year_cites),
            color = "black",
            alpha = .8,
            size = .8
        ) +
        geom_point(
            data = cites_df %>%
                filter(year == year_x),
            aes(x = year, y = year_cites),
            color = "black",
            alpha = 1,
            size = 3
        ) +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2010, 2019),
            breaks = c(2010, 2014, 2018)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           limits = c(min(cites_df$year_cites) - 1,
                                      max(cites_df$year_cites + 1))) +
        labs(title = "Yearly citations") +
        theme(plot.title = element_text(size = 9))
}

plot_cume_cites <- function(pubs_long, year_x = 2019) {
    cites_df <- pubs_long %>%
        select(year, cume_cites, paper = from) %>%
        distinct() %>%
        group_by(year) %>%
        summarize(cume_cites = sum(cume_cites)) %>%
        ungroup()

    ggplot() +
        geom_line(
            data = cites_df,
            aes(x = year, y = cume_cites),
            color = "black",
            alpha = .2
        ) +
        geom_point(
            data = cites_df %>%
                filter(year == year_x),
            aes(x = year, y = cume_cites),
            color = "white",
            alpha = 1,
            size = 2
        ) +
        geom_line(
            data = cites_df %>%
                filter(year <= year_x),
            aes(x = year, y = cume_cites),
            color = "black",
            alpha = .8,
            size = .8
        ) +
        geom_point(
            data = cites_df %>%
                filter(year == year_x),
            aes(x = year, y = cume_cites),
            color = "black",
            alpha = 1,
            size = 3
        ) +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2010, 2019),
            breaks = c(2010, 2014, 2018)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           limits = c(min(cites_df$cume_cites) - 1,
                                      max(cites_df$cume_cites + 1))) +
        labs(title = "Cumulative citations") +
        theme(plot.title = element_text(size = 9))
}

plot_each_paper <- function(pubs_long, year_x = 2019) {
    sub_df <- pubs_long %>%
        filter(year <= year_x)

    ggplot() +
        geom_line(
            data = pubs_long,
            aes(x = year, y = cume_cites, group = from),
            color = "black",
            alpha = .2
        ) +
        geom_point(
            data = sub_df %>%
                group_by(from) %>%
                filter(year %in% range(year)),
            aes(x = year, y = cume_cites, group = from),
            alpha = .8
        ) +
        geom_line(data = sub_df,
                  aes(x = year,
                      y = cume_cites,
                      group = from),
                  alpha = .8) +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2010, 2019),
            breaks = c(2010, 2014, 2018)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           limits = c(
                               min(pubs_long$cume_cites) - 1,
                               max(pubs_long$cume_cites + 1)
                           )) +
        labs(title = "Citations by paper") +
        theme_classic() +
        theme(plot.title = element_text(size = 9))
}

## Create stills ----
pubs_long <- pubs_long %>%
    filter(to != "Mathew V Kiang") %>%
    add_graph_position()

p_layout <- "
AAB
AAC
DEF
"

for (year_x in 2010:2019) {
    p_network <- plot_network(pubs_long, year_x = year_x)
    p_papers <- plot_unique_papers(pubs_long, year_x = year_x)
    p_authors <- plot_unique_authors(pubs_long, year_x = year_x)
    p_cites_by_paper <- plot_each_paper(pubs_long, year_x = year_x)
    p_cume_cites <- plot_cume_cites(pubs_long, year_x = year_x)
    p_yearly_cites <- plot_yearly_cites(pubs_long, year_x = year_x)

    p_save <- p_network + p_papers + p_authors +
        p_yearly_cites + p_cites_by_paper +
        p_cume_cites + plot_layout(design = p_layout)

    ggsave(
        here("temp_files", "stills", sprintf("year_%s.jpg", year_x)),
        p_save,
        width = 5,
        height = 5,
        dpi = 300,
        scale = 1.5
    )
}

## Combine stills ----
## This assumes you have FFmpeg and ImageMagick installed already
system("convert -resize 50% -delay 100 -loop 1 ./temp_files/stills/*.jpg network_evo.gif")
	## Visualizing authorship networks: code for this blog post:
	## https://mathewkiang.com/2019/12/07/collaboration-network-from-2010-to-2019/

	## Imports ----
	library(scholar) # remotes::install_github("jkeirstead/scholar")
	library(tidyverse)
	library(fs)
	library(here)

	## Constants ----
	## Note that to avoid the API rate limiter, we use a sleep timer and draw
	## from a Uniform ~ [MIN_TIME, MAX_TIME] in minutes. If you have a lot of
	## papers, this will take an unreasonably long time. You can tempt fate by
	## lowering these bounds, but you do so at your own risk. At the default
	## settings, this downloads the author and citation data for each paper at
	## ~17 minutes per paper (35 minutes total).
	MIN_TIME <- 60 * 5
	MAX_TIME <- 60 * 30
	PROJ_WEIGHT <- .8
	NODE_WEIGHT <- 1
	NO_CACHE <- FALSE
	my_id <- "eD9_J3wAAAAJ"

	## Set up folders ----
	fs::dir_create(here::here("plots"))
	fs::dir_create(here::here("data"))
	fs::dir_create(here::here("temp_files"))

	## Helper functions ----
	## pub_id can have special characters. Just remove them.
	sanitize_text <- function(string) {
	gsub("[^[:alnum:]]", "", string)
	}

	## Wrapper for a sleep timer.
	random_sleep <- function(min_time, max_time) {
	Sys.sleep(stats::runif(1, min_time, max_time))
	}

	## Extract a sanitized pubid from a filename
	extract_sanitized_pubid <- function(file_name) {
	base_f <- basename(file_name)
	substr(base_f, 6, nchar(base_f) - 4)
	}

	## Wrapper for gsub() to fix author names
	fix_author_name <- function(df, string1, string2) {
	df %>%
	dplyr::mutate(all_authors = gsub(string1, string2, all_authors))
	}



	## Get a list of publications ----
	pubs_df <- scholar::get_publications(my_id)

	## Loop through every pub and get (1) citation history and (2) full
	## author list, skipping over pubs we have already pulled. Note that
	## this takes a long time to avoid hitting the rate limit.
	for (p in unique(pubs_df$pubid)) {
	cite_file <-
	here::here("temp_files", sprintf("cite_%s.RDS", sanitize_text(p)))
	auth_file <-
	here::here("temp_files", sprintf("auth_%s.RDS", sanitize_text(p)))

	if (!fs::file_exists(cite_file) \| NO_CACHE) {
	cite_df <- scholar::get_article_cite_history(my_id, p)
	saveRDS(cite_df, cite_file)
	random_sleep(MIN_TIME, MAX_TIME)
	}

	if (!fs::file_exists(auth_file) \| NO_CACHE) {
	auth_df <- scholar::get_complete_authors(my_id, p)
	saveRDS(auth_df, auth_file)
	random_sleep(MIN_TIME, MAX_TIME)
	}

	# print(sprintf("Completed: %s", p))
	}

	## Create non-publication dataframes for (1) projects and (2) authors ----
	## Add in non-publication projects (e.g., software, data)
	nonpub_projects <- dplyr::tribble(
	~ title,
	~ journal,
	~ pubid,
	~ year,
	~ pub_type,
	"Narcan (R package)",
	NA,
	"narcan",
	2017,
	"software",
	"Beiwe Data Sample",
	NA,
	"beiwe_data_sample",
	2018,
	"data",
	"Meta-Beiwe (R Package)",
	NA,
	"metabeiwe",
	2018,
	"software"
	)

	## Each nonpub project needs an author list and matching pubid
	nonpub_authors <- dplyr::tribble(
	~ sanitized_pubid,
	~ all_authors,
	"narcan",
	"Mathew V Kiang, Monica J Alexander",
	"beiwe_data_sample",
	"Mathew V Kiang, Jeanette Lorme, Jukka-Pekka Onnela",
	"metabeiwe",
	"Mathew V Kiang"
	)

	## Reshape publication dataframe ----
	## We want to reshape the pubs dataframe so that every line is a paper-year
	## so we can look at the number of citations per year.
	pubs_expanded <- pubs_df %>%
	dplyr::mutate(
	journal = as.character(journal),
	title = as.character(title),
	pubid = as.character(pubid)
	) %>%
	dplyr::select(title, journal, pubid, year) %>%
	dplyr::mutate(pub_type = "paper") %>%
	dplyr::bind_rows(nonpub_projects) %>%
	dplyr::group_by(title, journal, pubid, pub_type) %>%
	tidyr::expand(year = year:2019) %>%
	dplyr::ungroup()

	## Add author data to dataframe ----
	## Make a 2-col dataframe of (sanitized) pubid and author list
	authors_df <- purrr::map_df(
	.x = fs::dir_ls("./temp_files", regexp = "auth_"),
	.f = ~ dplyr::tibble(sanitized_pubid = extract_sanitized_pubid(.x),
	all_authors = readRDS(.x))
	) %>%
	dplyr::bind_rows(nonpub_authors)

	## Fix author names
	authors_df <- authors_df %>%
	fix_author_name("é\|\xe9", "e") %>%
	fix_author_name("W Moeller", "M Moeller") %>%
	fix_author_name("Jennifer Hayes", "Jennifer E Hayes") %>%
	fix_author_name("Jarvis Chen", "Jarvis T Chen") %>%
	fix_author_name("Jukka Pekka Onnela", "Jukka-Pekka Onnela") %>%
	fix_author_name("Monica Alexander", "Monica J Alexander") %>%
	fix_author_name("Lyen Huang", "Lyen C Huang") %>%
	fix_author_name("Lizbeth Edmondson", "Lizabeth Edmondson")

	## Separate out into individual authors
	authors_df <- authors_df %>%
	tidyr::separate(
	all_authors,
	into = sprintf("author_%02i", 1:20),
	remove = FALSE,
	sep = ", ",
	fill = "right"
	)

	## Remove all the completely empty columns
	authors_df <- authors_df[, colSums(!is.na(authors_df)) > 0]

	## Add citation history to dataframe ----
	## Join full dataframe with author dataframe
	pubs_expanded <- pubs_expanded %>%
	dplyr::mutate(sanitized_pubid = sanitize_text(pubid)) %>%
	dplyr::left_join(authors_df, by = "sanitized_pubid") %>%
	dplyr::select(-sanitized_pubid)

	## Add in individual year citations
	pubs_expanded <- pubs_expanded %>%
	dplyr::left_join(
	purrr::map_df(
	.x = fs::dir_ls("./temp_files", regexp = "cite_"),
	.f = ~ readRDS(.x)
	) %>%
	dplyr::rename(year_cites = cites),
	by = c("year", "pubid")
	)

	## Fill in NAs and get cumulative citations
	pubs_expanded <- pubs_expanded %>%
	dplyr::mutate(year_cites = ifelse(is.na(year_cites), 0, year_cites)) %>%
	dplyr::arrange(title, year) %>%
	dplyr::group_by(title, pubid) %>%
	dplyr::mutate(cume_cites = cumsum(year_cites)) %>%
	dplyr::ungroup()

	## Reshape into long ----
	## Now we want to reshape it so it is paper-year-author so that each row
	## can be thought of as a node in a bipartite network of paper-authors and
	## we can slice the network by years
	##
	## NOTE: igraph expects first two columns to define an edge list so we
	## rearrange columns at the end
	pubs_long <- pubs_expanded %>%
	dplyr::select(-all_authors) %>%
	tidyr::gather(author_position, author, dplyr::starts_with("author_")) %>%
	dplyr::filter(!is.na(author)) %>%
	dplyr::arrange(year, title, author_position) %>%
	dplyr::select(from = title, to = author, dplyr::everything()) %>%
	dplyr::mutate(author_position = as.integer(substr(author_position, 8, 9)))

	## Save ----
	saveRDS(pubs_expanded, here::here("data", "pubs_wide.RDS"))
	saveRDS(pubs_long, here::here("data", "pubs_long.RDS"))