mkiang/01_pull_data.R

## 01_pull_data.R
## Visualizing authorship networks: code for this blog post:
##  https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots

## Imports ----
library(scholar) # remotes::install_github("jkeirstead/scholar")
library(tidyverse)
library(fs)
library(here)

## Constants ----
## Note that to avoid the API rate limiter, we use a sleep timer and draw
## from a Uniform ~ [MIN_TIME, MAX_TIME] in minutes. If you have a *lot* of
## papers, this will take an unreasonably long time. You can tempt fate by
## lowering these bounds, but you do so at your own risk. At the default
## settings, this downloads the author and citation data for each paper at
## ~15 minutes each per paper (~30 minutes total if you need to download
## the author list again).
MIN_TIME <- 60 * 5
MAX_TIME <- 60 * 35
NO_CACHE <- FALSE
my_id <- "eD9_J3wAAAAJ"

## Set up folders ----
fs::dir_create(here::here("plots"))
fs::dir_create(here::here("data"))
fs::dir_create(here::here("temp_files"))

## Helper functions ----
## pub_id can have special characters. Just remove them.
sanitize_text <- function(string) {
    gsub("[^[:alnum:]]", "", string)
}

## Wrapper for a sleep timer.
random_sleep <- function(min_time, max_time) {
    Sys.sleep(stats::runif(1, min_time, max_time))
}

## Extract a sanitized pubid from a filename
extract_sanitized_pubid <- function(file_name) {
    base_f <- basename(file_name)
    substr(base_f, 6, nchar(base_f) - 4)
}

## Wrapper for gsub() to fix author names
fix_author_name <- function(df, string1, string2) {
    df %>%
        dplyr::mutate(all_authors = gsub(string1, string2, all_authors))
}

## Get a list of publications ----
pubs_df <- scholar::get_publications(my_id)

## Loop through every pub and get (1) citation history and (2) full
## author list, skipping over pubs we have already pulled. Note that
## this takes a long time to avoid hitting the rate limit.
for (p in unique(pubs_df$pubid)) {
    print(sprintf("Starting (%s): %s", Sys.time(), p))

    cite_file <-
        here::here("temp_files", sprintf("cite_%s.RDS", sanitize_text(p)))
    auth_file <-
        here::here("temp_files", sprintf("auth_%s.RDS", sanitize_text(p)))

    if (!fs::file_exists(cite_file) | NO_CACHE) {
        cite_df <- scholar::get_article_cite_history(my_id, p)
        saveRDS(cite_df, cite_file)
        random_sleep(MIN_TIME, MAX_TIME)
    }

    if (!fs::file_exists(auth_file) | NO_CACHE) {
        auth_df <- scholar::get_complete_authors(my_id, p)
        saveRDS(auth_df, auth_file)
        random_sleep(MIN_TIME, MAX_TIME)
    }

    print(sprintf("Completed (%s): %s", Sys.time(), p))
}

## Reshape publication dataframe ----
## We want to reshape the pubs dataframe so that every line is a paper-year
## so we can look at the number of citations per year.
pubs_expanded <- pubs_df %>%
    dplyr::mutate(
        journal = as.character(journal),
        title = as.character(title),
        pubid = as.character(pubid)
    ) %>%
    dplyr::select(title, journal, pubid, year) %>%
    dplyr::mutate(pub_type = "paper") %>%
    dplyr::group_by(title, journal, pubid, pub_type) %>%
    tidyr::expand(year = year:2020) %>%
    dplyr::ungroup()

## Add author data to dataframe ----
## Make a 2-col dataframe of (sanitized) pubid and author list
authors_df <- purrr::map_df(
    .x = fs::dir_ls("./temp_files", regexp = "auth_"),
    .f = ~  dplyr::tibble(sanitized_pubid = extract_sanitized_pubid(.x),
                          all_authors = readRDS(.x))
)

## Fix author names
authors_df <- authors_df %>%
    fix_author_name("Mathew Kiang", "Mathew V Kiang") %>%
    fix_author_name("é|\xe9", "e") %>%
    fix_author_name("W Moeller", "M Moeller") %>%
    fix_author_name("Jennifer Hayes", "Jennifer E Hayes") %>%
    fix_author_name("Jarvis Chen", "Jarvis T Chen") %>%
    fix_author_name("Jukka Pekka Onnela", "Jukka-Pekka Onnela") %>%
    fix_author_name("Monica Alexander", "Monica J Alexander") %>%
    fix_author_name("Lyen Huang", "Lyen C Huang") %>%
    fix_author_name("Lizbeth Edmondson", "Lizabeth Edmondson") %>%
    fix_author_name("Caroline Buckee", "Caroline O Buckee") %>%
    fix_author_name("Kenth Eng\xf8-Monsen", "Kenth Engo-Monsen") %>%
    fix_author_name("Hern\xe1ndez", "Hernandez")

## Separate out into individual authors
authors_df <- authors_df %>%
    tidyr::separate(
        all_authors,
        into = sprintf("author_%02i", 1:30),
        remove = FALSE,
        sep = ", ",
        fill = "right"
    )

## Remove all the completely empty columns
authors_df <- authors_df[, colSums(!is.na(authors_df)) > 0]

## Add citation history to dataframe ----
## Join full dataframe with author dataframe
pubs_expanded <- pubs_expanded %>%
    dplyr::mutate(sanitized_pubid = sanitize_text(pubid)) %>%
    dplyr::left_join(authors_df, by = "sanitized_pubid") %>%
    dplyr::select(-sanitized_pubid)

## Add in individual year citations
pubs_expanded <- pubs_expanded %>%
    dplyr::left_join(
        purrr::map_df(
            .x = fs::dir_ls("./temp_files", regexp = "cite_"),
            .f = ~ readRDS(.x)
        ) %>%
            dplyr::rename(year_cites = cites),
        by = c("year", "pubid")
    )

## Fill in NAs and get cumulative citations
pubs_expanded <- pubs_expanded %>%
    dplyr::mutate(year_cites = ifelse(is.na(year_cites), 0, year_cites)) %>%
    dplyr::arrange(title, year) %>%
    dplyr::group_by(title, pubid) %>%
    dplyr::mutate(cume_cites = cumsum(year_cites)) %>%
    dplyr::ungroup() %>%
    left_join(pubs_df %>%
                  select(pubid, total_cites = cites))

## Reshape into long ----
## Now we want to reshape it so it is paper-year-author so that each row
## can be thought of as a node in a bipartite network of paper-authors and
## we can slice the network by years
##
## NOTE: igraph expects first two columns to define an edge list so we
## rearrange columns at the end
pubs_long <- pubs_expanded %>%
    dplyr::select(-all_authors) %>%
    tidyr::gather(author_position, author, dplyr::starts_with("author_")) %>%
    dplyr::filter(!is.na(author)) %>%
    dplyr::arrange(year, title, author_position) %>%
    dplyr::select(from = title, to = author, dplyr::everything()) %>%
    dplyr::mutate(author_position = as.integer(substr(author_position, 8, 9)))

## Save ----
saveRDS(pubs_expanded, here::here("data", "pubs_wide.RDS"))
saveRDS(pubs_long, here::here("data", "pubs_long.RDS"))

## 02_plotting_animations.R
## Visualizing authorship networks: code for this blog post:
##  https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots

## Imports ----
library(tidyverse)
library(fs)
library(here)

## Folders ----
dir_create(here("temp_files", "stills"))

## Data ----
pubs_long_orig <- readRDS(here("data", "pubs_long.RDS")) %>%
    filter(!(
        journal %in% c("APHA's",
                       "SocArXiv",
                       "European Population Conference",
                       "medRxiv",
                       "PAA"))
    )

## Create stills ----
pubs_long <- pubs_long_orig %>%
    left_join(
        pubs_long_orig %>%
            select(pubid, first_year = year) %>%
            distinct() %>%
            group_by(pubid) %>%
            filter(first_year == min(first_year)) %>%
            ungroup()
    ) %>%
    mutate(age = (year + 1) - first_year) %>%
    filter(to != "Mathew V Kiang") %>%
    add_graph_position(seed_x = 5, layout_x = "fr")

## Test the final network layout
plot_network(pubs_long, add_labels = FALSE, dim_old = TRUE)

p_layout <- "
AAB
AAC
DEF
"

## Plot for all years
for (year_x in 2010:2020) {
    p_network <- plot_network(pubs_long, year_x = year_x)
    # p_papers <- plot_unique_papers(pubs_long, year_x = year_x)
    p_papers <- plot_new_papers(pubs_long, year_x = year_x)
    # p_authors <- plot_unique_authors(pubs_long, year_x = year_x)
    p_authors <- plot_new_authors(pubs_long, year_x = year_x)
    # p_cites_by_paper <- plot_each_paper(pubs_long, year_x = year_x)
    # p_cites_by_paper <- plot_yearly_distro_cites(pubs_long, year_x = year_x)
    p_cites_by_paper <- plot_each_paper_age(pubs_long, year_x = year_x)
    p_cume_cites <- plot_cume_cites(pubs_long, year_x = year_x)
    p_yearly_cites <- plot_yearly_cites(pubs_long, year_x = year_x)

    p_save <- p_network + p_papers + p_authors +
        p_cites_by_paper + p_cume_cites + p_yearly_cites +
        plot_layout(design = p_layout)

    ggsave(
        here("temp_files", "stills", sprintf("year_%s.jpg", year_x)),
        p_save,
        width = 5,
        height = 5,
        dpi = 300,
        scale = 1.5
    )

    p_network <- plot_network(pubs_long, year_x = year_x, add_labels = TRUE)
    p_save <- p_network + p_papers + p_authors +
        p_cites_by_paper + p_cume_cites + p_yearly_cites +
        plot_layout(design = p_layout)
    ggsave(
        here("temp_files", "stills", "labeled", sprintf("labeled_year_%s.jpg", year_x)),
        p_save,
        width = 5,
        height = 5,
        dpi = 300,
        scale = 1.5
    )

}

## Combine stills ----
## This assumes you have FFmpeg and ImageMagick installed already
system("convert -resize 100% -delay 100 -loop 1 ./temp_files/stills/*.jpg network_evo.gif")
system("convert -resize 100% -delay 100 -loop 1 ./temp_files/stills/labeled/*.jpg network_evo_labeled.gif")
system("convert -resize 50% -delay 100 -loop 1 ./temp_files/stills/*.jpg network_evo_half_size.gif")
system("convert -resize 50% -delay 100 -loop 1 ./temp_files/stills/labeled/*.jpg network_evo_labeled_half_size.gif")

## 03_static_plots.R
## See: https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots

library(here)
source(here("code", "utils.R"))
source(here("code", "mk_nytimes.R"))

## Data ----
pubs_long_orig <- readRDS(here("data", "pubs_long.RDS")) %>%
    filter(!(
        journal %in% c(
            "APHA's",
            "SocArXiv",
            "European Population Conference",
            "medRxiv",
            "PAA"
        )
    ))

pubs_long <- pubs_long_orig %>%
    left_join(
        pubs_long_orig %>%
            select(pubid, first_year = year) %>%
            distinct() %>%
            group_by(pubid) %>%
            filter(first_year == min(first_year)) %>%
            ungroup()
    ) %>%
    mutate(age = (year + 1) - first_year) %>%
    filter(to != "Mathew V Kiang")

## Collaboration timeline
collabs <- pubs_long %>%
    group_by(to, year) %>%
    summarize(n_pubs = sum(year == first_year)) %>%
    left_join(
        pubs_long %>%
            group_by(to) %>%
            summarize(
                min_year = min(first_year),
                max_year = max(first_year),
                n_pubs = n_distinct(pubid)
            ) %>%
            arrange(min_year, desc(n_pubs)) %>%
            ungroup() %>%
            select(-n_pubs) %>%
            mutate(y_val = 1:n()),
        by = "to"
    ) %>%
    filter(n_pubs > 0)

collabs_timeline <- ggplot() +
    geom_point(
        data = collabs %>%
            group_by(to) %>%
            slice(1),
        aes(
            x = min_year,
            y = y_val,
            group = to,
            size = n_pubs
        ),
        alpha = .5,
        shape = 21,
        fill = "black"
    ) +
    geom_point(
        data = collabs %>%
            group_by(to) %>%
            slice(1),
        aes(
            x = max_year,
            y = y_val,
            group = to,
            size = n_pubs
        ),
        alpha = .5,
        shape = 21,
        fill = "black"
    ) +
    geom_segment(
        data = collabs %>%
            group_by(to) %>%
            slice(1),
        aes(
            x = max_year,
            xend = min_year,
            y = y_val,
            yend = y_val,
            group = to
        ),
        alpha = .5
    ) +
    geom_point(
        data = collabs %>%
            filter(year != min_year,
                   year != max_year),
        aes(
            x = year,
            y = y_val,
            group = to,
            size = n_pubs
        ),
        position = position_jitter(
            width = .1,
            height = 0,
            seed = 1
        ),
        shape = 21
    ) +
    scale_x_continuous(NULL,
                       breaks = seq(2010, 2020, 2),
                       expand = c(.025, 0)) +
    scale_y_continuous("Collaborator", expand = c(.025, 0)) +
    mk_nytimes(axis.text.y = element_blank(),
               panel.grid.major.y = element_blank()) +
    scale_size_continuous("Collaborations")
ggsave("./colab_timeline.jpg",
       collabs_timeline,
       width = 6,
       height = 4.5)

ggsave(
    "./colab_timeline_labeled.jpg",
    collabs_timeline +
        geom_label_repel(
            data = collabs %>%
                group_by(to) %>%
                filter(year == max(year)),
            aes(x = year, y = y_val, label = to)
        ),
    width = 20,
    height = 12
)

## Upset plot
intersections_df <- pubs_long %>%
    filter(year == 2020) %>%
    select(author = to, paper = pubid) %>%
    mutate(coauthor = 1) %>%
    spread(author, coauthor, fill = 0) %>%
    mutate_if(is.numeric, as.integer)

jpeg(
    width = 6 * 300,
    height = 4.5 * 300,
    filename = "./upset_plot.jpg",
    quality = 100,
    res = 300
)
upset(
    as.data.frame(intersections_df),
    nsets = 10,
    order.by = "freq",
    decreasing = TRUE,
    mainbar.y.label = "Intersections of collaborators",
    sets.x.label = "Collaborations",
    mb.ratio = c(.65, .35)
)
dev.off()

## Collaborator "efficiency"
eff_df <- pubs_long %>%
    filter(year == 2020) %>%
    group_by(to) %>%
    summarize(
        most_recent_collab = 2020 - max(first_year),
        n_papers = n_distinct(from),
        avg_cites = mean(total_cites),
        total_cites = sum(total_cites)
    )

collab_eff <- ggplot() +
    geom_point(
        data = eff_df %>%
            filter(n_papers > 1),
        aes(
            x = n_papers,
            y = total_cites,
            fill = avg_cites,
            size = most_recent_collab
        ),
        alpha = .75,
        shape = 21
    ) +
    geom_point(
        data = eff_df %>%
            filter(n_papers == 1),
        aes(x = n_papers,
            y = total_cites,
            size = most_recent_collab),
        fill = "black",
        alpha = .25,
        shape = 21
    ) +
    scale_fill_viridis_b(
        "Citations per paper (among those with >1 collaboration)",
        # trans = "log1p",
        option = "C",
        breaks = seq(0, 160, 40),
        guide = guide_colorbar(
            title.position = "top",
            barwidth = unit(10, "cm"),
            barheight = unit(.25, "cm")
        )
    ) +
    scale_size_continuous(
        "Years since last collaboration",
        range = c(6, .05),
        breaks = c(0, 3, 7, 10),
        guide = guide_legend(title.position = "top")
    ) +
    scale_x_continuous(
        "Number of papers together",
        expand = c(.025, 0),
        breaks = 1:12,
        labels = c(1, "", "", 4, "", "", 7, "", "", 10, "", "")
    ) +
    scale_y_continuous("Total citations",
                       expand = c(.025, 0)) +
    mk_nytimes(legend.position = "bottom")
ggsave("./collaborator_efficiency.jpg",
       collab_eff,
       width = 8, height = 5, scale = 1)

ggsave("./collaborator_efficiency_labeled.jpg",
       collab_eff +
           geom_text_repel(
               data = x %>%
                   filter(n_papers > 1),
               aes(x = n_papers,
                   y = total_cites,
                   label = to),
               force = 10,
               max.iter = 10000,
               segment.alpha = .6,
               size = 3
           ),
       width = 8, height = 5, scale = 1.5)


## mk_nytimes.R
library(ggplot2)

#' A theme I often use -- NYTimes variation
#'
#' @param ... parameters to pass to theme()
#'
#' @return None
#' @export
#' @import ggplot2

mk_nytimes <- function(...) {
    ## http://minimaxir.com/2015/02/ggplot-tutorial/
    ## paste0('https://timogrossenbacher.ch/2016/12/',
    ##        'beautiful-thematic-maps-with-ggplot2-only/')
    ## https://github.com/hrbrmstr/hrbrthemes/blob/master/R/theme-ipsum.r

    ## Colos — stick with the ggplot2() greys
    c_bg    <- "white"
    c_grid  <- "grey80"
    c_btext <- "grey5"
    c_mtext <- "grey30"

    # Begin construction of chart
    ggplot2::theme_bw(base_size = 12, base_family = "Arial Narrow") +

        # Region
        ggplot2::theme(
            panel.background = ggplot2::element_rect(fill = c_bg, color = c_bg),
            plot.background  = ggplot2::element_rect(fill = c_bg, color = c_bg),
            panel.border     = ggplot2::element_rect(color = c_bg)
        ) +

        # Grid
        ggplot2::theme(
            panel.grid.major = ggplot2::element_line(color = c_grid, size = .25),
            panel.grid.minor = ggplot2::element_blank(),
            axis.ticks       = ggplot2::element_blank()
        ) +

        # Legend
        ggplot2::theme(
            legend.position = c(0, 1),
            legend.justification = c(0, 1),
            legend.key           = ggplot2::element_rect(fill = NA, color = NA),
            legend.background    = ggplot2::element_rect(fill = "transparent", color = NA),
            legend.text          = ggplot2::element_text(color = c_mtext)
        ) +

        # Titles, labels, etc.
        ggplot2::theme(
            plot.title     = ggplot2::element_text(
                color = c_btext,
                vjust = 1.25,
                face = "bold",
                size = 18
            ),
            axis.text      = ggplot2::element_text(size = 10, color = c_mtext),
            axis.title.x   = ggplot2::element_text(
                size = 12,
                color = c_mtext,
                hjust = 1
            ),
            axis.title.y   = ggplot2::element_text(
                size = 12,
                color = c_mtext,
                hjust = 1
            )
        ) +
        # Facets
        ggplot2::theme(
            strip.background = ggplot2::element_rect(fill = c_bg, color = c_btext),
            strip.text = ggplot2::element_text(size = 10, color = c_btext)
        ) +

        # Plot margins
        ggplot2::theme(plot.margin = ggplot2::unit(c(0.35, 0.2, 0.3, 0.35), "cm")) +

        # Additionals
        ggplot2::theme(...)
}

## utils.R
library(igraph)
library(ggraph)
library(patchwork)
library(tidyverse)
library(ggrepel)
library(UpSetR)

## Helper functions ----
return_graph_layout <-
    function(pubs_long,
             year_x = 2020,
             layout_x = "fr",
             seed_x = 123,
             ...) {
        sub_graph <-
            igraph::graph_from_data_frame(pubs_long %>%
                                              filter(year == year_x),
                                          directed = FALSE)
        V(sub_graph)$type <- bipartite_mapping(sub_graph)$type

        set.seed(seed_x)
        sub_graph <- create_layout(sub_graph, layout = layout_x, niter = 5000)

        return(sub_graph %>%
                   select(name, x, y))
    }

add_graph_position <- function(pubs_long,
                               year_x = 2020,
                               layout_x = "fr",
                               seed_x = 123) {
    pos_df <- return_graph_layout(pubs_long, year_x, layout_x, seed_x = seed_x)

    pubs_long %>%
        left_join(pos_df %>%
                      select(
                          from = name,
                          from_x = x,
                          from_y = y
                      )) %>%
        left_join(pos_df %>%
                      select(
                          to = name,
                          to_x = x,
                          to_y = y
                      ))
}

plot_network <- function(pubs_long, year_x = 2020, add_labels = FALSE, dim_old = TRUE) {
    x_range <- range(c(pubs_long$to_x, pubs_long$from_x))
    y_range <- range(c(pubs_long$to_y, pubs_long$from_y))

    if (dim_old) {
        alpha_vals <- c(.05, 1)
    } else {
        alpha_vals <- c(1, 1)
    }

    sub_df <- pubs_long %>%
        filter(year == year_x) %>%
        ungroup()

    p1 <- ggplot() +
        ## Edges
        geom_curve(
            data = sub_df,
            aes(
                x = from_x,
                y = from_y,
                xend = to_x,
                yend = to_y,
                alpha = (1/(age^5)) * .5
            ),
            curvature = 0
        ) +
        ## Papers
        geom_point(
            data = sub_df,
            aes(x = from_x, y = from_y, size = cume_cites, alpha = 1/(age^3)),
            shape = 21,
            color = "white",
            fill = "#377eb8"
        ) +
        ## Collaborators
        geom_point(
            data = sub_df,
            aes(x = to_x, y = to_y, alpha = 1/(age^3)),
            size = 1.75,
            shape = 21,
            color = "white",
            fill = "#e41a1c"
        )

    if (add_labels) {
        p1 <- p1 +
            ## Labels
            geom_label_repel(
                data = pubs_long %>%
                    group_by(pubid) %>%
                    filter(year == min(year)) %>%
                    ungroup() %>%
                    filter(year == year_x) %>%
                    select(-starts_with("to"), -author_position) %>%
                    distinct(),
                size = 1.5,
                alpha = .6,
                aes(x = from_x, y = from_y,
                    label = stringr::str_wrap(from, 40)),
                color = "black"
            )
    }

    p1 +
        scale_x_continuous(NULL,
                           limits = c(floor(x_range[1]), ceiling(x_range[2]))) +
        scale_y_continuous(NULL,
                           limits = c(floor(y_range[1]), ceiling(y_range[2]))) +
        scale_size_binned(
            "Paper citations",
            range = c(.5, 8),
            limits = c(0, max(pubs_long$cume_cites)),
            breaks = c(5, 25, 50, 100),
            guide = guide_bins(title.position = "top",
                               direction = "horizontal",
                               label.theme = element_text(size = 6))
        ) +
        scale_alpha_continuous(NULL,
                               range = alpha_vals,
                               guide = "none",
                               limits = c(0, 1)) +
        theme_void() +
        theme(legend.position = c(1, 0),
              legend.justification = c(1, 0),
              legend.title = element_text(size = 8)) +
        labs(title = year_x)
}

plot_new_authors <- function(pubs_long, year_x = 2020, dim_old = TRUE) {
    if (dim_old) {
        alpha_vals <- c(.1, 1)
    } else {
        alpha_vals <- c(1, 1)
    }

    auths <- tibble()

    for (y in sort(unique(pubs_long$year))) {
        all_authors_before <- pubs_long %>%
            filter(year < y) %>%
            pull(to) %>%
            unique()

        current_authors <- pubs_long %>%
            filter(year <= y) %>%
            pull(to) %>%
            unique()

        auths <- bind_rows(
            auths,
            tibble(
                year = y,
                new_authors = NROW(setdiff(current_authors, all_authors_before))
            )
        )

    }

    ggplot() +
        geom_col(
            data = auths %>%
                filter(year <= year_x) %>%
                mutate(age = year_x - year + 1),
            aes(x = year, y = new_authors, alpha = 1 / (age)),
            color = NA,
            fill = "#e41a1c",
            size = .8
        ) +
        geom_hline(yintercept =  seq(0, 20, 5),
                   color = "white") +
        geom_hline(yintercept = mean(auths$new_authors),
                   linetype = "dotted",
                   size = .5,
                   alpha = .5) +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2009.5, 2020.5),
            breaks = c(2010, 2015, 2020)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           breaks = seq(0, 20, 5),
                           limits = c(0,
                                      max(auths$new_authors + 1))) +
        scale_alpha_continuous(NULL,
                               range = alpha_vals,
                               guide = "none",
                               limits = c(0, 1)) +
        labs(title = "New collaborators") +
        theme(plot.title = element_text(size = 9))
}

plot_unique_authors <- function(pubs_long, year_x = 2020) {
    auths <- pubs_long %>%
        group_by(year) %>%
        summarize(unique_auths = n_distinct(to)) %>%
        ungroup()

    ggplot() +
        geom_line(
            data = auths,
            aes(x = year, y = unique_auths),
            color = "black",
            alpha = .2
        ) +
        geom_point(
            data = auths %>%
                filter(year == year_x),
            aes(x = year, y = unique_auths),
            color = "white",
            alpha = 1,
            size = 2
        ) +
        geom_line(
            data = auths %>%
                filter(year <= year_x),
            aes(x = year, y = unique_auths),
            color = "#e41a1c",
            alpha = .8,
            size = .8
        ) +
        geom_point(
            data = auths %>%
                filter(year == year_x),
            aes(x = year, y = unique_auths),
            color = "#e41a1c",
            alpha = 1,
            size = 3
        ) +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2009.5, 2020.5),
            breaks = c(2010, 2015, 2020)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           limits = c(min(auths$unique_auths) - 1,
                                      max(auths$unique_auths + 1))) +
        labs(title = "Unique collaborators") +
        theme(plot.title = element_text(size = 9))
}

plot_author_scatter <- function(pubs_long, year_x) {
    auths <- pubs_long %>%
        group_by(to, year) %>%
        summarize(n_papers = n_distinct(pubid),
                  avg_cites = mean(cume_cites),
                  total_cites = sum(cume_cites)) %>%
        group_by(to) %>%
        mutate(age = year - min(year) + 1)

    sub_df <- auths %>%
        filter(year <= year_x)

    ggplot(sub_df,
           aes(
               x = n_papers,
               y = avg_cites,
               size = total_cites,
               alpha = 1 / age ^ 2
           )) +
        geom_point() +
        theme_classic() +
        scale_x_continuous(
            "Number of collaborations",
            expand = c(0, 0),
            limits = c(min(x$n_papers) - .25,
                       max(x$n_papers) + .25)
        ) +
        scale_y_continuous("Citations per collaboration",
                           expand = c(.035, 0),
                           limits = c(0, max(x$avg_cites))) +
        scale_alpha_continuous(range = c(.05, 1),
                               guide = "none") +
        scale_size_area("Total citations",
                        limits = range(x$total_cites)) +
        labs(title = "Citations by collaborator") +
        theme(plot.title = element_text(size = 9),
              axis.title = element_text(size = 8),
              legend.position = c(1, 1),
              legend.justification = c(1, 1),
              legend.title = element_text(size = 8))
}

plot_new_papers <- function(pubs_long, year_x = 2020, dim_old = TRUE) {
    if (dim_old) {
        alpha_vals <- c(.1, 1)
    } else {
        alpha_vals <- c(1, 1)
    }

    auths <- pubs_long %>%
        group_by(pubid) %>%
        filter(year == min(year)) %>%
        group_by(year) %>%
        summarize(new_project = n_distinct(from))

    ggplot() +
        geom_col(
            data = auths %>%
                filter(year <= year_x) %>%
                mutate(age = year_x - year + 1),
            aes(
                x = year,
                y = new_project,
                alpha = 1 / (age)
            ),
            color = NA,
            fill = "#377eb8",
            size = .8
        ) +
        geom_hline(yintercept = seq(0, 8, 2),
                   color = "white") +
        geom_hline(
            yintercept = mean(auths$new_project),
            linetype = "dotted",
            size = .5,
            alpha = .5
        ) +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2009.5, 2020.5),
            breaks = c(2010, 2015, 2020)
        ) +
        scale_alpha_continuous(NULL,
                               range = alpha_vals,
                               guide = "none",
                               limits = c(0, 1)) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           breaks = seq(0, 6, 2),
                           limits = c(0,
                                      max(auths$new_project + .5))) +
        labs(title = "New manuscripts") +
        theme(plot.title = element_text(size = 9))
}

plot_unique_papers <- function(pubs_long, year_x = 2020) {
    auths <- pubs_long %>%
        group_by(year) %>%
        summarize(unique_papers = n_distinct(from)) %>%
        ungroup()

    ggplot() +
        geom_line(
            data = auths,
            aes(x = year, y = unique_papers),
            color = "black",
            alpha = .2
        ) +
        geom_point(
            data = auths %>%
                filter(year == year_x),
            aes(x = year, y = unique_papers),
            color = "white",
            alpha = 1,
            size = 2
        ) +
        geom_line(
            data = auths %>%
                filter(year <= year_x),
            aes(x = year, y = unique_papers),
            color = "#377eb8",
            alpha = .8,
            size = .8
        ) +
        geom_point(
            data = auths %>%
                filter(year == year_x),
            aes(x = year, y = unique_papers),
            color = "#377eb8",
            alpha = 1,
            size = 3
        ) +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2009.5, 2020.5),
            breaks = c(2010, 2015, 2020)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           limits = c(min(auths$unique_papers) - 1,
                                      max(auths$unique_papers + 1))) +
        labs(title = "Unique papers/projects") +
        theme(plot.title = element_text(size = 9))
}

plot_yearly_cites <- function(pubs_long, year_x = 2020) {
    cites_df <- pubs_long %>%
        select(year, year_cites, paper = from) %>%
        distinct() %>%
        group_by(year) %>%
        summarize(year_cites = sum(year_cites)) %>%
        ungroup()

    ggplot() +
        geom_col(
            data = cites_df %>%
                filter(year <= year_x),
            aes(x = year, y = year_cites),
            color = NA,
            alpha = .8,
            size = .8
        ) +
        geom_hline(yintercept = seq(0, 400, 100),
                   color = "white") +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2009.5, 2020.5),
            breaks = c(2010, 2015, 2020)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           breaks = seq(0, 400, 100),
                           limits = c(min(cites_df$year_cites) - 1,
                                      max(cites_df$year_cites + 1))) +
        labs(title = "New citations") +
        theme(plot.title = element_text(size = 9))
}

plot_cume_cites <- function(pubs_long, year_x = 2020) {
    cites_df <- pubs_long %>%
        select(year, cume_cites, paper = from) %>%
        distinct() %>%
        group_by(year) %>%
        summarize(cume_cites = sum(cume_cites)) %>%
        ungroup()

    ggplot() +
        geom_line(
            data = cites_df,
            aes(x = year, y = cume_cites),
            color = "black",
            alpha = .2
        ) +
        geom_point(
            data = cites_df %>%
                filter(year == year_x),
            aes(x = year, y = cume_cites),
            color = "white",
            alpha = 1,
            size = 2
        ) +
        geom_line(
            data = cites_df %>%
                filter(year <= year_x),
            aes(x = year, y = cume_cites),
            color = "black",
            alpha = .8,
            size = .8
        ) +
        geom_point(
            data = cites_df %>%
                filter(year == year_x),
            aes(x = year, y = cume_cites),
            color = "black",
            alpha = 1,
            size = 3
        ) +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2009.5, 2020.5),
            breaks = c(2010, 2015, 2020)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           breaks = seq(0, 1500, 300),
                           limits = c(min(cites_df$cume_cites) - 1.5,
                                      max(cites_df$cume_cites + 15))) +
        labs(title = "Cumulative citations") +
        theme(plot.title = element_text(size = 9))
}

plot_each_paper <- function(pubs_long, year_x = 2020) {
    sub_df <- pubs_long %>%
        filter(year <= year_x)

    ggplot() +
        geom_line(
            data = pubs_long,
            aes(x = year, y = cume_cites, group = from),
            color = "black",
            alpha = .025
        ) +
        geom_point(
            data = sub_df %>%
                group_by(from) %>%
                filter(year %in% range(year)),
            aes(x = year, y = cume_cites, group = from),
            alpha = .2,
            size = .5
        ) +
        geom_line(data = sub_df,
                  aes(x = year,
                      y = cume_cites,
                      group = from),
                  alpha = .35) +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = c(2009.5, 2020.5),
            breaks = c(2010, 2015, 2020)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           limits = c(
                               min(pubs_long$cume_cites) - 1.5,
                               max(pubs_long$cume_cites + 1.5)
                           )) +
        labs(title = "Cumulative citations by paper") +
        theme_classic() +
        theme(plot.title = element_text(size = 9),
              legend.position = "none")
}

plot_each_paper_age <- function(pubs_long, year_x = 2020) {
    pubs_long <- pubs_long %>%
        group_by(pubid) %>%
        mutate(years_since_pub = year - min(year)) %>%
        ungroup() %>%
        select(years_since_pub, pubid, from, cume_cites, age, year) %>%
        distinct()

    sub_df <- pubs_long %>%
        filter(year <= year_x)

    ggplot() +
        geom_line(
            data = pubs_long,
            aes(x = years_since_pub,
                y = cume_cites,
                group = from),
            color = "black",
            alpha = .01,
            size = .1
        ) +
        geom_point(
            data = sub_df %>%
                group_by(from) %>%
                filter(year %in% range(year)),
            aes(x = years_since_pub, y = cume_cites, group = from),
            alpha = .3,
            size = .3
        ) +
        geom_line(
            data = sub_df,
            aes(x = years_since_pub,
                y = cume_cites,
                group = from),
            alpha = .25
        ) +
        scale_x_continuous(
            NULL,
            expand = c(0, .25),
            limits = range(pubs_long$years_since_pub),
            breaks = seq(0, 12, 2)
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           # limits = c(0, NA)
                           limits = c(
                               min(pubs_long$cume_cites) - 1.5,
                               max(pubs_long$cume_cites + 1.5)
                           ),
                           breaks = seq(0, 300, 50)
        ) +
        labs(title = "Cumulative paper citations by age") +
        theme_classic() +
        theme(plot.title = element_text(size = 9))
}

plot_yearly_distro_cites <- function(pubs_long, year_x = 2020, dim_old = TRUE) {
    if (dim_old) {
        alpha_vals <- c(.1, 1)
    } else {
        alpha_vals <- c(1, 1)
    }

    sub_df <- pubs_long %>%
        select(year, year_cites, pubid, age) %>%
        distinct()

    ggplot() +
        geom_line(data = sub_df %>%
                      filter(year <= year_x) %>%
                      mutate(age = year_x - year + 1),
                  aes(x = year_cites, group = year, alpha = 1 / (age^5)),
                  color = "black",
                  size = .8,
                  stat="density",
        ) +
        theme_classic() +
        scale_x_continuous(
            NULL,
            expand = c(0, 0),
            trans = "log1p",
            breaks = c(0, 1, 5, 10, 25, 50, 100),
        ) +
        scale_y_continuous(NULL,
                           expand = c(0, 0),
                           limits = c(0, 1)) +
        scale_alpha_continuous(NULL,
                               range = alpha_vals,
                               guide = "none") +
        labs(title = "Citations by paper") +
        theme(plot.title = element_text(size = 9),
              axis.text.y = element_blank())
}
	## Visualizing authorship networks: code for this blog post:
	## https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots

	## Imports ----
	library(scholar) # remotes::install_github("jkeirstead/scholar")
	library(tidyverse)
	library(fs)
	library(here)

	## Constants ----
	## Note that to avoid the API rate limiter, we use a sleep timer and draw
	## from a Uniform ~ [MIN_TIME, MAX_TIME] in minutes. If you have a lot of
	## papers, this will take an unreasonably long time. You can tempt fate by
	## lowering these bounds, but you do so at your own risk. At the default
	## settings, this downloads the author and citation data for each paper at
	## ~15 minutes each per paper (~30 minutes total if you need to download
	## the author list again).
	MIN_TIME <- 60 * 5
	MAX_TIME <- 60 * 35
	NO_CACHE <- FALSE
	my_id <- "eD9_J3wAAAAJ"

	## Set up folders ----
	fs::dir_create(here::here("plots"))
	fs::dir_create(here::here("data"))
	fs::dir_create(here::here("temp_files"))

	## Helper functions ----
	## pub_id can have special characters. Just remove them.
	sanitize_text <- function(string) {
	gsub("[^[:alnum:]]", "", string)
	}

	## Wrapper for a sleep timer.
	random_sleep <- function(min_time, max_time) {
	Sys.sleep(stats::runif(1, min_time, max_time))
	}

	## Extract a sanitized pubid from a filename
	extract_sanitized_pubid <- function(file_name) {
	base_f <- basename(file_name)
	substr(base_f, 6, nchar(base_f) - 4)
	}

	## Wrapper for gsub() to fix author names
	fix_author_name <- function(df, string1, string2) {
	df %>%
	dplyr::mutate(all_authors = gsub(string1, string2, all_authors))
	}

	## Get a list of publications ----
	pubs_df <- scholar::get_publications(my_id)

	## Loop through every pub and get (1) citation history and (2) full
	## author list, skipping over pubs we have already pulled. Note that
	## this takes a long time to avoid hitting the rate limit.
	for (p in unique(pubs_df$pubid)) {
	print(sprintf("Starting (%s): %s", Sys.time(), p))

	cite_file <-
	here::here("temp_files", sprintf("cite_%s.RDS", sanitize_text(p)))
	auth_file <-
	here::here("temp_files", sprintf("auth_%s.RDS", sanitize_text(p)))

	if (!fs::file_exists(cite_file) \| NO_CACHE) {
	cite_df <- scholar::get_article_cite_history(my_id, p)
	saveRDS(cite_df, cite_file)
	random_sleep(MIN_TIME, MAX_TIME)
	}

	if (!fs::file_exists(auth_file) \| NO_CACHE) {
	auth_df <- scholar::get_complete_authors(my_id, p)
	saveRDS(auth_df, auth_file)
	random_sleep(MIN_TIME, MAX_TIME)
	}

	print(sprintf("Completed (%s): %s", Sys.time(), p))
	}

	## Reshape publication dataframe ----
	## We want to reshape the pubs dataframe so that every line is a paper-year
	## so we can look at the number of citations per year.
	pubs_expanded <- pubs_df %>%
	dplyr::mutate(
	journal = as.character(journal),
	title = as.character(title),
	pubid = as.character(pubid)
	) %>%
	dplyr::select(title, journal, pubid, year) %>%
	dplyr::mutate(pub_type = "paper") %>%
	dplyr::group_by(title, journal, pubid, pub_type) %>%
	tidyr::expand(year = year:2020) %>%
	dplyr::ungroup()

	## Add author data to dataframe ----
	## Make a 2-col dataframe of (sanitized) pubid and author list
	authors_df <- purrr::map_df(
	.x = fs::dir_ls("./temp_files", regexp = "auth_"),
	.f = ~ dplyr::tibble(sanitized_pubid = extract_sanitized_pubid(.x),
	all_authors = readRDS(.x))
	)

	## Fix author names
	authors_df <- authors_df %>%
	fix_author_name("Mathew Kiang", "Mathew V Kiang") %>%
	fix_author_name("é\|\xe9", "e") %>%
	fix_author_name("W Moeller", "M Moeller") %>%
	fix_author_name("Jennifer Hayes", "Jennifer E Hayes") %>%
	fix_author_name("Jarvis Chen", "Jarvis T Chen") %>%
	fix_author_name("Jukka Pekka Onnela", "Jukka-Pekka Onnela") %>%
	fix_author_name("Monica Alexander", "Monica J Alexander") %>%
	fix_author_name("Lyen Huang", "Lyen C Huang") %>%
	fix_author_name("Lizbeth Edmondson", "Lizabeth Edmondson") %>%
	fix_author_name("Caroline Buckee", "Caroline O Buckee") %>%
	fix_author_name("Kenth Eng\xf8-Monsen", "Kenth Engo-Monsen") %>%
	fix_author_name("Hern\xe1ndez", "Hernandez")

	## Separate out into individual authors
	authors_df <- authors_df %>%
	tidyr::separate(
	all_authors,
	into = sprintf("author_%02i", 1:30),
	remove = FALSE,
	sep = ", ",
	fill = "right"
	)

	## Remove all the completely empty columns
	authors_df <- authors_df[, colSums(!is.na(authors_df)) > 0]

	## Add citation history to dataframe ----
	## Join full dataframe with author dataframe
	pubs_expanded <- pubs_expanded %>%
	dplyr::mutate(sanitized_pubid = sanitize_text(pubid)) %>%
	dplyr::left_join(authors_df, by = "sanitized_pubid") %>%
	dplyr::select(-sanitized_pubid)

	## Add in individual year citations
	pubs_expanded <- pubs_expanded %>%
	dplyr::left_join(
	purrr::map_df(
	.x = fs::dir_ls("./temp_files", regexp = "cite_"),
	.f = ~ readRDS(.x)
	) %>%
	dplyr::rename(year_cites = cites),
	by = c("year", "pubid")
	)

	## Fill in NAs and get cumulative citations
	pubs_expanded <- pubs_expanded %>%
	dplyr::mutate(year_cites = ifelse(is.na(year_cites), 0, year_cites)) %>%
	dplyr::arrange(title, year) %>%
	dplyr::group_by(title, pubid) %>%
	dplyr::mutate(cume_cites = cumsum(year_cites)) %>%
	dplyr::ungroup() %>%
	left_join(pubs_df %>%
	select(pubid, total_cites = cites))

	## Reshape into long ----
	## Now we want to reshape it so it is paper-year-author so that each row
	## can be thought of as a node in a bipartite network of paper-authors and
	## we can slice the network by years
	##
	## NOTE: igraph expects first two columns to define an edge list so we
	## rearrange columns at the end
	pubs_long <- pubs_expanded %>%
	dplyr::select(-all_authors) %>%
	tidyr::gather(author_position, author, dplyr::starts_with("author_")) %>%
	dplyr::filter(!is.na(author)) %>%
	dplyr::arrange(year, title, author_position) %>%
	dplyr::select(from = title, to = author, dplyr::everything()) %>%
	dplyr::mutate(author_position = as.integer(substr(author_position, 8, 9)))

	## Save ----
	saveRDS(pubs_expanded, here::here("data", "pubs_wide.RDS"))
	saveRDS(pubs_long, here::here("data", "pubs_long.RDS"))
	## See: https://mathewkiang.com/2020/12/10/my-collaboration-network-for-2010-to-2020-other-plots

	library(here)
	source(here("code", "utils.R"))
	source(here("code", "mk_nytimes.R"))

	## Data ----
	pubs_long_orig <- readRDS(here("data", "pubs_long.RDS")) %>%
	filter(!(
	journal %in% c(
	"APHA's",
	"SocArXiv",
	"European Population Conference",
	"medRxiv",
	"PAA"
	)
	))

	pubs_long <- pubs_long_orig %>%
	left_join(
	pubs_long_orig %>%
	select(pubid, first_year = year) %>%
	distinct() %>%
	group_by(pubid) %>%
	filter(first_year == min(first_year)) %>%
	ungroup()
	) %>%
	mutate(age = (year + 1) - first_year) %>%
	filter(to != "Mathew V Kiang")

	## Collaboration timeline
	collabs <- pubs_long %>%
	group_by(to, year) %>%
	summarize(n_pubs = sum(year == first_year)) %>%
	left_join(
	pubs_long %>%
	group_by(to) %>%
	summarize(
	min_year = min(first_year),
	max_year = max(first_year),
	n_pubs = n_distinct(pubid)
	) %>%
	arrange(min_year, desc(n_pubs)) %>%
	ungroup() %>%
	select(-n_pubs) %>%
	mutate(y_val = 1:n()),
	by = "to"
	) %>%
	filter(n_pubs > 0)

	collabs_timeline <- ggplot() +
	geom_point(
	data = collabs %>%
	group_by(to) %>%
	slice(1),
	aes(
	x = min_year,
	y = y_val,
	group = to,
	size = n_pubs
	),
	alpha = .5,
	shape = 21,
	fill = "black"
	) +
	geom_point(
	data = collabs %>%
	group_by(to) %>%
	slice(1),
	aes(
	x = max_year,
	y = y_val,
	group = to,
	size = n_pubs
	),
	alpha = .5,
	shape = 21,
	fill = "black"
	) +
	geom_segment(
	data = collabs %>%
	group_by(to) %>%
	slice(1),
	aes(
	x = max_year,
	xend = min_year,
	y = y_val,
	yend = y_val,
	group = to
	),
	alpha = .5
	) +
	geom_point(
	data = collabs %>%
	filter(year != min_year,
	year != max_year),
	aes(
	x = year,
	y = y_val,
	group = to,
	size = n_pubs
	),
	position = position_jitter(
	width = .1,
	height = 0,
	seed = 1
	),
	shape = 21
	) +
	scale_x_continuous(NULL,
	breaks = seq(2010, 2020, 2),
	expand = c(.025, 0)) +
	scale_y_continuous("Collaborator", expand = c(.025, 0)) +
	mk_nytimes(axis.text.y = element_blank(),
	panel.grid.major.y = element_blank()) +
	scale_size_continuous("Collaborations")
	ggsave("./colab_timeline.jpg",
	collabs_timeline,
	width = 6,
	height = 4.5)

	ggsave(
	"./colab_timeline_labeled.jpg",
	collabs_timeline +
	geom_label_repel(
	data = collabs %>%
	group_by(to) %>%
	filter(year == max(year)),
	aes(x = year, y = y_val, label = to)
	),
	width = 20,
	height = 12
	)

	## Upset plot
	intersections_df <- pubs_long %>%
	filter(year == 2020) %>%
	select(author = to, paper = pubid) %>%
	mutate(coauthor = 1) %>%
	spread(author, coauthor, fill = 0) %>%
	mutate_if(is.numeric, as.integer)

	jpeg(
	width = 6 * 300,
	height = 4.5 * 300,
	filename = "./upset_plot.jpg",
	quality = 100,
	res = 300
	)
	upset(
	as.data.frame(intersections_df),
	nsets = 10,
	order.by = "freq",
	decreasing = TRUE,
	mainbar.y.label = "Intersections of collaborators",
	sets.x.label = "Collaborations",
	mb.ratio = c(.65, .35)
	)
	dev.off()

	## Collaborator "efficiency"
	eff_df <- pubs_long %>%
	filter(year == 2020) %>%
	group_by(to) %>%
	summarize(
	most_recent_collab = 2020 - max(first_year),
	n_papers = n_distinct(from),
	avg_cites = mean(total_cites),
	total_cites = sum(total_cites)
	)

	collab_eff <- ggplot() +
	geom_point(
	data = eff_df %>%
	filter(n_papers > 1),
	aes(
	x = n_papers,
	y = total_cites,
	fill = avg_cites,
	size = most_recent_collab
	),
	alpha = .75,
	shape = 21
	) +
	geom_point(
	data = eff_df %>%
	filter(n_papers == 1),
	aes(x = n_papers,
	y = total_cites,
	size = most_recent_collab),
	fill = "black",
	alpha = .25,
	shape = 21
	) +
	scale_fill_viridis_b(
	"Citations per paper (among those with >1 collaboration)",
	# trans = "log1p",
	option = "C",
	breaks = seq(0, 160, 40),
	guide = guide_colorbar(
	title.position = "top",
	barwidth = unit(10, "cm"),
	barheight = unit(.25, "cm")
	)
	) +
	scale_size_continuous(
	"Years since last collaboration",
	range = c(6, .05),
	breaks = c(0, 3, 7, 10),
	guide = guide_legend(title.position = "top")
	) +
	scale_x_continuous(
	"Number of papers together",
	expand = c(.025, 0),
	breaks = 1:12,
	labels = c(1, "", "", 4, "", "", 7, "", "", 10, "", "")
	) +
	scale_y_continuous("Total citations",
	expand = c(.025, 0)) +
	mk_nytimes(legend.position = "bottom")
	ggsave("./collaborator_efficiency.jpg",
	collab_eff,
	width = 8, height = 5, scale = 1)

	ggsave("./collaborator_efficiency_labeled.jpg",
	collab_eff +
	geom_text_repel(
	data = x %>%
	filter(n_papers > 1),
	aes(x = n_papers,
	y = total_cites,
	label = to),
	force = 10,
	max.iter = 10000,
	segment.alpha = .6,
	size = 3
	),
	width = 8, height = 5, scale = 1.5)
	library(ggplot2)

	#' A theme I often use -- NYTimes variation
	#'
	#' @param ... parameters to pass to theme()
	#'
	#' @return None
	#' @export
	#' @import ggplot2

	mk_nytimes <- function(...) {
	## http://minimaxir.com/2015/02/ggplot-tutorial/
	## paste0('https://timogrossenbacher.ch/2016/12/',
	## 'beautiful-thematic-maps-with-ggplot2-only/')
	## https://github.com/hrbrmstr/hrbrthemes/blob/master/R/theme-ipsum.r

	## Colos — stick with the ggplot2() greys
	c_bg <- "white"
	c_grid <- "grey80"
	c_btext <- "grey5"
	c_mtext <- "grey30"

	# Begin construction of chart
	ggplot2::theme_bw(base_size = 12, base_family = "Arial Narrow") +

	# Region
	ggplot2::theme(
	panel.background = ggplot2::element_rect(fill = c_bg, color = c_bg),
	plot.background = ggplot2::element_rect(fill = c_bg, color = c_bg),
	panel.border = ggplot2::element_rect(color = c_bg)
	) +

	# Grid
	ggplot2::theme(
	panel.grid.major = ggplot2::element_line(color = c_grid, size = .25),
	panel.grid.minor = ggplot2::element_blank(),
	axis.ticks = ggplot2::element_blank()
	) +

	# Legend
	ggplot2::theme(
	legend.position = c(0, 1),
	legend.justification = c(0, 1),
	legend.key = ggplot2::element_rect(fill = NA, color = NA),
	legend.background = ggplot2::element_rect(fill = "transparent", color = NA),
	legend.text = ggplot2::element_text(color = c_mtext)
	) +

	# Titles, labels, etc.
	ggplot2::theme(
	plot.title = ggplot2::element_text(
	color = c_btext,
	vjust = 1.25,
	face = "bold",
	size = 18
	),
	axis.text = ggplot2::element_text(size = 10, color = c_mtext),
	axis.title.x = ggplot2::element_text(
	size = 12,
	color = c_mtext,
	hjust = 1
	),
	axis.title.y = ggplot2::element_text(
	size = 12,
	color = c_mtext,
	hjust = 1
	)
	) +
	# Facets
	ggplot2::theme(
	strip.background = ggplot2::element_rect(fill = c_bg, color = c_btext),
	strip.text = ggplot2::element_text(size = 10, color = c_btext)
	) +

	# Plot margins
	ggplot2::theme(plot.margin = ggplot2::unit(c(0.35, 0.2, 0.3, 0.35), "cm")) +

	# Additionals
	ggplot2::theme(...)
	}
	library(igraph)
	library(ggraph)
	library(patchwork)
	library(tidyverse)
	library(ggrepel)
	library(UpSetR)

	## Helper functions ----
	return_graph_layout <-
	function(pubs_long,
	year_x = 2020,
	layout_x = "fr",
	seed_x = 123,
	...) {
	sub_graph <-
	igraph::graph_from_data_frame(pubs_long %>%
	filter(year == year_x),
	directed = FALSE)
	V(sub_graph)$type <- bipartite_mapping(sub_graph)$type

	set.seed(seed_x)
	sub_graph <- create_layout(sub_graph, layout = layout_x, niter = 5000)

	return(sub_graph %>%
	select(name, x, y))
	}

	add_graph_position <- function(pubs_long,
	year_x = 2020,
	layout_x = "fr",
	seed_x = 123) {
	pos_df <- return_graph_layout(pubs_long, year_x, layout_x, seed_x = seed_x)

	pubs_long %>%
	left_join(pos_df %>%
	select(
	from = name,
	from_x = x,
	from_y = y
	)) %>%
	left_join(pos_df %>%
	select(
	to = name,
	to_x = x,
	to_y = y
	))
	}

	plot_network <- function(pubs_long, year_x = 2020, add_labels = FALSE, dim_old = TRUE) {
	x_range <- range(c(pubs_long$to_x, pubs_long$from_x))
	y_range <- range(c(pubs_long$to_y, pubs_long$from_y))

	if (dim_old) {
	alpha_vals <- c(.05, 1)
	} else {
	alpha_vals <- c(1, 1)
	}

	sub_df <- pubs_long %>%
	filter(year == year_x) %>%
	ungroup()

	p1 <- ggplot() +
	## Edges
	geom_curve(
	data = sub_df,
	aes(
	x = from_x,
	y = from_y,
	xend = to_x,
	yend = to_y,
	alpha = (1/(age^5)) * .5
	),
	curvature = 0
	) +
	## Papers
	geom_point(
	data = sub_df,
	aes(x = from_x, y = from_y, size = cume_cites, alpha = 1/(age^3)),
	shape = 21,
	color = "white",
	fill = "#377eb8"
	) +
	## Collaborators
	geom_point(
	data = sub_df,
	aes(x = to_x, y = to_y, alpha = 1/(age^3)),
	size = 1.75,
	shape = 21,
	color = "white",
	fill = "#e41a1c"
	)

	if (add_labels) {
	p1 <- p1 +
	## Labels
	geom_label_repel(
	data = pubs_long %>%
	group_by(pubid) %>%
	filter(year == min(year)) %>%
	ungroup() %>%
	filter(year == year_x) %>%
	select(-starts_with("to"), -author_position) %>%
	distinct(),
	size = 1.5,
	alpha = .6,
	aes(x = from_x, y = from_y,
	label = stringr::str_wrap(from, 40)),
	color = "black"
	)
	}

	p1 +
	scale_x_continuous(NULL,
	limits = c(floor(x_range[1]), ceiling(x_range[2]))) +
	scale_y_continuous(NULL,
	limits = c(floor(y_range[1]), ceiling(y_range[2]))) +
	scale_size_binned(
	"Paper citations",
	range = c(.5, 8),
	limits = c(0, max(pubs_long$cume_cites)),
	breaks = c(5, 25, 50, 100),
	guide = guide_bins(title.position = "top",
	direction = "horizontal",
	label.theme = element_text(size = 6))
	) +
	scale_alpha_continuous(NULL,
	range = alpha_vals,
	guide = "none",
	limits = c(0, 1)) +
	theme_void() +
	theme(legend.position = c(1, 0),
	legend.justification = c(1, 0),
	legend.title = element_text(size = 8)) +
	labs(title = year_x)
	}

	plot_new_authors <- function(pubs_long, year_x = 2020, dim_old = TRUE) {
	if (dim_old) {
	alpha_vals <- c(.1, 1)
	} else {
	alpha_vals <- c(1, 1)
	}

	auths <- tibble()

	for (y in sort(unique(pubs_long$year))) {
	all_authors_before <- pubs_long %>%
	filter(year < y) %>%
	pull(to) %>%
	unique()

	current_authors <- pubs_long %>%
	filter(year <= y) %>%
	pull(to) %>%
	unique()

	auths <- bind_rows(
	auths,
	tibble(
	year = y,
	new_authors = NROW(setdiff(current_authors, all_authors_before))
	)
	)

	}

	ggplot() +
	geom_col(
	data = auths %>%
	filter(year <= year_x) %>%
	mutate(age = year_x - year + 1),
	aes(x = year, y = new_authors, alpha = 1 / (age)),
	color = NA,
	fill = "#e41a1c",
	size = .8
	) +
	geom_hline(yintercept = seq(0, 20, 5),
	color = "white") +
	geom_hline(yintercept = mean(auths$new_authors),
	linetype = "dotted",
	size = .5,
	alpha = .5) +
	theme_classic() +
	scale_x_continuous(
	NULL,
	expand = c(0, .25),
	limits = c(2009.5, 2020.5),
	breaks = c(2010, 2015, 2020)
	) +
	scale_y_continuous(NULL,
	expand = c(0, 0),
	breaks = seq(0, 20, 5),
	limits = c(0,
	max(auths$new_authors + 1))) +
	scale_alpha_continuous(NULL,
	range = alpha_vals,
	guide = "none",
	limits = c(0, 1)) +
	labs(title = "New collaborators") +
	theme(plot.title = element_text(size = 9))
	}

	plot_unique_authors <- function(pubs_long, year_x = 2020) {
	auths <- pubs_long %>%
	group_by(year) %>%
	summarize(unique_auths = n_distinct(to)) %>%
	ungroup()

	ggplot() +
	geom_line(
	data = auths,
	aes(x = year, y = unique_auths),
	color = "black",
	alpha = .2
	) +
	geom_point(
	data = auths %>%
	filter(year == year_x),
	aes(x = year, y = unique_auths),
	color = "white",
	alpha = 1,
	size = 2
	) +
	geom_line(
	data = auths %>%
	filter(year <= year_x),
	aes(x = year, y = unique_auths),
	color = "#e41a1c",
	alpha = .8,
	size = .8
	) +
	geom_point(
	data = auths %>%
	filter(year == year_x),
	aes(x = year, y = unique_auths),
	color = "#e41a1c",
	alpha = 1,
	size = 3
	) +
	theme_classic() +
	scale_x_continuous(
	NULL,
	expand = c(0, .25),
	limits = c(2009.5, 2020.5),
	breaks = c(2010, 2015, 2020)
	) +
	scale_y_continuous(NULL,
	expand = c(0, 0),
	limits = c(min(auths$unique_auths) - 1,
	max(auths$unique_auths + 1))) +
	labs(title = "Unique collaborators") +
	theme(plot.title = element_text(size = 9))
	}

	plot_author_scatter <- function(pubs_long, year_x) {
	auths <- pubs_long %>%
	group_by(to, year) %>%
	summarize(n_papers = n_distinct(pubid),
	avg_cites = mean(cume_cites),
	total_cites = sum(cume_cites)) %>%
	group_by(to) %>%
	mutate(age = year - min(year) + 1)

	sub_df <- auths %>%
	filter(year <= year_x)

	ggplot(sub_df,
	aes(
	x = n_papers,
	y = avg_cites,
	size = total_cites,
	alpha = 1 / age ^ 2
	)) +
	geom_point() +
	theme_classic() +
	scale_x_continuous(
	"Number of collaborations",
	expand = c(0, 0),
	limits = c(min(x$n_papers) - .25,
	max(x$n_papers) + .25)
	) +
	scale_y_continuous("Citations per collaboration",
	expand = c(.035, 0),
	limits = c(0, max(x$avg_cites))) +
	scale_alpha_continuous(range = c(.05, 1),
	guide = "none") +
	scale_size_area("Total citations",
	limits = range(x$total_cites)) +
	labs(title = "Citations by collaborator") +
	theme(plot.title = element_text(size = 9),
	axis.title = element_text(size = 8),
	legend.position = c(1, 1),
	legend.justification = c(1, 1),
	legend.title = element_text(size = 8))
	}

	plot_new_papers <- function(pubs_long, year_x = 2020, dim_old = TRUE) {
	if (dim_old) {
	alpha_vals <- c(.1, 1)
	} else {
	alpha_vals <- c(1, 1)
	}

	auths <- pubs_long %>%
	group_by(pubid) %>%
	filter(year == min(year)) %>%
	group_by(year) %>%
	summarize(new_project = n_distinct(from))

	ggplot() +
	geom_col(
	data = auths %>%
	filter(year <= year_x) %>%
	mutate(age = year_x - year + 1),
	aes(
	x = year,
	y = new_project,
	alpha = 1 / (age)
	),
	color = NA,
	fill = "#377eb8",
	size = .8
	) +
	geom_hline(yintercept = seq(0, 8, 2),
	color = "white") +
	geom_hline(
	yintercept = mean(auths$new_project),
	linetype = "dotted",
	size = .5,
	alpha = .5
	) +
	theme_classic() +
	scale_x_continuous(
	NULL,
	expand = c(0, .25),
	limits = c(2009.5, 2020.5),
	breaks = c(2010, 2015, 2020)
	) +
	scale_alpha_continuous(NULL,
	range = alpha_vals,
	guide = "none",
	limits = c(0, 1)) +
	scale_y_continuous(NULL,
	expand = c(0, 0),
	breaks = seq(0, 6, 2),
	limits = c(0,
	max(auths$new_project + .5))) +
	labs(title = "New manuscripts") +
	theme(plot.title = element_text(size = 9))
	}

	plot_unique_papers <- function(pubs_long, year_x = 2020) {
	auths <- pubs_long %>%
	group_by(year) %>%
	summarize(unique_papers = n_distinct(from)) %>%
	ungroup()

	ggplot() +
	geom_line(
	data = auths,
	aes(x = year, y = unique_papers),
	color = "black",
	alpha = .2
	) +
	geom_point(
	data = auths %>%
	filter(year == year_x),
	aes(x = year, y = unique_papers),
	color = "white",
	alpha = 1,
	size = 2
	) +
	geom_line(
	data = auths %>%
	filter(year <= year_x),
	aes(x = year, y = unique_papers),
	color = "#377eb8",
	alpha = .8,
	size = .8
	) +
	geom_point(
	data = auths %>%
	filter(year == year_x),
	aes(x = year, y = unique_papers),
	color = "#377eb8",
	alpha = 1,
	size = 3
	) +
	theme_classic() +
	scale_x_continuous(
	NULL,
	expand = c(0, .25),
	limits = c(2009.5, 2020.5),
	breaks = c(2010, 2015, 2020)
	) +
	scale_y_continuous(NULL,
	expand = c(0, 0),
	limits = c(min(auths$unique_papers) - 1,
	max(auths$unique_papers + 1))) +
	labs(title = "Unique papers/projects") +
	theme(plot.title = element_text(size = 9))
	}

	plot_yearly_cites <- function(pubs_long, year_x = 2020) {
	cites_df <- pubs_long %>%
	select(year, year_cites, paper = from) %>%
	distinct() %>%
	group_by(year) %>%
	summarize(year_cites = sum(year_cites)) %>%
	ungroup()

	ggplot() +
	geom_col(
	data = cites_df %>%
	filter(year <= year_x),
	aes(x = year, y = year_cites),
	color = NA,
	alpha = .8,
	size = .8
	) +
	geom_hline(yintercept = seq(0, 400, 100),
	color = "white") +
	theme_classic() +
	scale_x_continuous(
	NULL,
	expand = c(0, .25),
	limits = c(2009.5, 2020.5),
	breaks = c(2010, 2015, 2020)
	) +
	scale_y_continuous(NULL,
	expand = c(0, 0),
	breaks = seq(0, 400, 100),
	limits = c(min(cites_df$year_cites) - 1,
	max(cites_df$year_cites + 1))) +
	labs(title = "New citations") +
	theme(plot.title = element_text(size = 9))
	}

	plot_cume_cites <- function(pubs_long, year_x = 2020) {
	cites_df <- pubs_long %>%
	select(year, cume_cites, paper = from) %>%
	distinct() %>%
	group_by(year) %>%
	summarize(cume_cites = sum(cume_cites)) %>%
	ungroup()

	ggplot() +
	geom_line(
	data = cites_df,
	aes(x = year, y = cume_cites),
	color = "black",
	alpha = .2
	) +
	geom_point(
	data = cites_df %>%
	filter(year == year_x),
	aes(x = year, y = cume_cites),
	color = "white",
	alpha = 1,
	size = 2
	) +
	geom_line(
	data = cites_df %>%
	filter(year <= year_x),
	aes(x = year, y = cume_cites),
	color = "black",
	alpha = .8,
	size = .8
	) +
	geom_point(
	data = cites_df %>%
	filter(year == year_x),
	aes(x = year, y = cume_cites),
	color = "black",
	alpha = 1,
	size = 3
	) +
	theme_classic() +
	scale_x_continuous(
	NULL,
	expand = c(0, .25),
	limits = c(2009.5, 2020.5),
	breaks = c(2010, 2015, 2020)
	) +
	scale_y_continuous(NULL,
	expand = c(0, 0),
	breaks = seq(0, 1500, 300),
	limits = c(min(cites_df$cume_cites) - 1.5,
	max(cites_df$cume_cites + 15))) +
	labs(title = "Cumulative citations") +
	theme(plot.title = element_text(size = 9))
	}

	plot_each_paper <- function(pubs_long, year_x = 2020) {
	sub_df <- pubs_long %>%
	filter(year <= year_x)

	ggplot() +
	geom_line(
	data = pubs_long,
	aes(x = year, y = cume_cites, group = from),
	color = "black",
	alpha = .025
	) +
	geom_point(
	data = sub_df %>%
	group_by(from) %>%
	filter(year %in% range(year)),
	aes(x = year, y = cume_cites, group = from),
	alpha = .2,
	size = .5
	) +
	geom_line(data = sub_df,
	aes(x = year,
	y = cume_cites,
	group = from),
	alpha = .35) +
	scale_x_continuous(
	NULL,
	expand = c(0, .25),
	limits = c(2009.5, 2020.5),
	breaks = c(2010, 2015, 2020)
	) +
	scale_y_continuous(NULL,
	expand = c(0, 0),
	limits = c(
	min(pubs_long$cume_cites) - 1.5,
	max(pubs_long$cume_cites + 1.5)
	)) +
	labs(title = "Cumulative citations by paper") +
	theme_classic() +
	theme(plot.title = element_text(size = 9),
	legend.position = "none")
	}

	plot_each_paper_age <- function(pubs_long, year_x = 2020) {
	pubs_long <- pubs_long %>%
	group_by(pubid) %>%
	mutate(years_since_pub = year - min(year)) %>%
	ungroup() %>%
	select(years_since_pub, pubid, from, cume_cites, age, year) %>%
	distinct()

	sub_df <- pubs_long %>%
	filter(year <= year_x)

	ggplot() +
	geom_line(
	data = pubs_long,
	aes(x = years_since_pub,
	y = cume_cites,
	group = from),
	color = "black",
	alpha = .01,
	size = .1
	) +
	geom_point(
	data = sub_df %>%
	group_by(from) %>%
	filter(year %in% range(year)),
	aes(x = years_since_pub, y = cume_cites, group = from),
	alpha = .3,
	size = .3
	) +
	geom_line(
	data = sub_df,
	aes(x = years_since_pub,
	y = cume_cites,
	group = from),
	alpha = .25
	) +
	scale_x_continuous(
	NULL,
	expand = c(0, .25),
	limits = range(pubs_long$years_since_pub),
	breaks = seq(0, 12, 2)
	) +
	scale_y_continuous(NULL,
	expand = c(0, 0),
	# limits = c(0, NA)
	limits = c(
	min(pubs_long$cume_cites) - 1.5,
	max(pubs_long$cume_cites + 1.5)
	),
	breaks = seq(0, 300, 50)
	) +
	labs(title = "Cumulative paper citations by age") +
	theme_classic() +
	theme(plot.title = element_text(size = 9))
	}

	plot_yearly_distro_cites <- function(pubs_long, year_x = 2020, dim_old = TRUE) {
	if (dim_old) {
	alpha_vals <- c(.1, 1)
	} else {
	alpha_vals <- c(1, 1)
	}

	sub_df <- pubs_long %>%
	select(year, year_cites, pubid, age) %>%
	distinct()

	ggplot() +
	geom_line(data = sub_df %>%
	filter(year <= year_x) %>%
	mutate(age = year_x - year + 1),
	aes(x = year_cites, group = year, alpha = 1 / (age^5)),
	color = "black",
	size = .8,
	stat="density",
	) +
	theme_classic() +
	scale_x_continuous(
	NULL,
	expand = c(0, 0),
	trans = "log1p",
	breaks = c(0, 1, 5, 10, 25, 50, 100),
	) +
	scale_y_continuous(NULL,
	expand = c(0, 0),
	limits = c(0, 1)) +
	scale_alpha_continuous(NULL,
	range = alpha_vals,
	guide = "none") +
	labs(title = "Citations by paper") +
	theme(plot.title = element_text(size = 9),
	axis.text.y = element_blank())
	}