mdlincoln/bm.R Secret

## bm.R
library(dplyr)
library(lubridate)

#### Load raw output from database queries ####
bm_raw_print_nodes <- read.csv("data-raw/bm_print_nodes.csv", stringsAsFactors = FALSE, na.strings = "")
bm_raw_print_edges <- read.csv("data-raw/bm_print_edges.csv", stringsAsFactors = FALSE, na.strings = "")
bm_node_nationalities <- read.csv("data-raw/bm_node_nationalities.csv", stringsAsFactors = FALSE, na.strings = "")
bm_actor_notes <- read.csv("data-raw/bm_actor_notes.csv", stringsAsFactors = FALSE, na.strings = "")
bm_oa <- read.csv("data-raw/bm_obj_attributes.csv", stringsAsFactors = FALSE, na.strings = "")
bm_thesaurus <- read.csv("data-raw/bm_thesaurus.csv", stringsAsFactors = FALSE, na.strings = "")
bm_types <- read.csv("data-raw/bm_types.csv", stringsAsFactors = FALSE, na.strings = "")
bm_concordance <- read.csv("data-raw/bm_concordance.csv", stringsAsFactors = FALSE, na.strings = "")
bm_images <- read.csv("data-raw/bm_images.csv", header = FALSE, col.names = c("image", "object"), stringsAsFactors = FALSE, na.strings = "")
bm_pids <- read.csv("data-raw/bm_pids.csv", header = FALSE, col.names = c("object", "pid"), stringsAsFactors = FALSE, na.strings = "", skip = 1)


save(bm_concordance, file = "data/bm_concordance.RData")
save(bm_thesaurus, file = "data/bm_thesaurus.RData")
save(bm_types, file = "data/bm_types.RData")
save(bm_images, file = "data/bm_images.RData")
save(bm_pids, file = "data/bm_pids.RData")
save(bm_actor_notes, file = "data/bm_actor_notes.RData")
bm_oa <- bm_oa %>%
  mutate(
    datestart = ymd(datestart),
    dateend = ymd(dateend),
    year_start = year(datestart),
    year_end = year(dateend)
  ) %>%
  left_join(bm_images, by = "object")
save(bm_oa, file = "data/bm_oa.RData")


#### SET ANALYSIS DEFAULTS ####
daterange <- 1490:1760
window_size <- 5

#### Filter print_nodes and print_edges based on presets ####

# Parse date fields into year integers (very litte information is lost, as most
# dates are already of the format 'XXXX-01-01). This makes it much easier to
# filter nodes and edges based on date
bm_print_nodes <- bm_raw_print_nodes %>%
  mutate(birth = year(ymd(birth)), death = year(ymd(death)), name = id) %>%
  # Because igraph essentially erases the 'id' node attribute when constructing
  # graphs, copy those values into a column called 'name'; for the actual name,
  # uses 'realname'
  select(id, name, realname, birth, death) %>%
  # There are a few nultiple birth dates and death dates - this removes them,
  # keeping id a unique column
  distinct(id)

bm_print_edges <- bm_raw_print_edges %>% mutate(datestart = year(ymd(datestart)), dateend = year(ymd(dateend)))

# Set any nodes with "English" nationality to "British"
bm_node_nationalities$nationality[bm_node_nationalities$nationality == "British"] <- "English"

# Create a new pseudo-nationality for "Low Countries" that incorporates both
# Flemish and Dutch actors, and add it to the nationality table
lowcountries <- bm_node_nationalities %>%
  filter(nationality %in% c("Flemish", "Dutch")) %>%
  mutate(nationality = "Low Countries")

bm_node_nationalities <- bm_node_nationalities %>% bind_rows(lowcountries)

# Filter by date range, discarding any works finished before the start of
# analysis range or completed before the end of the analysis range
start_year <- first(daterange)
end_year <- last(daterange)
bm_print_edges <- bm_print_edges %>% filter(datestart <= end_year & start_year <= dateend)

# Join the birth and death date for the source and target artists of each edge
bm_print_edges <- bm_print_edges %>%
  inner_join(bm_print_nodes %>% select(id, source_birth = birth, source_death = death), by = c("source" = "id")) %>%
  inner_join(bm_print_nodes %>% select(id, target_birth = birth, target_death = death), by = c("target" = "id"))

# For each edge, check to see if the artists' lifespans ever overlapped. If so,
# artists are said to have coexisted, and thus edges between them will be set to
# bidirectional. If not, edges will only be one-way: with the contemporary
# artist as source and the past artist as target.
coexist <- Vectorize(function(a1b, a1d, a2b, a2d) {
  ifelse(
    anyNA(c(a1b, a1d, a2b, a2d)),
    FALSE, # If any artist dates are NA, return FALSE
    any((a1b + 15):a1d %in% (a2b + 15):a2d) # IF not, then figure out if they overlap
  )
})

bm_print_edges <- bm_print_edges %>% mutate(coexist = coexist(source_birth, source_death, target_birth, target_death))


preferred_types <- c("Printed", "After", "Published")
bm_print_edges <- bm_print_edges %>% filter(source_role %in% preferred_types & target_role %in% preferred_types)

# Select final variables to pass to analysis
bm_print_edges <- bm_print_edges %>% select(target, source, datestart, dateend, coexist, object, source_role, target_role)

save(bm_print_edges, file = "data/bm_print_edges.RData")
save(bm_print_nodes, file = "data/bm_print_nodes.RData")
save(bm_node_nationalities, file = "data/bm_node_nationalites.RData")

#### Generate subgraphs for network analysis ####
library(igraph)

# Get the node and edge lists for this window, where all edges are within the
# window, and all artists were born before the end of the window.
get_subgraph <- function(edges, nodes, index, alive_only = FALSE) {

  # Calculate start and end years
  start_year <- index - window_size
  end_year <- index + window_size

  # Find nodes and edges that ought to be active w/in the specified window.
  # Nodes must be at least 15yo by the end of the window (a generous estimate
  # for the youngest contributions of a given artist), while edges must be
  # artworks whose creation timespan intersects with the window, and whose
  # participants belong to the sub_node list

  if(alive_only) {
    sub_nodes <- nodes %>% filter((birth + 15) <= end_year & death >= start_year)
  } else {
    sub_nodes <- nodes %>% filter((birth + 15) <= end_year)
  }

  sub_edges <- edges %>%
    filter(
      datestart <= end_year &
        dateend >= start_year &
        source %in% sub_nodes$id &
        target %in% sub_nodes$id)

  # Generate igraph object from the edges and nodes data frames
  sub_graph <- graph.data.frame(sub_edges, directed = FALSE, vertices = sub_nodes)
  simp_sub_graph <- simplify(sub_graph, remove.multiple = TRUE)

  # Return a list containing the igraph object as well as the edges and nodes
  # data frames
  sub_object <- list("index" = index, "graph" = simp_sub_graph, "edges" = sub_edges, "nodes" = sub_nodes)
  return(sub_object)
}


# Create an empty list to hold the calculated subgraphs
all_subgraphs <- function(edges, nodes, role_types = c("After", "Printed", "Published"), alive_only = FALSE) {
  spec_edges <- edges %>% filter(source_role %in% role_types & target_role %in% role_types)
  print("Generating subgraphs...")
  lapply(daterange, function(x) get_subgraph(edges = spec_edges, nodes = nodes, index = x, alive_only = alive_only))
}

#### NETWORK METRICS ####

# Statistics calculated on the entire graph at once
global_stats <- function(graph, edges, nodes, index) {

  sub_trends <- data.frame(year = c(index))
  sub_trends$num_artists[1] <- length(V(graph))
  sub_trends$num_edges[1] <- length(E(graph))
  sub_trends$transitivity_global[1] <- transitivity(graph, type="global")
  sub_trends$diameter[1] <- diameter(graph)
  sub_trends$density[1] <- graph.density(graph, loops=TRUE)
  sub_trends$avg_path_length[1] <- average.path.length(graph)

  return(sub_trends)
}

# Statistics calculated per the nationality of the artist
national_stats <- function(graph, edges, nodes, index) {

  # Count works by nationality (This can only be done from the original edge
  # list, as the igraph object is flattened, without individual object
  # attributes)
  natl_works <- edges %>%
    select(id = source, object) %>%
    inner_join(bm_node_nationalities, by = "id") %>%
    group_by(nationality, object) %>%
    count(nationality) %>%
    select(nationality, num_works = n)

  # List the nationalities present in the current graph
  present_nationalities <- (bm_node_nationalities %>% filter(id %in% nodes$id) %>% select(nationality) %>% distinct())$nationality

  ##### Functions for creating different subsets of the full graph based on nationality
  # Vector of artist ids from a given nationality
  national_artists <- function(nat) {
    nat_artists <- bm_node_nationalities$id[bm_node_nationalities$nationality == nat]
  }

  national_nodes <- function(nat) {
    V(graph)[name %in% national_artists(nat)]
  }

  # Vector of artist ids NOT from a given nationality
  non_national_artists <- function(nat) {
    non_nat_artists <- setdiff(bm_node_nationalities$id, national_artists(nat))
  }

  # Calculates "national" subgraph by only including nodes of the given
  # nationality
  strict_nationality <- function(nat) {
    nat_nodes <- national_nodes(nat)
    nat_graph <- induced.subgraph(graph, nat_nodes)
    return(nat_graph)
  }

  # Calculates "national" subgraph by including any edges that have an artist
  # of the given nationality as either a source or a target
  proximal_nationality <- function(nat) {
    nat_nodes <- national_nodes(nat)
    nat_edges <- E(graph)[to(nat_nodes)]
    nat_graph <- subgraph.edges(graph, eids = nat_edges)
    return(nat_graph)
  }

  # Create lists of strict and proximal subgraphs for the present nationalities
  strict_groups <- lapply(present_nationalities, function(x) strict_nationality(x))
  proximal_groups <- lapply(present_nationalities, function(x) proximal_nationality(x))

  ### Calculate strict and proximal national statistics
  natl_stats <- data_frame(nationality = present_nationalities)

  # Number of artists (nodes)
  natl_stats$artists_strict <- vapply(strict_groups, function(x) vcount(x), FUN.VALUE = numeric(1))
  natl_stats$artists_proximal <- vapply(proximal_groups, function(x) vcount(x), FUN.VALUE = numeric(1))

  # Number of edges
  natl_stats$edges_strict <- vapply(strict_groups, function(x) ecount(x), FUN.VALUE = numeric(1))
  natl_stats$edges_proximal <- vapply(proximal_groups, function(x) ecount(x), FUN.VALUE = numeric(1))

  # Graph degree centrality
  natl_stats$nat_deg_cent_strict <- vapply(strict_groups, function(x) centr_degree(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))
  natl_stats$nat_deg_cent_proximal <- vapply(proximal_groups, function(x) centr_degree(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))

  # Graph betweenness centrality
  natl_stats$nat_deg_bet_strict <- vapply(strict_groups, function(x) centr_betw(x, directed = FALSE, normalized=TRUE)$centralization, FUN.VALUE = numeric(1))
  natl_stats$nat_deg_bet_proximal <- vapply(proximal_groups, function(x) centr_betw(x, directed = FALSE, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))

  # Graph closeness centrality
  natl_stats$nat_deg_clos_strict <- vapply(strict_groups, function(x) centr_clo(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))
  natl_stats$nat_deg_clos_proximal <- vapply(proximal_groups, function(x) centr_clo(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))

  # Graph transitivity
  natl_stats$transitivity_strict <- vapply(strict_groups, function(x) transitivity(x, type = "global"), FUN.VALUE = numeric(1))
  natl_stats$transitivity_proximal <- vapply(proximal_groups, function(x) transitivity(x, type = "global"), FUN.VALUE = numeric(1))

  # Graph density
  natl_stats$density_strict <- vapply(strict_groups, function(x) graph.density(x, loops = FALSE), FUN.VALUE = numeric(1))
  natl_stats$density_proximal <- vapply(proximal_groups, function(x) graph.density(x, loops = FALSE), FUN.VALUE = numeric(1))

  # Graph diameter
  natl_stats$diameter_strict <- vapply(strict_groups, function(x) diameter(x), FUN.VALUE = numeric(1))
  natl_stats$diameter_proximal <- vapply(proximal_groups, function(x) diameter(x), FUN.VALUE = numeric(1))

  # Graph avg path length
  natl_stats$avg_path_strict <- vapply(strict_groups, function(x) average.path.length(x), FUN.VALUE = numeric(1))
  natl_stats$avg_path_proximal <- vapply(proximal_groups, function(x) average.path.length(x), FUN.VALUE = numeric(1))


  # External-Internal Index
  # Function for calculating national-level EII for each nationality present
  calc_nat_ei <- function(nat) {

    nat_artists <- national_artists(nat)
    non_nat_artists <- non_national_artists(nat)

    # Get all vertices which have the same attribute values as val
    internal <- edges %>% filter(source %in% nat_artists & target %in% nat_artists) %>% select(source, target) %>% distinct()
    # Get all vertices which have different attribute values than val
    external <- edges %>% filter(source %in% nat_artists & target %in% non_nat_artists) %>% select(source, target) %>% distinct()

    # Count internal vs. external connecitons
    i <- nrow(internal)
    e <- nrow(external)
    return((e - i)/(e + i))
  }

  natl_stats$eii <- vapply(present_nationalities, function(x) calc_nat_ei(x), FUN.VALUE = numeric(1))

  # Join all statistics by nationality and add the year
  natl_stats %>% inner_join(natl_works, by = "nationality") %>% mutate(year = index)
}

comb_ei <- function(graph, edges, nodes, index) {

  # Find all nationalities present within the graph, and create a data frame with all their combinations
  present_nationalities <- (bm_node_nationalities %>% filter(id %in% nodes$id) %>% select(nationality) %>% distinct())$nationality
  nat_combos <- expand.grid(present_nationalities, present_nationalities) %>% rename(nat1 = Var1, nat2 = Var2)

  # Subset the nationality list for faster searching
  sub_nationalities <- bm_node_nationalities %>% filter(id %in% nodes$id)

  calc_combo_ei <- function(nat1, nat2) {

    # Find all nodes belonging to nat1
    nat1_nodes <- sub_nationalities %>% filter(nationality == nat1)
    nat2_nodes <- sub_nationalities %>% filter(nationality == nat2)

    # Find all edges originating in nodes belonging to nat1
    sub_edges <- edges %>% filter(source %in% nat1_nodes$id)

    # Find ties from nat1 to nat1
    internal <- sub_edges %>% filter(target %in% nat1_nodes$id)

    # Find ties from nat1 to ALL external nodes
    external <- sub_edges %>% filter(!(target %in% nat1_nodes$id))

    # Find ties from nat1 to nat2
    spec_external <- sub_edges %>% filter(target %in% nat2_nodes$id)

    # Count internal vs. external connecitons
    return(data_frame(internal = nrow(internal), external = nrow(external), spec_external = nrow(spec_external)))
  }

  # Merge the dataframes of every nationality combo into one dataframe, and
  # attach it to the nat_combos frame, adding a column for the year
  combo_frame <- mapply(function(x, y) calc_combo_ei(x, y), nat_combos$nat1, nat_combos$nat2, SIMPLIFY = FALSE) %>% bind_rows()
  nat_combos %>% bind_cols(combo_frame) %>% mutate(year = index)
}

individual_stats <- function(graph, edges, nodes, index) {

  names = V(graph)$name
  realnames = V(graph)$realname

  sub_ind_trends <- data_frame(id = names, realname = realnames, year = index)

  # Calculate vectorized statistics
  sub_ind_trends$total_degree <- degree(graph, mode = "total", normalized = FALSE)
  sub_ind_trends$total_degree_norm <- degree(graph, mode = "total", normalized = TRUE)
  sub_ind_trends$cent_betweenness <- betweenness(graph, directed = FALSE, normalized = TRUE)
  sub_ind_trends$cent_closeness <- closeness(graph, mode = "total", normalized = TRUE)
  sub_ind_trends$coreness <- coreness(graph, mode = "in")
  sub_ind_trends$local_cc <- transitivity(graph, type = "local")
  #sub_ind_trends$eigen <- centralization.evcent(graph, normalized = TRUE, directed = TRUE)$vector

  # Create a smaller reference list so individual node lookup is faster
  sub_nationalities <- nodes %>% inner_join(bm_node_nationalities, by = "id")

  # Get the individial group-external/group-internal index for all nodes
  calc_ind_ei <- function(node_id) {

    # Determine the nationalities of the given artist
    individual_nat <- sub_nationalities$nationality[sub_nationalities$id == node_id]

    # Find all edges for which the artist is a source
    ego_edges <- edges %>% filter(source == node_id)

    # Find artists who share that nationality
    shared_nat <- sub_nationalities %>% filter(nationality %in% individual_nat)

    if(nrow(ego_edges) == 0)
      return(NA)

    # Find all edges internal to given artist's nationalities
    internal_edges <- ego_edges %>% filter(target %in% shared_nat$id) %>% select(source, target) %>% distinct()

    # Find all edges external to given artist's nationalities
    external_edges <- ego_edges %>% filter(!(target %in% shared_nat$id)) %>% select(source, target) %>% distinct()

    # Calculate the group-external/group-internal index for this individual node
    i <- nrow(internal_edges)
    e <- nrow(external_edges)
    return((e - i)/(e + i))
  }

  sub_ind_trends$eii <- vapply(names, FUN.VALUE = numeric(1), function(x) calc_ind_ei(x))

  return(sub_ind_trends)
}

## Edge Centralities ----

edge_attributes <- function(graph, edges, nodes, index) {

  E(graph)$bet_cent <- edge_betweenness(graph, directed = FALSE)
  edf <- igraph::as_data_frame(graph, what = "edges")
  edf <- edf %>%
    rename(source = from, target = to) %>%
    mutate(year = index)
  return(edf)
}


library(foreach)
library(doParallel)

# Adjust based on number of cores
registerDoParallel(4)

all_analyses <- function(subgraphs) {
  print("Global trends...")
  global_trends <- (foreach(i = subgraphs) %dopar% global_stats(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows()

  print("National trends...")
  nat_trends <- (foreach(i = subgraphs) %dopar% national_stats(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows()

  #print("Binational EI Index...")
  #nat_combos <- (foreach(i = subgraphs) %dopar% comb_ei(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows()

  print("Individual Trends...")
  ind_trends <- (foreach(i = subgraphs) %dopar% individual_stats(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows() %>%
    # Allow more easily searchable graphs by adding nationality information to the
    # individual trends (this is important for finding the top most central
    # artists of a given nationality, for example)
    left_join(bm_print_nodes %>% select(id, birth, death), by = "id") %>%
    left_join(bm_node_nationalities, by = "id")

  print("Edge trends...")
  edge_trends <- (foreach(i = subgraphs) %dopar% edge_attributes(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows() %>%
    left_join(bm_print_edges, by = c("source", "target")) %>%
    inner_join(bm_print_nodes %>% select(id, source_name = realname, source_birth = birth, source_death = death), by = c("source" = "id")) %>%
    inner_join(bm_print_nodes %>% select(id, target_name = realname, target_birth = birth, target_death = death), by = c("target" = "id")) %>%
    left_join(bm_node_nationalities %>% rename(source_nat = nationality), by = c("source" = "id")) %>%
    left_join(bm_node_nationalities %>% rename(target_nat = nationality), by = c("target" = "id"))

  return(list(
    global_trends = global_trends,
    nat_trends = nat_trends,
    #nat_combos = nat_combos,
    ind_trends = ind_trends,
    edge_trends = edge_trends
    ))
}

#### Run all types of network analyses ####
## VERY COMPUTATIONALLY INTENSIVE - CAN TAKE SEVERAL HOURS TO RUN EVERY TYPE ##
# bm_APrPu_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed", "Published"), alive_only = FALSE) %>% all_analyses()
# save(bm_APrPu_all, file = "data/bm_APrPu_all.RData")

bm_APrPu_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed", "Published"), alive_only = TRUE) %>% all_analyses()
save(bm_APrPu_alive, file = "data/bm_APrPu_alive.RData")

# bm_PrPu_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("Printed", "Published"), alive_only = FALSE) %>% all_analyses()
# save(bm_PrPu_all, file = "data/bm_PrPu_all.RData")

bm_PrPu_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("Printed", "Published"), alive_only = TRUE) %>% all_analyses()
save(bm_PrPu_alive, file = "data/bm_PrPu_alive.RData")

# bm_APr_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed"), alive_only = FALSE) %>% all_analyses()
# save(bm_APr_all, file = "data/bm_APr_all.RData")

bm_APr_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed"), alive_only = TRUE) %>% all_analyses()
save(bm_APr_alive, file = "data/bm_APr_alive.RData")

# bm_APu_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Published"), alive_only = FALSE) %>% all_analyses()
# save(bm_APu_all, file = "data/bm_APu_all.RData")

bm_APu_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Published"), alive_only = TRUE) %>% all_analyses()
save(bm_APu_alive, file = "data/bm_APu_alive.RData")

## centralization_plot.R
library(lincolnplots)
library(lincolndissdata)
library(dplyr)
library(tidyr)
library(ggplot2)

rkm_flem_stats <- rkm_APrPu_alive$nat_trends %>% filter(nationality == "Zuid-Nederlands") %>% mutate(nationality = "Flemish")
bm_flem_stats <- bm_APrPu_alive$nat_trends %>% filter(nationality == "Flemish")

all_stats <- bind_rows(
  "BM" = bm_flem_stats,
  "RKM" = rkm_flem_stats,
  .id = "dataset"
)

flem_cent_counts <- all_stats %>%
  select(year, dataset, centrality = nat_deg_cent_strict, nodes = artists_strict, edges = edges_strict) %>%
  gather(metric, value, centrality, nodes, edges) %>%
  ggplot(aes(x = year, y = value, color = dataset)) +
  facet_wrap(~ metric, ncol = 1, scales = "free_y") +
  geom_line(size = 1) +
  theme_bw() +
  scale_color_grey() +
  theme(legend.position = "bottom") +
  labs(x = NULL, y = NULL) +
  xlim(1500, 1750)

pdfplot(flem_cent_counts, "flem_cent_counts", width = 6, height = 7)
	library(dplyr)
	library(lubridate)

	#### Load raw output from database queries ####
	bm_raw_print_nodes <- read.csv("data-raw/bm_print_nodes.csv", stringsAsFactors = FALSE, na.strings = "")
	bm_raw_print_edges <- read.csv("data-raw/bm_print_edges.csv", stringsAsFactors = FALSE, na.strings = "")
	bm_node_nationalities <- read.csv("data-raw/bm_node_nationalities.csv", stringsAsFactors = FALSE, na.strings = "")
	bm_actor_notes <- read.csv("data-raw/bm_actor_notes.csv", stringsAsFactors = FALSE, na.strings = "")
	bm_oa <- read.csv("data-raw/bm_obj_attributes.csv", stringsAsFactors = FALSE, na.strings = "")
	bm_thesaurus <- read.csv("data-raw/bm_thesaurus.csv", stringsAsFactors = FALSE, na.strings = "")
	bm_types <- read.csv("data-raw/bm_types.csv", stringsAsFactors = FALSE, na.strings = "")
	bm_concordance <- read.csv("data-raw/bm_concordance.csv", stringsAsFactors = FALSE, na.strings = "")
	bm_images <- read.csv("data-raw/bm_images.csv", header = FALSE, col.names = c("image", "object"), stringsAsFactors = FALSE, na.strings = "")
	bm_pids <- read.csv("data-raw/bm_pids.csv", header = FALSE, col.names = c("object", "pid"), stringsAsFactors = FALSE, na.strings = "", skip = 1)


	save(bm_concordance, file = "data/bm_concordance.RData")
	save(bm_thesaurus, file = "data/bm_thesaurus.RData")
	save(bm_types, file = "data/bm_types.RData")
	save(bm_images, file = "data/bm_images.RData")
	save(bm_pids, file = "data/bm_pids.RData")
	save(bm_actor_notes, file = "data/bm_actor_notes.RData")
	bm_oa <- bm_oa %>%
	mutate(
	datestart = ymd(datestart),
	dateend = ymd(dateend),
	year_start = year(datestart),
	year_end = year(dateend)
	) %>%
	left_join(bm_images, by = "object")
	save(bm_oa, file = "data/bm_oa.RData")


	#### SET ANALYSIS DEFAULTS ####
	daterange <- 1490:1760
	window_size <- 5

	#### Filter print_nodes and print_edges based on presets ####

	# Parse date fields into year integers (very litte information is lost, as most
	# dates are already of the format 'XXXX-01-01). This makes it much easier to
	# filter nodes and edges based on date
	bm_print_nodes <- bm_raw_print_nodes %>%
	mutate(birth = year(ymd(birth)), death = year(ymd(death)), name = id) %>%
	# Because igraph essentially erases the 'id' node attribute when constructing
	# graphs, copy those values into a column called 'name'; for the actual name,
	# uses 'realname'
	select(id, name, realname, birth, death) %>%
	# There are a few nultiple birth dates and death dates - this removes them,
	# keeping id a unique column
	distinct(id)

	bm_print_edges <- bm_raw_print_edges %>% mutate(datestart = year(ymd(datestart)), dateend = year(ymd(dateend)))

	# Set any nodes with "English" nationality to "British"
	bm_node_nationalities$nationality[bm_node_nationalities$nationality == "British"] <- "English"

	# Create a new pseudo-nationality for "Low Countries" that incorporates both
	# Flemish and Dutch actors, and add it to the nationality table
	lowcountries <- bm_node_nationalities %>%
	filter(nationality %in% c("Flemish", "Dutch")) %>%
	mutate(nationality = "Low Countries")

	bm_node_nationalities <- bm_node_nationalities %>% bind_rows(lowcountries)

	# Filter by date range, discarding any works finished before the start of
	# analysis range or completed before the end of the analysis range
	start_year <- first(daterange)
	end_year <- last(daterange)
	bm_print_edges <- bm_print_edges %>% filter(datestart <= end_year & start_year <= dateend)

	# Join the birth and death date for the source and target artists of each edge
	bm_print_edges <- bm_print_edges %>%
	inner_join(bm_print_nodes %>% select(id, source_birth = birth, source_death = death), by = c("source" = "id")) %>%
	inner_join(bm_print_nodes %>% select(id, target_birth = birth, target_death = death), by = c("target" = "id"))

	# For each edge, check to see if the artists' lifespans ever overlapped. If so,
	# artists are said to have coexisted, and thus edges between them will be set to
	# bidirectional. If not, edges will only be one-way: with the contemporary
	# artist as source and the past artist as target.
	coexist <- Vectorize(function(a1b, a1d, a2b, a2d) {
	ifelse(
	anyNA(c(a1b, a1d, a2b, a2d)),
	FALSE, # If any artist dates are NA, return FALSE
	any((a1b + 15):a1d %in% (a2b + 15):a2d) # IF not, then figure out if they overlap
	)
	})

	bm_print_edges <- bm_print_edges %>% mutate(coexist = coexist(source_birth, source_death, target_birth, target_death))


	preferred_types <- c("Printed", "After", "Published")
	bm_print_edges <- bm_print_edges %>% filter(source_role %in% preferred_types & target_role %in% preferred_types)

	# Select final variables to pass to analysis
	bm_print_edges <- bm_print_edges %>% select(target, source, datestart, dateend, coexist, object, source_role, target_role)

	save(bm_print_edges, file = "data/bm_print_edges.RData")
	save(bm_print_nodes, file = "data/bm_print_nodes.RData")
	save(bm_node_nationalities, file = "data/bm_node_nationalites.RData")

	#### Generate subgraphs for network analysis ####
	library(igraph)

	# Get the node and edge lists for this window, where all edges are within the
	# window, and all artists were born before the end of the window.
	get_subgraph <- function(edges, nodes, index, alive_only = FALSE) {

	# Calculate start and end years
	start_year <- index - window_size
	end_year <- index + window_size

	# Find nodes and edges that ought to be active w/in the specified window.
	# Nodes must be at least 15yo by the end of the window (a generous estimate
	# for the youngest contributions of a given artist), while edges must be
	# artworks whose creation timespan intersects with the window, and whose
	# participants belong to the sub_node list

	if(alive_only) {
	sub_nodes <- nodes %>% filter((birth + 15) <= end_year & death >= start_year)
	} else {
	sub_nodes <- nodes %>% filter((birth + 15) <= end_year)
	}

	sub_edges <- edges %>%
	filter(
	datestart <= end_year &
	dateend >= start_year &
	source %in% sub_nodes$id &
	target %in% sub_nodes$id)

	# Generate igraph object from the edges and nodes data frames
	sub_graph <- graph.data.frame(sub_edges, directed = FALSE, vertices = sub_nodes)
	simp_sub_graph <- simplify(sub_graph, remove.multiple = TRUE)

	# Return a list containing the igraph object as well as the edges and nodes
	# data frames
	sub_object <- list("index" = index, "graph" = simp_sub_graph, "edges" = sub_edges, "nodes" = sub_nodes)
	return(sub_object)
	}


	# Create an empty list to hold the calculated subgraphs
	all_subgraphs <- function(edges, nodes, role_types = c("After", "Printed", "Published"), alive_only = FALSE) {
	spec_edges <- edges %>% filter(source_role %in% role_types & target_role %in% role_types)
	print("Generating subgraphs...")
	lapply(daterange, function(x) get_subgraph(edges = spec_edges, nodes = nodes, index = x, alive_only = alive_only))
	}

	#### NETWORK METRICS ####

	# Statistics calculated on the entire graph at once
	global_stats <- function(graph, edges, nodes, index) {

	sub_trends <- data.frame(year = c(index))
	sub_trends$num_artists[1] <- length(V(graph))
	sub_trends$num_edges[1] <- length(E(graph))
	sub_trends$transitivity_global[1] <- transitivity(graph, type="global")
	sub_trends$diameter[1] <- diameter(graph)
	sub_trends$density[1] <- graph.density(graph, loops=TRUE)
	sub_trends$avg_path_length[1] <- average.path.length(graph)

	return(sub_trends)
	}

	# Statistics calculated per the nationality of the artist
	national_stats <- function(graph, edges, nodes, index) {

	# Count works by nationality (This can only be done from the original edge
	# list, as the igraph object is flattened, without individual object
	# attributes)
	natl_works <- edges %>%
	select(id = source, object) %>%
	inner_join(bm_node_nationalities, by = "id") %>%
	group_by(nationality, object) %>%
	count(nationality) %>%
	select(nationality, num_works = n)

	# List the nationalities present in the current graph
	present_nationalities <- (bm_node_nationalities %>% filter(id %in% nodes$id) %>% select(nationality) %>% distinct())$nationality

	##### Functions for creating different subsets of the full graph based on nationality
	# Vector of artist ids from a given nationality
	national_artists <- function(nat) {
	nat_artists <- bm_node_nationalities$id[bm_node_nationalities$nationality == nat]
	}

	national_nodes <- function(nat) {
	V(graph)[name %in% national_artists(nat)]
	}

	# Vector of artist ids NOT from a given nationality
	non_national_artists <- function(nat) {
	non_nat_artists <- setdiff(bm_node_nationalities$id, national_artists(nat))
	}

	# Calculates "national" subgraph by only including nodes of the given
	# nationality
	strict_nationality <- function(nat) {
	nat_nodes <- national_nodes(nat)
	nat_graph <- induced.subgraph(graph, nat_nodes)
	return(nat_graph)
	}

	# Calculates "national" subgraph by including any edges that have an artist
	# of the given nationality as either a source or a target
	proximal_nationality <- function(nat) {
	nat_nodes <- national_nodes(nat)
	nat_edges <- E(graph)[to(nat_nodes)]
	nat_graph <- subgraph.edges(graph, eids = nat_edges)
	return(nat_graph)
	}

	# Create lists of strict and proximal subgraphs for the present nationalities
	strict_groups <- lapply(present_nationalities, function(x) strict_nationality(x))
	proximal_groups <- lapply(present_nationalities, function(x) proximal_nationality(x))

	### Calculate strict and proximal national statistics
	natl_stats <- data_frame(nationality = present_nationalities)

	# Number of artists (nodes)
	natl_stats$artists_strict <- vapply(strict_groups, function(x) vcount(x), FUN.VALUE = numeric(1))
	natl_stats$artists_proximal <- vapply(proximal_groups, function(x) vcount(x), FUN.VALUE = numeric(1))

	# Number of edges
	natl_stats$edges_strict <- vapply(strict_groups, function(x) ecount(x), FUN.VALUE = numeric(1))
	natl_stats$edges_proximal <- vapply(proximal_groups, function(x) ecount(x), FUN.VALUE = numeric(1))

	# Graph degree centrality
	natl_stats$nat_deg_cent_strict <- vapply(strict_groups, function(x) centr_degree(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))
	natl_stats$nat_deg_cent_proximal <- vapply(proximal_groups, function(x) centr_degree(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))

	# Graph betweenness centrality
	natl_stats$nat_deg_bet_strict <- vapply(strict_groups, function(x) centr_betw(x, directed = FALSE, normalized=TRUE)$centralization, FUN.VALUE = numeric(1))
	natl_stats$nat_deg_bet_proximal <- vapply(proximal_groups, function(x) centr_betw(x, directed = FALSE, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))

	# Graph closeness centrality
	natl_stats$nat_deg_clos_strict <- vapply(strict_groups, function(x) centr_clo(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))
	natl_stats$nat_deg_clos_proximal <- vapply(proximal_groups, function(x) centr_clo(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))

	# Graph transitivity
	natl_stats$transitivity_strict <- vapply(strict_groups, function(x) transitivity(x, type = "global"), FUN.VALUE = numeric(1))
	natl_stats$transitivity_proximal <- vapply(proximal_groups, function(x) transitivity(x, type = "global"), FUN.VALUE = numeric(1))

	# Graph density
	natl_stats$density_strict <- vapply(strict_groups, function(x) graph.density(x, loops = FALSE), FUN.VALUE = numeric(1))
	natl_stats$density_proximal <- vapply(proximal_groups, function(x) graph.density(x, loops = FALSE), FUN.VALUE = numeric(1))

	# Graph diameter
	natl_stats$diameter_strict <- vapply(strict_groups, function(x) diameter(x), FUN.VALUE = numeric(1))
	natl_stats$diameter_proximal <- vapply(proximal_groups, function(x) diameter(x), FUN.VALUE = numeric(1))

	# Graph avg path length
	natl_stats$avg_path_strict <- vapply(strict_groups, function(x) average.path.length(x), FUN.VALUE = numeric(1))
	natl_stats$avg_path_proximal <- vapply(proximal_groups, function(x) average.path.length(x), FUN.VALUE = numeric(1))


	# External-Internal Index
	# Function for calculating national-level EII for each nationality present
	calc_nat_ei <- function(nat) {

	nat_artists <- national_artists(nat)
	non_nat_artists <- non_national_artists(nat)

	# Get all vertices which have the same attribute values as val
	internal <- edges %>% filter(source %in% nat_artists & target %in% nat_artists) %>% select(source, target) %>% distinct()
	# Get all vertices which have different attribute values than val
	external <- edges %>% filter(source %in% nat_artists & target %in% non_nat_artists) %>% select(source, target) %>% distinct()

	# Count internal vs. external connecitons
	i <- nrow(internal)
	e <- nrow(external)
	return((e - i)/(e + i))
	}

	natl_stats$eii <- vapply(present_nationalities, function(x) calc_nat_ei(x), FUN.VALUE = numeric(1))

	# Join all statistics by nationality and add the year
	natl_stats %>% inner_join(natl_works, by = "nationality") %>% mutate(year = index)
	}

	comb_ei <- function(graph, edges, nodes, index) {

	# Find all nationalities present within the graph, and create a data frame with all their combinations
	present_nationalities <- (bm_node_nationalities %>% filter(id %in% nodes$id) %>% select(nationality) %>% distinct())$nationality
	nat_combos <- expand.grid(present_nationalities, present_nationalities) %>% rename(nat1 = Var1, nat2 = Var2)

	# Subset the nationality list for faster searching
	sub_nationalities <- bm_node_nationalities %>% filter(id %in% nodes$id)

	calc_combo_ei <- function(nat1, nat2) {

	# Find all nodes belonging to nat1
	nat1_nodes <- sub_nationalities %>% filter(nationality == nat1)
	nat2_nodes <- sub_nationalities %>% filter(nationality == nat2)

	# Find all edges originating in nodes belonging to nat1
	sub_edges <- edges %>% filter(source %in% nat1_nodes$id)

	# Find ties from nat1 to nat1
	internal <- sub_edges %>% filter(target %in% nat1_nodes$id)

	# Find ties from nat1 to ALL external nodes
	external <- sub_edges %>% filter(!(target %in% nat1_nodes$id))

	# Find ties from nat1 to nat2
	spec_external <- sub_edges %>% filter(target %in% nat2_nodes$id)

	# Count internal vs. external connecitons
	return(data_frame(internal = nrow(internal), external = nrow(external), spec_external = nrow(spec_external)))
	}

	# Merge the dataframes of every nationality combo into one dataframe, and
	# attach it to the nat_combos frame, adding a column for the year
	combo_frame <- mapply(function(x, y) calc_combo_ei(x, y), nat_combos$nat1, nat_combos$nat2, SIMPLIFY = FALSE) %>% bind_rows()
	nat_combos %>% bind_cols(combo_frame) %>% mutate(year = index)
	}

	individual_stats <- function(graph, edges, nodes, index) {

	names = V(graph)$name
	realnames = V(graph)$realname

	sub_ind_trends <- data_frame(id = names, realname = realnames, year = index)

	# Calculate vectorized statistics
	sub_ind_trends$total_degree <- degree(graph, mode = "total", normalized = FALSE)
	sub_ind_trends$total_degree_norm <- degree(graph, mode = "total", normalized = TRUE)
	sub_ind_trends$cent_betweenness <- betweenness(graph, directed = FALSE, normalized = TRUE)
	sub_ind_trends$cent_closeness <- closeness(graph, mode = "total", normalized = TRUE)
	sub_ind_trends$coreness <- coreness(graph, mode = "in")
	sub_ind_trends$local_cc <- transitivity(graph, type = "local")
	#sub_ind_trends$eigen <- centralization.evcent(graph, normalized = TRUE, directed = TRUE)$vector

	# Create a smaller reference list so individual node lookup is faster
	sub_nationalities <- nodes %>% inner_join(bm_node_nationalities, by = "id")

	# Get the individial group-external/group-internal index for all nodes
	calc_ind_ei <- function(node_id) {

	# Determine the nationalities of the given artist
	individual_nat <- sub_nationalities$nationality[sub_nationalities$id == node_id]

	# Find all edges for which the artist is a source
	ego_edges <- edges %>% filter(source == node_id)

	# Find artists who share that nationality
	shared_nat <- sub_nationalities %>% filter(nationality %in% individual_nat)

	if(nrow(ego_edges) == 0)
	return(NA)

	# Find all edges internal to given artist's nationalities
	internal_edges <- ego_edges %>% filter(target %in% shared_nat$id) %>% select(source, target) %>% distinct()

	# Find all edges external to given artist's nationalities
	external_edges <- ego_edges %>% filter(!(target %in% shared_nat$id)) %>% select(source, target) %>% distinct()

	# Calculate the group-external/group-internal index for this individual node
	i <- nrow(internal_edges)
	e <- nrow(external_edges)
	return((e - i)/(e + i))
	}

	sub_ind_trends$eii <- vapply(names, FUN.VALUE = numeric(1), function(x) calc_ind_ei(x))

	return(sub_ind_trends)
	}

	## Edge Centralities ----

	edge_attributes <- function(graph, edges, nodes, index) {

	E(graph)$bet_cent <- edge_betweenness(graph, directed = FALSE)
	edf <- igraph::as_data_frame(graph, what = "edges")
	edf <- edf %>%
	rename(source = from, target = to) %>%
	mutate(year = index)
	return(edf)
	}


	library(foreach)
	library(doParallel)

	# Adjust based on number of cores
	registerDoParallel(4)

	all_analyses <- function(subgraphs) {
	print("Global trends...")
	global_trends <- (foreach(i = subgraphs) %dopar% global_stats(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows()

	print("National trends...")
	nat_trends <- (foreach(i = subgraphs) %dopar% national_stats(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows()

	#print("Binational EI Index...")
	#nat_combos <- (foreach(i = subgraphs) %dopar% comb_ei(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows()

	print("Individual Trends...")
	ind_trends <- (foreach(i = subgraphs) %dopar% individual_stats(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows() %>%
	# Allow more easily searchable graphs by adding nationality information to the
	# individual trends (this is important for finding the top most central
	# artists of a given nationality, for example)
	left_join(bm_print_nodes %>% select(id, birth, death), by = "id") %>%
	left_join(bm_node_nationalities, by = "id")

	print("Edge trends...")
	edge_trends <- (foreach(i = subgraphs) %dopar% edge_attributes(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows() %>%
	left_join(bm_print_edges, by = c("source", "target")) %>%
	inner_join(bm_print_nodes %>% select(id, source_name = realname, source_birth = birth, source_death = death), by = c("source" = "id")) %>%
	inner_join(bm_print_nodes %>% select(id, target_name = realname, target_birth = birth, target_death = death), by = c("target" = "id")) %>%
	left_join(bm_node_nationalities %>% rename(source_nat = nationality), by = c("source" = "id")) %>%
	left_join(bm_node_nationalities %>% rename(target_nat = nationality), by = c("target" = "id"))

	return(list(
	global_trends = global_trends,
	nat_trends = nat_trends,
	#nat_combos = nat_combos,
	ind_trends = ind_trends,
	edge_trends = edge_trends
	))
	}

	#### Run all types of network analyses ####
	## VERY COMPUTATIONALLY INTENSIVE - CAN TAKE SEVERAL HOURS TO RUN EVERY TYPE ##
	# bm_APrPu_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed", "Published"), alive_only = FALSE) %>% all_analyses()
	# save(bm_APrPu_all, file = "data/bm_APrPu_all.RData")

	bm_APrPu_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed", "Published"), alive_only = TRUE) %>% all_analyses()
	save(bm_APrPu_alive, file = "data/bm_APrPu_alive.RData")

	# bm_PrPu_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("Printed", "Published"), alive_only = FALSE) %>% all_analyses()
	# save(bm_PrPu_all, file = "data/bm_PrPu_all.RData")

	bm_PrPu_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("Printed", "Published"), alive_only = TRUE) %>% all_analyses()
	save(bm_PrPu_alive, file = "data/bm_PrPu_alive.RData")

	# bm_APr_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed"), alive_only = FALSE) %>% all_analyses()
	# save(bm_APr_all, file = "data/bm_APr_all.RData")

	bm_APr_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed"), alive_only = TRUE) %>% all_analyses()
	save(bm_APr_alive, file = "data/bm_APr_alive.RData")

	# bm_APu_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Published"), alive_only = FALSE) %>% all_analyses()
	# save(bm_APu_all, file = "data/bm_APu_all.RData")

	bm_APu_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Published"), alive_only = TRUE) %>% all_analyses()
	save(bm_APu_alive, file = "data/bm_APu_alive.RData")
	library(lincolnplots)
	library(lincolndissdata)
	library(dplyr)
	library(tidyr)
	library(ggplot2)

	rkm_flem_stats <- rkm_APrPu_alive$nat_trends %>% filter(nationality == "Zuid-Nederlands") %>% mutate(nationality = "Flemish")
	bm_flem_stats <- bm_APrPu_alive$nat_trends %>% filter(nationality == "Flemish")

	all_stats <- bind_rows(
	"BM" = bm_flem_stats,
	"RKM" = rkm_flem_stats,
	.id = "dataset"
	)

	flem_cent_counts <- all_stats %>%
	select(year, dataset, centrality = nat_deg_cent_strict, nodes = artists_strict, edges = edges_strict) %>%
	gather(metric, value, centrality, nodes, edges) %>%
	ggplot(aes(x = year, y = value, color = dataset)) +
	facet_wrap(~ metric, ncol = 1, scales = "free_y") +
	geom_line(size = 1) +
	theme_bw() +
	scale_color_grey() +
	theme(legend.position = "bottom") +
	labs(x = NULL, y = NULL) +
	xlim(1500, 1750)

	pdfplot(flem_cent_counts, "flem_cent_counts", width = 6, height = 7)