Skip to content

Instantly share code, notes, and snippets.

@mdlincoln

mdlincoln/bm.R Secret

Last active February 14, 2016 23:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mdlincoln/9ba3b6c580f110abadd8 to your computer and use it in GitHub Desktop.
Save mdlincoln/9ba3b6c580f110abadd8 to your computer and use it in GitHub Desktop.
R code from Matthew Lincoln's dissertation
library(dplyr)
library(lubridate)
#### Load raw output from database queries ####
bm_raw_print_nodes <- read.csv("data-raw/bm_print_nodes.csv", stringsAsFactors = FALSE, na.strings = "")
bm_raw_print_edges <- read.csv("data-raw/bm_print_edges.csv", stringsAsFactors = FALSE, na.strings = "")
bm_node_nationalities <- read.csv("data-raw/bm_node_nationalities.csv", stringsAsFactors = FALSE, na.strings = "")
bm_actor_notes <- read.csv("data-raw/bm_actor_notes.csv", stringsAsFactors = FALSE, na.strings = "")
bm_oa <- read.csv("data-raw/bm_obj_attributes.csv", stringsAsFactors = FALSE, na.strings = "")
bm_thesaurus <- read.csv("data-raw/bm_thesaurus.csv", stringsAsFactors = FALSE, na.strings = "")
bm_types <- read.csv("data-raw/bm_types.csv", stringsAsFactors = FALSE, na.strings = "")
bm_concordance <- read.csv("data-raw/bm_concordance.csv", stringsAsFactors = FALSE, na.strings = "")
bm_images <- read.csv("data-raw/bm_images.csv", header = FALSE, col.names = c("image", "object"), stringsAsFactors = FALSE, na.strings = "")
bm_pids <- read.csv("data-raw/bm_pids.csv", header = FALSE, col.names = c("object", "pid"), stringsAsFactors = FALSE, na.strings = "", skip = 1)
save(bm_concordance, file = "data/bm_concordance.RData")
save(bm_thesaurus, file = "data/bm_thesaurus.RData")
save(bm_types, file = "data/bm_types.RData")
save(bm_images, file = "data/bm_images.RData")
save(bm_pids, file = "data/bm_pids.RData")
save(bm_actor_notes, file = "data/bm_actor_notes.RData")
bm_oa <- bm_oa %>%
mutate(
datestart = ymd(datestart),
dateend = ymd(dateend),
year_start = year(datestart),
year_end = year(dateend)
) %>%
left_join(bm_images, by = "object")
save(bm_oa, file = "data/bm_oa.RData")
#### SET ANALYSIS DEFAULTS ####
daterange <- 1490:1760
window_size <- 5
#### Filter print_nodes and print_edges based on presets ####
# Parse date fields into year integers (very litte information is lost, as most
# dates are already of the format 'XXXX-01-01). This makes it much easier to
# filter nodes and edges based on date
bm_print_nodes <- bm_raw_print_nodes %>%
mutate(birth = year(ymd(birth)), death = year(ymd(death)), name = id) %>%
# Because igraph essentially erases the 'id' node attribute when constructing
# graphs, copy those values into a column called 'name'; for the actual name,
# uses 'realname'
select(id, name, realname, birth, death) %>%
# There are a few nultiple birth dates and death dates - this removes them,
# keeping id a unique column
distinct(id)
bm_print_edges <- bm_raw_print_edges %>% mutate(datestart = year(ymd(datestart)), dateend = year(ymd(dateend)))
# Set any nodes with "English" nationality to "British"
bm_node_nationalities$nationality[bm_node_nationalities$nationality == "British"] <- "English"
# Create a new pseudo-nationality for "Low Countries" that incorporates both
# Flemish and Dutch actors, and add it to the nationality table
lowcountries <- bm_node_nationalities %>%
filter(nationality %in% c("Flemish", "Dutch")) %>%
mutate(nationality = "Low Countries")
bm_node_nationalities <- bm_node_nationalities %>% bind_rows(lowcountries)
# Filter by date range, discarding any works finished before the start of
# analysis range or completed before the end of the analysis range
start_year <- first(daterange)
end_year <- last(daterange)
bm_print_edges <- bm_print_edges %>% filter(datestart <= end_year & start_year <= dateend)
# Join the birth and death date for the source and target artists of each edge
bm_print_edges <- bm_print_edges %>%
inner_join(bm_print_nodes %>% select(id, source_birth = birth, source_death = death), by = c("source" = "id")) %>%
inner_join(bm_print_nodes %>% select(id, target_birth = birth, target_death = death), by = c("target" = "id"))
# For each edge, check to see if the artists' lifespans ever overlapped. If so,
# artists are said to have coexisted, and thus edges between them will be set to
# bidirectional. If not, edges will only be one-way: with the contemporary
# artist as source and the past artist as target.
coexist <- Vectorize(function(a1b, a1d, a2b, a2d) {
ifelse(
anyNA(c(a1b, a1d, a2b, a2d)),
FALSE, # If any artist dates are NA, return FALSE
any((a1b + 15):a1d %in% (a2b + 15):a2d) # IF not, then figure out if they overlap
)
})
bm_print_edges <- bm_print_edges %>% mutate(coexist = coexist(source_birth, source_death, target_birth, target_death))
preferred_types <- c("Printed", "After", "Published")
bm_print_edges <- bm_print_edges %>% filter(source_role %in% preferred_types & target_role %in% preferred_types)
# Select final variables to pass to analysis
bm_print_edges <- bm_print_edges %>% select(target, source, datestart, dateend, coexist, object, source_role, target_role)
save(bm_print_edges, file = "data/bm_print_edges.RData")
save(bm_print_nodes, file = "data/bm_print_nodes.RData")
save(bm_node_nationalities, file = "data/bm_node_nationalites.RData")
#### Generate subgraphs for network analysis ####
library(igraph)
# Get the node and edge lists for this window, where all edges are within the
# window, and all artists were born before the end of the window.
get_subgraph <- function(edges, nodes, index, alive_only = FALSE) {
# Calculate start and end years
start_year <- index - window_size
end_year <- index + window_size
# Find nodes and edges that ought to be active w/in the specified window.
# Nodes must be at least 15yo by the end of the window (a generous estimate
# for the youngest contributions of a given artist), while edges must be
# artworks whose creation timespan intersects with the window, and whose
# participants belong to the sub_node list
if(alive_only) {
sub_nodes <- nodes %>% filter((birth + 15) <= end_year & death >= start_year)
} else {
sub_nodes <- nodes %>% filter((birth + 15) <= end_year)
}
sub_edges <- edges %>%
filter(
datestart <= end_year &
dateend >= start_year &
source %in% sub_nodes$id &
target %in% sub_nodes$id)
# Generate igraph object from the edges and nodes data frames
sub_graph <- graph.data.frame(sub_edges, directed = FALSE, vertices = sub_nodes)
simp_sub_graph <- simplify(sub_graph, remove.multiple = TRUE)
# Return a list containing the igraph object as well as the edges and nodes
# data frames
sub_object <- list("index" = index, "graph" = simp_sub_graph, "edges" = sub_edges, "nodes" = sub_nodes)
return(sub_object)
}
# Create an empty list to hold the calculated subgraphs
all_subgraphs <- function(edges, nodes, role_types = c("After", "Printed", "Published"), alive_only = FALSE) {
spec_edges <- edges %>% filter(source_role %in% role_types & target_role %in% role_types)
print("Generating subgraphs...")
lapply(daterange, function(x) get_subgraph(edges = spec_edges, nodes = nodes, index = x, alive_only = alive_only))
}
#### NETWORK METRICS ####
# Statistics calculated on the entire graph at once
global_stats <- function(graph, edges, nodes, index) {
sub_trends <- data.frame(year = c(index))
sub_trends$num_artists[1] <- length(V(graph))
sub_trends$num_edges[1] <- length(E(graph))
sub_trends$transitivity_global[1] <- transitivity(graph, type="global")
sub_trends$diameter[1] <- diameter(graph)
sub_trends$density[1] <- graph.density(graph, loops=TRUE)
sub_trends$avg_path_length[1] <- average.path.length(graph)
return(sub_trends)
}
# Statistics calculated per the nationality of the artist
national_stats <- function(graph, edges, nodes, index) {
# Count works by nationality (This can only be done from the original edge
# list, as the igraph object is flattened, without individual object
# attributes)
natl_works <- edges %>%
select(id = source, object) %>%
inner_join(bm_node_nationalities, by = "id") %>%
group_by(nationality, object) %>%
count(nationality) %>%
select(nationality, num_works = n)
# List the nationalities present in the current graph
present_nationalities <- (bm_node_nationalities %>% filter(id %in% nodes$id) %>% select(nationality) %>% distinct())$nationality
##### Functions for creating different subsets of the full graph based on nationality
# Vector of artist ids from a given nationality
national_artists <- function(nat) {
nat_artists <- bm_node_nationalities$id[bm_node_nationalities$nationality == nat]
}
national_nodes <- function(nat) {
V(graph)[name %in% national_artists(nat)]
}
# Vector of artist ids NOT from a given nationality
non_national_artists <- function(nat) {
non_nat_artists <- setdiff(bm_node_nationalities$id, national_artists(nat))
}
# Calculates "national" subgraph by only including nodes of the given
# nationality
strict_nationality <- function(nat) {
nat_nodes <- national_nodes(nat)
nat_graph <- induced.subgraph(graph, nat_nodes)
return(nat_graph)
}
# Calculates "national" subgraph by including any edges that have an artist
# of the given nationality as either a source or a target
proximal_nationality <- function(nat) {
nat_nodes <- national_nodes(nat)
nat_edges <- E(graph)[to(nat_nodes)]
nat_graph <- subgraph.edges(graph, eids = nat_edges)
return(nat_graph)
}
# Create lists of strict and proximal subgraphs for the present nationalities
strict_groups <- lapply(present_nationalities, function(x) strict_nationality(x))
proximal_groups <- lapply(present_nationalities, function(x) proximal_nationality(x))
### Calculate strict and proximal national statistics
natl_stats <- data_frame(nationality = present_nationalities)
# Number of artists (nodes)
natl_stats$artists_strict <- vapply(strict_groups, function(x) vcount(x), FUN.VALUE = numeric(1))
natl_stats$artists_proximal <- vapply(proximal_groups, function(x) vcount(x), FUN.VALUE = numeric(1))
# Number of edges
natl_stats$edges_strict <- vapply(strict_groups, function(x) ecount(x), FUN.VALUE = numeric(1))
natl_stats$edges_proximal <- vapply(proximal_groups, function(x) ecount(x), FUN.VALUE = numeric(1))
# Graph degree centrality
natl_stats$nat_deg_cent_strict <- vapply(strict_groups, function(x) centr_degree(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))
natl_stats$nat_deg_cent_proximal <- vapply(proximal_groups, function(x) centr_degree(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))
# Graph betweenness centrality
natl_stats$nat_deg_bet_strict <- vapply(strict_groups, function(x) centr_betw(x, directed = FALSE, normalized=TRUE)$centralization, FUN.VALUE = numeric(1))
natl_stats$nat_deg_bet_proximal <- vapply(proximal_groups, function(x) centr_betw(x, directed = FALSE, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))
# Graph closeness centrality
natl_stats$nat_deg_clos_strict <- vapply(strict_groups, function(x) centr_clo(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))
natl_stats$nat_deg_clos_proximal <- vapply(proximal_groups, function(x) centr_clo(x, normalized = TRUE)$centralization, FUN.VALUE = numeric(1))
# Graph transitivity
natl_stats$transitivity_strict <- vapply(strict_groups, function(x) transitivity(x, type = "global"), FUN.VALUE = numeric(1))
natl_stats$transitivity_proximal <- vapply(proximal_groups, function(x) transitivity(x, type = "global"), FUN.VALUE = numeric(1))
# Graph density
natl_stats$density_strict <- vapply(strict_groups, function(x) graph.density(x, loops = FALSE), FUN.VALUE = numeric(1))
natl_stats$density_proximal <- vapply(proximal_groups, function(x) graph.density(x, loops = FALSE), FUN.VALUE = numeric(1))
# Graph diameter
natl_stats$diameter_strict <- vapply(strict_groups, function(x) diameter(x), FUN.VALUE = numeric(1))
natl_stats$diameter_proximal <- vapply(proximal_groups, function(x) diameter(x), FUN.VALUE = numeric(1))
# Graph avg path length
natl_stats$avg_path_strict <- vapply(strict_groups, function(x) average.path.length(x), FUN.VALUE = numeric(1))
natl_stats$avg_path_proximal <- vapply(proximal_groups, function(x) average.path.length(x), FUN.VALUE = numeric(1))
# External-Internal Index
# Function for calculating national-level EII for each nationality present
calc_nat_ei <- function(nat) {
nat_artists <- national_artists(nat)
non_nat_artists <- non_national_artists(nat)
# Get all vertices which have the same attribute values as val
internal <- edges %>% filter(source %in% nat_artists & target %in% nat_artists) %>% select(source, target) %>% distinct()
# Get all vertices which have different attribute values than val
external <- edges %>% filter(source %in% nat_artists & target %in% non_nat_artists) %>% select(source, target) %>% distinct()
# Count internal vs. external connecitons
i <- nrow(internal)
e <- nrow(external)
return((e - i)/(e + i))
}
natl_stats$eii <- vapply(present_nationalities, function(x) calc_nat_ei(x), FUN.VALUE = numeric(1))
# Join all statistics by nationality and add the year
natl_stats %>% inner_join(natl_works, by = "nationality") %>% mutate(year = index)
}
comb_ei <- function(graph, edges, nodes, index) {
# Find all nationalities present within the graph, and create a data frame with all their combinations
present_nationalities <- (bm_node_nationalities %>% filter(id %in% nodes$id) %>% select(nationality) %>% distinct())$nationality
nat_combos <- expand.grid(present_nationalities, present_nationalities) %>% rename(nat1 = Var1, nat2 = Var2)
# Subset the nationality list for faster searching
sub_nationalities <- bm_node_nationalities %>% filter(id %in% nodes$id)
calc_combo_ei <- function(nat1, nat2) {
# Find all nodes belonging to nat1
nat1_nodes <- sub_nationalities %>% filter(nationality == nat1)
nat2_nodes <- sub_nationalities %>% filter(nationality == nat2)
# Find all edges originating in nodes belonging to nat1
sub_edges <- edges %>% filter(source %in% nat1_nodes$id)
# Find ties from nat1 to nat1
internal <- sub_edges %>% filter(target %in% nat1_nodes$id)
# Find ties from nat1 to ALL external nodes
external <- sub_edges %>% filter(!(target %in% nat1_nodes$id))
# Find ties from nat1 to nat2
spec_external <- sub_edges %>% filter(target %in% nat2_nodes$id)
# Count internal vs. external connecitons
return(data_frame(internal = nrow(internal), external = nrow(external), spec_external = nrow(spec_external)))
}
# Merge the dataframes of every nationality combo into one dataframe, and
# attach it to the nat_combos frame, adding a column for the year
combo_frame <- mapply(function(x, y) calc_combo_ei(x, y), nat_combos$nat1, nat_combos$nat2, SIMPLIFY = FALSE) %>% bind_rows()
nat_combos %>% bind_cols(combo_frame) %>% mutate(year = index)
}
individual_stats <- function(graph, edges, nodes, index) {
names = V(graph)$name
realnames = V(graph)$realname
sub_ind_trends <- data_frame(id = names, realname = realnames, year = index)
# Calculate vectorized statistics
sub_ind_trends$total_degree <- degree(graph, mode = "total", normalized = FALSE)
sub_ind_trends$total_degree_norm <- degree(graph, mode = "total", normalized = TRUE)
sub_ind_trends$cent_betweenness <- betweenness(graph, directed = FALSE, normalized = TRUE)
sub_ind_trends$cent_closeness <- closeness(graph, mode = "total", normalized = TRUE)
sub_ind_trends$coreness <- coreness(graph, mode = "in")
sub_ind_trends$local_cc <- transitivity(graph, type = "local")
#sub_ind_trends$eigen <- centralization.evcent(graph, normalized = TRUE, directed = TRUE)$vector
# Create a smaller reference list so individual node lookup is faster
sub_nationalities <- nodes %>% inner_join(bm_node_nationalities, by = "id")
# Get the individial group-external/group-internal index for all nodes
calc_ind_ei <- function(node_id) {
# Determine the nationalities of the given artist
individual_nat <- sub_nationalities$nationality[sub_nationalities$id == node_id]
# Find all edges for which the artist is a source
ego_edges <- edges %>% filter(source == node_id)
# Find artists who share that nationality
shared_nat <- sub_nationalities %>% filter(nationality %in% individual_nat)
if(nrow(ego_edges) == 0)
return(NA)
# Find all edges internal to given artist's nationalities
internal_edges <- ego_edges %>% filter(target %in% shared_nat$id) %>% select(source, target) %>% distinct()
# Find all edges external to given artist's nationalities
external_edges <- ego_edges %>% filter(!(target %in% shared_nat$id)) %>% select(source, target) %>% distinct()
# Calculate the group-external/group-internal index for this individual node
i <- nrow(internal_edges)
e <- nrow(external_edges)
return((e - i)/(e + i))
}
sub_ind_trends$eii <- vapply(names, FUN.VALUE = numeric(1), function(x) calc_ind_ei(x))
return(sub_ind_trends)
}
## Edge Centralities ----
edge_attributes <- function(graph, edges, nodes, index) {
E(graph)$bet_cent <- edge_betweenness(graph, directed = FALSE)
edf <- igraph::as_data_frame(graph, what = "edges")
edf <- edf %>%
rename(source = from, target = to) %>%
mutate(year = index)
return(edf)
}
library(foreach)
library(doParallel)
# Adjust based on number of cores
registerDoParallel(4)
all_analyses <- function(subgraphs) {
print("Global trends...")
global_trends <- (foreach(i = subgraphs) %dopar% global_stats(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows()
print("National trends...")
nat_trends <- (foreach(i = subgraphs) %dopar% national_stats(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows()
#print("Binational EI Index...")
#nat_combos <- (foreach(i = subgraphs) %dopar% comb_ei(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows()
print("Individual Trends...")
ind_trends <- (foreach(i = subgraphs) %dopar% individual_stats(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows() %>%
# Allow more easily searchable graphs by adding nationality information to the
# individual trends (this is important for finding the top most central
# artists of a given nationality, for example)
left_join(bm_print_nodes %>% select(id, birth, death), by = "id") %>%
left_join(bm_node_nationalities, by = "id")
print("Edge trends...")
edge_trends <- (foreach(i = subgraphs) %dopar% edge_attributes(i$graph, i$edges, i$nodes, i$index)) %>% bind_rows() %>%
left_join(bm_print_edges, by = c("source", "target")) %>%
inner_join(bm_print_nodes %>% select(id, source_name = realname, source_birth = birth, source_death = death), by = c("source" = "id")) %>%
inner_join(bm_print_nodes %>% select(id, target_name = realname, target_birth = birth, target_death = death), by = c("target" = "id")) %>%
left_join(bm_node_nationalities %>% rename(source_nat = nationality), by = c("source" = "id")) %>%
left_join(bm_node_nationalities %>% rename(target_nat = nationality), by = c("target" = "id"))
return(list(
global_trends = global_trends,
nat_trends = nat_trends,
#nat_combos = nat_combos,
ind_trends = ind_trends,
edge_trends = edge_trends
))
}
#### Run all types of network analyses ####
## VERY COMPUTATIONALLY INTENSIVE - CAN TAKE SEVERAL HOURS TO RUN EVERY TYPE ##
# bm_APrPu_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed", "Published"), alive_only = FALSE) %>% all_analyses()
# save(bm_APrPu_all, file = "data/bm_APrPu_all.RData")
bm_APrPu_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed", "Published"), alive_only = TRUE) %>% all_analyses()
save(bm_APrPu_alive, file = "data/bm_APrPu_alive.RData")
# bm_PrPu_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("Printed", "Published"), alive_only = FALSE) %>% all_analyses()
# save(bm_PrPu_all, file = "data/bm_PrPu_all.RData")
bm_PrPu_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("Printed", "Published"), alive_only = TRUE) %>% all_analyses()
save(bm_PrPu_alive, file = "data/bm_PrPu_alive.RData")
# bm_APr_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed"), alive_only = FALSE) %>% all_analyses()
# save(bm_APr_all, file = "data/bm_APr_all.RData")
bm_APr_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Printed"), alive_only = TRUE) %>% all_analyses()
save(bm_APr_alive, file = "data/bm_APr_alive.RData")
# bm_APu_all <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Published"), alive_only = FALSE) %>% all_analyses()
# save(bm_APu_all, file = "data/bm_APu_all.RData")
bm_APu_alive <- all_subgraphs(edges = bm_print_edges, nodes = bm_print_nodes, role_types = c("After", "Published"), alive_only = TRUE) %>% all_analyses()
save(bm_APu_alive, file = "data/bm_APu_alive.RData")
library(lincolnplots)
library(lincolndissdata)
library(dplyr)
library(tidyr)
library(ggplot2)
rkm_flem_stats <- rkm_APrPu_alive$nat_trends %>% filter(nationality == "Zuid-Nederlands") %>% mutate(nationality = "Flemish")
bm_flem_stats <- bm_APrPu_alive$nat_trends %>% filter(nationality == "Flemish")
all_stats <- bind_rows(
"BM" = bm_flem_stats,
"RKM" = rkm_flem_stats,
.id = "dataset"
)
flem_cent_counts <- all_stats %>%
select(year, dataset, centrality = nat_deg_cent_strict, nodes = artists_strict, edges = edges_strict) %>%
gather(metric, value, centrality, nodes, edges) %>%
ggplot(aes(x = year, y = value, color = dataset)) +
facet_wrap(~ metric, ncol = 1, scales = "free_y") +
geom_line(size = 1) +
theme_bw() +
scale_color_grey() +
theme(legend.position = "bottom") +
labs(x = NULL, y = NULL) +
xlim(1500, 1750)
pdfplot(flem_cent_counts, "flem_cent_counts", width = 6, height = 7)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment