Skip to content

Instantly share code, notes, and snippets.

@ofchurches
Last active April 30, 2020 00:41
Show Gist options
  • Save ofchurches/ad7c9309863ffed7b830b8d6f22c1a71 to your computer and use it in GitHub Desktop.
Save ofchurches/ad7c9309863ffed7b830b8d6f22c1a71 to your computer and use it in GitHub Desktop.
tidy_tuesday_20200428
library(tidyverse)
library(tidygraph)
library(ggraph)
grosses <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-28/grosses.csv', guess_max = 40000)
# links between theatres that have put on the same show
# create the edge list
edge_list <- grosses %>%
select(show, theatre) %>%
distinct() %>%
# here the steps to getting an edge list are from https://stackoverflow.com/questions/34670145/generating-an-edge-list-from-id-and-grouping-vectors
group_by(show) %>%
filter(n() >= 2) %>%
do(data.frame(t(combn(.$theatre, 2)), stringsAsFactors = FALSE)) %>%
ungroup() %>%
select(- show) %>%
rename(from = X1, to = X2) %>%
# here the steps to getting the edge weight are from: https://www.jessesadler.com/post/network-analysis-with-r/
group_by(from, to) %>%
summarise(weight = n()) %>%
ungroup() %>%
filter(weight > 1)
# Create graph using tidygraph
graph <- as_tbl_graph(edge_list) %>%
to_undirected() %>%
activate(nodes) %>%
mutate(centrality = centrality_authority()) %>%
mutate(group = as.factor(group_edge_betweenness())) %>%
group_by(group) %>%
mutate(name_first = last(name, order_by = centrality)) %>%
ungroup()
# plot network using ggraph
graph %>%
ggraph(layout = 'linear') +
geom_edge_arc(aes(width = weight),
show.legend = FALSE,
colour = "grey") +
geom_node_text(aes(label = str_wrap(name, width = 30),
colour = group),
angle = 90,
position = position_nudge(y = -.3),
hjust = 1) +
geom_node_point(aes(colour = group),
size = 6) +
ylim(-4, 5) +
theme(plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "white"),
title = element_text(colour = "black",
size = 16)) +
guides(colour = FALSE, size = FALSE) +
labs(title = str_wrap("Network of Broadway theaters that have put on the same shows",
width = 60))
ggsave("broadway_network.png",
scale = 2,
width = 90,
height = 90,
units = "mm",
dpi = 300)
#look at the synopses
library(tidytext)
library(ggrepel)
synopses <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-28/synopses.csv')
run_length <- grosses %>%
count(show) %>%
rename(run_weeks = n)
opening_week <- grosses %>%
group_by(show) %>%
mutate(opening = first(week_ending)) %>%
ungroup() %>%
select(show, opening) %>%
distinct()
synopses %>%
unnest_tokens(word, synopsis) %>%
inner_join(get_sentiments("afinn")) %>%
group_by(show) %>%
filter(n() > 5) %>%
summarise(average_sentiment = mean(value)) %>%
ungroup() %>%
filter(average_sentiment > 2.5 | average_sentiment < -.8) %>%
ggplot(aes(x = reorder(show, average_sentiment), y = average_sentiment, fill = average_sentiment > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(title = "The most positive and negative play synopses on Braodway",
y = "Average sentiment of the synopsis",
x = "") +
theme_minimal()
synopses %>%
unnest_tokens(word, synopsis) %>%
inner_join(get_sentiments("afinn")) %>%
group_by(show) %>%
filter(n() > 2) %>%
summarise(average_sentiment = mean(value)) %>%
ungroup() %>%
inner_join(run_length) %>%
inner_join(opening_week) %>%
ggplot(aes(y = run_weeks, x = average_sentiment, label = show)) +
geom_jitter() +
geom_label_repel(aes(label = ifelse(run_weeks > 250, show, NA)))+
labs(title = "Average synopsis sentiment by run length of musicals on Broadway",
x = "Average sentiment",
y = "Run length (weeks)") +
theme_minimal()
ggsave("broadway_sentiment_runs.png",
scale = 2,
width = 90,
height = 90,
units = "mm",
dpi = 300)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment