Skip to content

Instantly share code, notes, and snippets.

@juliasilge
Created November 3, 2022 20:41
Show Gist options
  • Save juliasilge/2708e8b4c9f20a3308afe9101c06ab4d to your computer and use it in GitHub Desktop.
Save juliasilge/2708e8b4c9f20a3308afe9101c06ab4d to your computer and use it in GitHub Desktop.
#TidyTuesday horror movie 😱 descriptions
library(tidyverse)
library(tidytext)
library(ggraph)
library(igraph)
#> 
#> Attaching package: 'igraph'
#> The following objects are masked from 'package:dplyr':
#> 
#>     as_data_frame, groups, union
#> The following objects are masked from 'package:purrr':
#> 
#>     compose, simplify
#> The following object is masked from 'package:tidyr':
#> 
#>     crossing
#> The following object is masked from 'package:tibble':
#> 
#>     as_data_frame
#> The following objects are masked from 'package:stats':
#> 
#>     decompose, spectrum
#> The following object is masked from 'package:base':
#> 
#>     union
library(widyr)

# Download the #TidyTuesday 2022-11-01 horror movies data set as a tibble.
horror_movies <- read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv"
)
#> Rows: 32540 Columns: 20
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr  (10): original_title, title, original_language, overview, tagline, post...
#> dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
#> lgl   (1): adult
#> date  (1): release_date
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# One row per (movie, word): split each movie's `overview` text into
# lower-cased word tokens, then drop common stop words.
#
# The join key is given explicitly so that the stop-word removal always matches
# on `word` only. A natural join would silently pick up any other column name
# the two tables happen to share, and it prints a "Joining, by" message on
# every run; with an explicit key that message is no longer emitted.
horror_tidy <-
    horror_movies %>%
    unnest_tokens(word, overview) %>%
    anti_join(get_stopwords(), by = "word")

# Pairwise correlation between words, based on which movies (`id`) they
# appear in. Only words occurring more than 300 times overall are considered,
# and only pairs with correlation above 0.15 are kept.
words_cooccur <-
    horror_tidy %>%
    # attach each word's total frequency to its rows, filter, then drop it
    add_count(word, name = "word_total") %>%
    filter(word_total > 300) %>%
    select(-word_total) %>%
    # upper = FALSE keeps each unordered pair once; sort = TRUE orders by
    # descending correlation
    pairwise_cor(word, id, sort = TRUE, upper = FALSE) %>%
    filter(correlation > 0.15)
    
# Overall frequency of every word that takes part in at least one retained
# correlated pair; used below as the vertex table (node size) of the graph.
word_counts <-
    horror_tidy %>%
    count(word, sort = TRUE) %>%
    filter(word %in% c(words_cooccur$item1, words_cooccur$item2))

# Draw the word network: nodes are words (sized by frequency `n`), edges link
# correlated word pairs (opacity mapped to `correlation`).
# The seed is fixed so that any randomness used while laying out and drawing
# the graph is reproducible between runs.
set.seed(2021)
graph_from_data_frame(d = words_cooccur, vertices = word_counts) %>%
    ggraph(layout = "nicely") +
    geom_edge_link(
        aes(edge_alpha = correlation),
        edge_width = 0.5,
        show.legend = FALSE
    ) +
    geom_node_point(aes(size = n), alpha = 0.8, color = "midnightblue") +
    geom_node_text(aes(label = name), family = "IBMPlexSans", repel = TRUE) +
    labs(
        title = "What words are used to describe horror movies?",
        subtitle = "Common words that are highly correlated in #TidyTuesday horror movie overviews"
    ) +
    # theme() must follow theme_graph() so the legend setting is not overridden
    theme_graph(base_family = "IBMPlexSans-Bold") +
    theme(legend.position = "none")

Created on 2022-11-03 with reprex v2.0.2

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment