Skip to content

Instantly share code, notes, and snippets.

@andrewheiss
Created May 26, 2020 00:31
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrewheiss/75445543e2095d598832d7c948851365 to your computer and use it in GitHub Desktop.
Save andrewheiss/75445543e2095d598832d7c948851365 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(tidytext)
library(gutenbergr)
library(cleanNLP)
# Download Project Gutenberg book 514, carrying the title along as metadata
little_women_raw <- gutenberg_download(514, meta_fields = "title")

# Keep rows 70 onward (presumably skipping front matter; the cutoff is
# specific to this edition), then number chapters: each line that begins with
# "CHAPTER" increments a running counter
little_women <- little_women_raw %>%
  slice(70:n()) %>%
  mutate(chapter_start = str_starts(text, "CHAPTER")) %>%
  mutate(chapter_number = cumsum(chapter_start)) %>%
  select(-c(gutenberg_id, title, chapter_start))
# Shape the text for the POS tagger: one row per chapter, with every line of
# that chapter pasted together into a single space-separated string
little_women_to_tag <- little_women %>%
  group_by(chapter_number) %>%
  summarize(text = paste(text, collapse = " ")) %>%
  ungroup()
# Initialise cleanNLP's udpipe backend (the R-only part-of-speech tagger)
cnlp_init_udpipe()

# Annotate each chapter; `chapter_number` is used as the document identifier.
# Author's note: this took ≈180 seconds to run
little_women_tagged <- little_women_to_tag %>%
  cnlp_annotate(
    text_name = "text",
    doc_name = "chapter_number"
  )

# Token-level annotation table, one row per token
little_women_tokens <- little_women_tagged[["token"]]
# Every token the tagger labelled as a proper noun
proper_nouns <- little_women_tokens %>%
  filter(upos == "PROPN")

# Mentions of each of the four sisters per chapter, as counts (`n`) and as a
# share of that chapter's sister mentions (`prop`)
main_characters_by_chapter <- proper_nouns %>%
  filter(lemma %in% c("Meg", "Jo", "Beth", "Amy")) %>%
  # count() returns an ungrouped tibble, so nothing downstream depends on
  # grouping left over from a summarize()
  count(doc_id, lemma) %>%
  mutate(name = factor(lemma, levels = c("Meg", "Jo", "Beth", "Amy"), ordered = TRUE)) %>%
  rename(chapter = doc_id) %>%
  # The annotator may hand doc_id back as character (or factor); coerce to
  # integer and sort explicitly so chapters run 1, 2, 3, ... rather than
  # 1, 10, 11, ... when the facet labels are ordered below
  mutate(chapter = as.integer(as.character(chapter))) %>%
  arrange(chapter) %>%
  group_by(chapter) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  # fct_inorder() fixes facet order to the (now numeric) row order
  mutate(chapter_name = fct_inorder(paste("Chapter", chapter)))
# Attempt 1 (meh): raw mention counts as bars, one small panel per chapter
ggplot(
  data = main_characters_by_chapter,
  mapping = aes(x = name, y = n, fill = name)
) +
  geom_col() +
  facet_wrap(facets = vars(chapter))
# Attempt 2 (super meh): one dot per sister, sized by mention count,
# one small panel per chapter
ggplot(
  data = main_characters_by_chapter,
  mapping = aes(x = name, y = 1, size = n, color = name)
) +
  geom_point() +
  facet_wrap(facets = vars(chapter))
# Attempt 3 (awesome): one full-width stacked bar per chapter showing each
# sister's share of that chapter's mentions
ggplot(
  data = main_characters_by_chapter,
  mapping = aes(x = prop, y = 1, fill = fct_rev(name))
) +
  geom_col(position = "stack") +
  # Remove scale padding so each bar fills its panel edge to edge
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_viridis_d(option = "plasma", end = 0.9, name = NULL) +
  # fill uses the reversed factor, so flip the legend back to the
  # Meg, Jo, Beth, Amy order
  guides(fill = guide_legend(reverse = TRUE)) +
  labs(
    title = "Proportion of mentions of each\nLittle Woman per chapter",
    subtitle = "Jo basically dominates the last third of the book",
    x = NULL,
    y = NULL
  ) +
  facet_wrap(facets = vars(chapter_name), nrow = 6) +
  # NOTE(review): assumes the "Halis GR" font is installed on this machine
  theme_bw(base_family = "Halis GR") +
  theme(
    legend.position = "top",
    legend.text = element_text(face = "bold", size = rel(1)),
    # The bars are read against the legend, so axis text and ticks are dropped
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    strip.background = element_rect(fill = "white"),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.7)),
    plot.subtitle = element_text(hjust = 0.5, size = rel(1.1))
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment