# Created May 26, 2020 00:31
# Source: GitHub gist andrewheiss/75445543e2095d598832d7c948851365
# (save it to your computer to use it in GitHub Desktop).
# Note from GitHub: this file contains bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
# Packages used by this script. gutenbergr provides gutenberg_download() and
# cleanNLP provides the cnlp_*() annotation functions; the data wrangling and
# plotting verbs come from the tidyverse.
library(tidyverse)
library(tidytext)
library(gutenbergr)
library(cleanNLP)
# Download Project Gutenberg book 514 (one row per line of text), keeping the
# title as an extra metadata column
little_women_raw <- gutenberg_download(514, meta_fields = "title")

# Drop the first 69 rows (presumably front matter before the first chapter;
# TODO confirm against the current Gutenberg text), then number chapters by
# keeping a running count of lines that start with "CHAPTER"
little_women <- little_women_raw %>%
  slice(70:n()) %>%
  mutate(
    chapter_start = str_detect(text, "^CHAPTER"),
    chapter_number = cumsum(chapter_start)
  ) %>%
  # Only the text and its chapter number are needed downstream
  select(-gutenberg_id, -title, -chapter_start)
# Structure the data for the POS tagger: one row per chapter, with all of the
# chapter's lines pasted together into a single space-separated string
little_women_to_tag <- little_women %>%
  group_by(chapter_number) %>%
  # Summarizing over the only grouping variable returns an ungrouped tibble
  # with columns chapter_number and text
  summarize(text = paste(text, collapse = " "))
# Use the R-only part-of-speech tagger
cnlp_init_udpipe()

# Annotate each chapter, using the chapter number as the document id
# This took ≈180 seconds to run
little_women_tagged <- cnlp_annotate(
  little_women_to_tag,
  text_name = "text",
  doc_name = "chapter_number"
)

# Token-level annotations
little_women_tokens <- little_women_tagged$token

# Keep only the tokens whose universal part-of-speech tag is "PROPN"
proper_nouns <- little_women_tokens %>%
  filter(upos == "PROPN")
# Count how often each of the four sisters is mentioned in every chapter and
# convert the counts into within-chapter proportions
main_characters_by_chapter <- proper_nouns %>%
  filter(lemma %in% c("Meg", "Jo", "Beth", "Amy")) %>%
  # count() tallies rows per (doc_id, lemma) into column `n` and returns an
  # ungrouped tibble, so no leftover grouping leaks into later steps
  count(doc_id, lemma) %>%
  mutate(
    name = factor(
      lemma,
      levels = c("Meg", "Jo", "Beth", "Amy"),
      ordered = TRUE
    )
  ) %>%
  rename(chapter = doc_id) %>%
  # Share of the four sisters' mentions within each chapter
  group_by(chapter) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  mutate(
    chapter_name = paste("Chapter", chapter),
    # Order the facet labels by the numeric chapter value rather than by row
    # order, so they stay in book order even if the annotator returns doc_id
    # as character (which would otherwise sort as 1, 10, 11, ...).
    # NOTE(review): doc_id's type is not visible here; confirm.
    chapter_name = fct_reorder(chapter_name, as.numeric(as.character(chapter)))
  )
# Meh: bar chart of raw mention counts per sister, one panel per chapter
ggplot(main_characters_by_chapter, aes(x = name, y = n, fill = name)) +
  geom_col() +
  facet_wrap(vars(chapter))

# Super meh: one point per sister, sized by mention count, one panel per
# chapter
ggplot(
  main_characters_by_chapter,
  aes(x = name, y = 1, size = n, color = name)
) +
  geom_point() +
  facet_wrap(vars(chapter))
# Awesome: one stacked bar per chapter panel, with each sister's share of the
# chapter's mentions (prop) on the x-axis
ggplot(
  main_characters_by_chapter,
  aes(x = prop, y = 1, fill = fct_rev(name))
) +
  geom_col(position = position_stack()) +
  # No axis padding, so each bar fills its panel edge to edge
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_viridis_d(option = "plasma", end = 0.9, name = NULL) +
  # The fill uses the reversed factor, so flip the legend back to list the
  # names in their original level order
  guides(fill = guide_legend(reverse = TRUE)) +
  labs(
    x = NULL, y = NULL,
    title = "Proportion of mentions of each\nLittle Woman per chapter",
    subtitle = "Jo basically dominates the last third of the book"
  ) +
  facet_wrap(vars(chapter_name), nrow = 6) +
  # NOTE(review): assumes the "Halis GR" font is available to the graphics
  # device; confirm, or swap in an installed family
  theme_bw(base_family = "Halis GR") +
  theme(
    legend.position = "top",
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    strip.background = element_rect(fill = "white"),
    legend.text = element_text(face = "bold", size = rel(1)),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.7)),
    plot.subtitle = element_text(hjust = 0.5, size = rel(1.1))
  )
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.