Skip to content

Instantly share code, notes, and snippets.

@trinker
Last active May 5, 2020 15:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save trinker/841d5d7834668357a0330a4c5eebe22b to your computer and use it in GitHub Desktop.
Save trinker/841d5d7834668357a0330a4c5eebe22b to your computer and use it in GitHub Desktop.
wizzard of oz sentiment
if (!require("pacman")) install.packages("pacman")
pacman::p_load(robotstxt, dplyr, textshape, textclean, textreadr, xml2, rvest, stringi, sentimentr, magrittr, numform)
path <- 'http://www.wendyswizardofoz.com/printablescript.htm'
paths_allowed(path)
abbreviation_map <- list(
abb = c('o.s.', 'b.g.'),
rep = c('off screen', 'back ground')
)
dat <- path %>%
xml2::read_html() %>%
rvest::html_nodes('pre') %>%
rvest::html_text() %>%
stringi::stri_split_regex('(\n)+') %>%
`[[`(1) %>%
textshape::split_match_regex('^([A-Z]{3,}\\s+){0,1}([A-Z]{2,}\\b)\\s*-{1,2}', include = TRUE) %>%
lapply(function(x){
textshape::split_match_regex(x, "^(\t){4}([A-Z]{3,}\\s+){0,2}([A-Z]{2,}\\b)", include = TRUE)
}) %>%
unlist(recursive = FALSE) %>%
textshape::combine() %>%
stringi::stri_replace_all_regex('\t\t\t\t\t\t\t\tFADE OUT:.+$', '') %>%
stringi::stri_replace_all_regex('(^[^\t].+$)', '{{$1}}') %>%
stringi::stri_replace_all_regex('LAP DISSOLVE TO:', '') %>%
textshape::split_match_regex_to_transcript("^(\t){4}([A-Z']{3,}[ ]+){0,2}([A-Z']{2,}\\b)", skip = 15) %>%
dplyr::mutate_each(funs(trimws)) %>%
dplyr::mutate_at(vars(Dialogue), textclean::mgsub, pattern = abbreviation_map$abb, replacement = abbreviation_map$rep) %>%
tibble::as_tibble() %>%
mutate(
Dialogue = gsub('-{2,}', '... ', Dialogue),
Directions = sapply(stringi::stri_extract_all_regex(Dialogue, '\\{{2}.+?\\}{2}'), function(x) {
paste(gsub('(\\{{2})|(\\}{2})', '', unlist(x)), collapse = ' ')
}) %>%
ifelse(. == "NA", NA, .),
Dialogue = gsub('\\{{2}.+?\\}{2}', ' ', Dialogue)
) %>%
textshape::split_speaker(sep = 'AND') %>%
dplyr::rename(original_element_id = element_id) %>%
dplyr::select(Person, Dialogue) %>%
textshape::split_sentence()
dat %>%
textreadr::peek(n = 20, 20)
dat %>%
select(Person, element_id) %>%
distinct() %>%
count(Person) %>%
arrange(desc(n))
dat %>%
group_by(Person) %>%
summarize(n_words = sum(stringi::stri_count_words(Dialogue), na.rm = TRUE)) %>%
arrange(desc(n_words))
hash_sentiment_jockers2 <- update_polarity_table(
lexicon::hash_sentiment_jockers,
x = data.frame(x = c("witch is dead"), y = c(2), stringsAsFactors = FALSE)
)
senti_dat <- dat %>%
dplyr::rename(original_element_id = element_id) %>%
sentimentr::get_sentences() %>%
sentimentr::sentiment_by(polarity_dt = hash_sentiment_jockers2)
plot(senti_dat, low_pass_size = 10)
plot(senti_dat, scale_range = TRUE, low_pass_size = 10)
mvg <- function(y, win = 4){
len <- length(y)
out <- sapply(seq_along(y), function(i){
i <- (i-win):(i+win)
mean(y[i[i > 0 & i < len]])
})
class(out) <- c('mvg', class(out))
attributes(out)[['orig']] <- y
out
}
mvg2 <- function(x, s1 =4, s2=2, ...){
mvg(mvg(x, s1), s2)
}
event_dat <- dat %>%
mutate(id = 1:n()) %>%
textclean::keep_row(
'Dialogue',
c(
'Are you a good witch, or a bad witch',
'That way is a very nice way.',
'I\'m melting!',
'Go away and come back tomorrow!',
'Why, anybody can have a brain.',
'Oh, come back here!',
'And think to yourself',
'Why, it\'s a man!',
'What do you think you\'re doing?',
'Put \'em up!',
'and there they\'ll stay!',
'Who killed my sister?',
'soothing to the smell!',
'Can\'t you read?',
'We want to see the Wizard right away',
'We\'ll soon find the',
'What an unexpected pleasure',
'Do you suppose there is such a place',
'I want to see you and your wife right',
'her face is careworn'
)
) %>%
mutate(
event = c(
'Somewhere Over the Rainbow',
'Miss Gulch Wants Toto',
'Professor Looks into Crystal Ball',
'Meets Glinda',
'Meets Wicked Witch',
'Dorthy Aquires Slippers',
'Meets Scarecrow',
'Angry Apple Trees',
'Meets Tin Man',
'Meets Lion',
'Poppy Field',
'Arrival at Emerald City',
'Oz Refuses to See Them',
'Meets Oz',
'Witch Takes Toto',
'Wicked Witch Dies',
'Oz Breaks Promise',
'Oz Keeps Promise',
'Toto Runs Away',
'Clicks Red Slippers'
) %>% numform::f_wrap(12)
) %>%
left_join(
senti_dat %>%
mutate(
id = 1:n(),
rollave = mvg2(ave_sentiment, 8, 10),
smoothed = mvg2(rollave, 20, 60)
) %>%
select(id, rollave),
by = 'id'
)
senti_dat %>%
mutate(
rollave = mvg2(ave_sentiment, 8, 10),
smoothed = mvg2(rollave, 20, 60)
) %>%
ggplot() +
geom_hline(yintercept = 0, color = 'grey50', linetype = 'dashed', size = .65) +
geom_line(color = 'grey60', aes( x= element_id, y = rollave), size = .8, linetype = 'dotted') +
geom_line(color = 'purple', aes( x= element_id, y = smoothed), size = 1) +
geom_point(data = event_dat, aes(x = id, y = rollave), color = 'blue') +
ggrepel::geom_text_repel(data = event_dat, aes(x = id, y = rollave, label = event),
size = 3, color = 'grey50', alpha = .6) +
theme_minimal() +
scale_y_continuous(labels = numform::ff_num(digits = 1)) +
theme(
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.text.y = element_text(color = 'gray65', size = 11),
axis.title.y = element_text(color = 'gray65', size = 11, angle = 0),
axis.text.x = element_blank(),
axis.ticks = element_blank()
) +
labs(
x = NULL, y = 'Sentiment',
title = 'Wizzard of Oz Sentiment',
subtitle = numform::f_wrap(c(
"The grey dotted line show less smoothed sentiment scores.",
'The purple line shows a more smoothed sentiment across the story.'
), 140, collapse = TRUE)
) +
coord_cartesian(xlim = c(0, 2580))
@trinker
Copy link
Author

trinker commented Oct 31, 2017

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment