Last active
May 5, 2020 15:06
-
-
Save trinker/841d5d7834668357a0330a4c5eebe22b to your computer and use it in GitHub Desktop.
wizzard of oz sentiment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if (!require("pacman")) install.packages("pacman") | |
pacman::p_load(robotstxt, dplyr, textshape, textclean, textreadr, xml2, rvest, stringi, sentimentr, magrittr, numform) | |
path <- 'http://www.wendyswizardofoz.com/printablescript.htm' | |
paths_allowed(path) | |
abbreviation_map <- list( | |
abb = c('o.s.', 'b.g.'), | |
rep = c('off screen', 'back ground') | |
) | |
dat <- path %>% | |
xml2::read_html() %>% | |
rvest::html_nodes('pre') %>% | |
rvest::html_text() %>% | |
stringi::stri_split_regex('(\n)+') %>% | |
`[[`(1) %>% | |
textshape::split_match_regex('^([A-Z]{3,}\\s+){0,1}([A-Z]{2,}\\b)\\s*-{1,2}', include = TRUE) %>% | |
lapply(function(x){ | |
textshape::split_match_regex(x, "^(\t){4}([A-Z]{3,}\\s+){0,2}([A-Z]{2,}\\b)", include = TRUE) | |
}) %>% | |
unlist(recursive = FALSE) %>% | |
textshape::combine() %>% | |
stringi::stri_replace_all_regex('\t\t\t\t\t\t\t\tFADE OUT:.+$', '') %>% | |
stringi::stri_replace_all_regex('(^[^\t].+$)', '{{$1}}') %>% | |
stringi::stri_replace_all_regex('LAP DISSOLVE TO:', '') %>% | |
textshape::split_match_regex_to_transcript("^(\t){4}([A-Z']{3,}[ ]+){0,2}([A-Z']{2,}\\b)", skip = 15) %>% | |
dplyr::mutate_each(funs(trimws)) %>% | |
dplyr::mutate_at(vars(Dialogue), textclean::mgsub, pattern = abbreviation_map$abb, replacement = abbreviation_map$rep) %>% | |
tibble::as_tibble() %>% | |
mutate( | |
Dialogue = gsub('-{2,}', '... ', Dialogue), | |
Directions = sapply(stringi::stri_extract_all_regex(Dialogue, '\\{{2}.+?\\}{2}'), function(x) { | |
paste(gsub('(\\{{2})|(\\}{2})', '', unlist(x)), collapse = ' ') | |
}) %>% | |
ifelse(. == "NA", NA, .), | |
Dialogue = gsub('\\{{2}.+?\\}{2}', ' ', Dialogue) | |
) %>% | |
textshape::split_speaker(sep = 'AND') %>% | |
dplyr::rename(original_element_id = element_id) %>% | |
dplyr::select(Person, Dialogue) %>% | |
textshape::split_sentence() | |
dat %>% | |
textreadr::peek(n = 20, 20) | |
dat %>% | |
select(Person, element_id) %>% | |
distinct() %>% | |
count(Person) %>% | |
arrange(desc(n)) | |
dat %>% | |
group_by(Person) %>% | |
summarize(n_words = sum(stringi::stri_count_words(Dialogue), na.rm = TRUE)) %>% | |
arrange(desc(n_words)) | |
hash_sentiment_jockers2 <- update_polarity_table( | |
lexicon::hash_sentiment_jockers, | |
x = data.frame(x = c("witch is dead"), y = c(2), stringsAsFactors = FALSE) | |
) | |
senti_dat <- dat %>% | |
dplyr::rename(original_element_id = element_id) %>% | |
sentimentr::get_sentences() %>% | |
sentimentr::sentiment_by(polarity_dt = hash_sentiment_jockers2) | |
plot(senti_dat, low_pass_size = 10) | |
plot(senti_dat, scale_range = TRUE, low_pass_size = 10) | |
mvg <- function(y, win = 4){ | |
len <- length(y) | |
out <- sapply(seq_along(y), function(i){ | |
i <- (i-win):(i+win) | |
mean(y[i[i > 0 & i < len]]) | |
}) | |
class(out) <- c('mvg', class(out)) | |
attributes(out)[['orig']] <- y | |
out | |
} | |
mvg2 <- function(x, s1 =4, s2=2, ...){ | |
mvg(mvg(x, s1), s2) | |
} | |
event_dat <- dat %>% | |
mutate(id = 1:n()) %>% | |
textclean::keep_row( | |
'Dialogue', | |
c( | |
'Are you a good witch, or a bad witch', | |
'That way is a very nice way.', | |
'I\'m melting!', | |
'Go away and come back tomorrow!', | |
'Why, anybody can have a brain.', | |
'Oh, come back here!', | |
'And think to yourself', | |
'Why, it\'s a man!', | |
'What do you think you\'re doing?', | |
'Put \'em up!', | |
'and there they\'ll stay!', | |
'Who killed my sister?', | |
'soothing to the smell!', | |
'Can\'t you read?', | |
'We want to see the Wizard right away', | |
'We\'ll soon find the', | |
'What an unexpected pleasure', | |
'Do you suppose there is such a place', | |
'I want to see you and your wife right', | |
'her face is careworn' | |
) | |
) %>% | |
mutate( | |
event = c( | |
'Somewhere Over the Rainbow', | |
'Miss Gulch Wants Toto', | |
'Professor Looks into Crystal Ball', | |
'Meets Glinda', | |
'Meets Wicked Witch', | |
'Dorthy Aquires Slippers', | |
'Meets Scarecrow', | |
'Angry Apple Trees', | |
'Meets Tin Man', | |
'Meets Lion', | |
'Poppy Field', | |
'Arrival at Emerald City', | |
'Oz Refuses to See Them', | |
'Meets Oz', | |
'Witch Takes Toto', | |
'Wicked Witch Dies', | |
'Oz Breaks Promise', | |
'Oz Keeps Promise', | |
'Toto Runs Away', | |
'Clicks Red Slippers' | |
) %>% numform::f_wrap(12) | |
) %>% | |
left_join( | |
senti_dat %>% | |
mutate( | |
id = 1:n(), | |
rollave = mvg2(ave_sentiment, 8, 10), | |
smoothed = mvg2(rollave, 20, 60) | |
) %>% | |
select(id, rollave), | |
by = 'id' | |
) | |
senti_dat %>% | |
mutate( | |
rollave = mvg2(ave_sentiment, 8, 10), | |
smoothed = mvg2(rollave, 20, 60) | |
) %>% | |
ggplot() + | |
geom_hline(yintercept = 0, color = 'grey50', linetype = 'dashed', size = .65) + | |
geom_line(color = 'grey60', aes( x= element_id, y = rollave), size = .8, linetype = 'dotted') + | |
geom_line(color = 'purple', aes( x= element_id, y = smoothed), size = 1) + | |
geom_point(data = event_dat, aes(x = id, y = rollave), color = 'blue') + | |
ggrepel::geom_text_repel(data = event_dat, aes(x = id, y = rollave, label = event), | |
size = 3, color = 'grey50', alpha = .6) + | |
theme_minimal() + | |
scale_y_continuous(labels = numform::ff_num(digits = 1)) + | |
theme( | |
panel.grid.major.x = element_blank(), | |
panel.grid.minor.x = element_blank(), | |
axis.text.y = element_text(color = 'gray65', size = 11), | |
axis.title.y = element_text(color = 'gray65', size = 11, angle = 0), | |
axis.text.x = element_blank(), | |
axis.ticks = element_blank() | |
) + | |
labs( | |
x = NULL, y = 'Sentiment', | |
title = 'Wizzard of Oz Sentiment', | |
subtitle = numform::f_wrap(c( | |
"The grey dotted line show less smoothed sentiment scores.", | |
'The purple line shows a more smoothed sentiment across the story.' | |
), 140, collapse = TRUE) | |
) + | |
coord_cartesian(xlim = c(0, 2580)) | |
Author
trinker
commented
Oct 31, 2017
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment