wizzard of oz sentiment
if (!require("pacman")) install.packages("pacman") | |
pacman::p_load(robotstxt, dplyr, textshape, textclean, textreadr, xml2, rvest, stringi, sentimentr, magrittr, numform) | |
path <- 'http://www.wendyswizardofoz.com/printablescript.htm' | |
paths_allowed(path) | |
abbreviation_map <- list( | |
abb = c('o.s.', 'b.g.'), | |
rep = c('off screen', 'back ground') | |
) | |
dat <- path %>% | |
xml2::read_html() %>% | |
rvest::html_nodes('pre') %>% | |
rvest::html_text() %>% | |
stringi::stri_split_regex('(\n)+') %>% | |
`[[`(1) %>% | |
textshape::split_match_regex('^([A-Z]{3,}\\s+){0,1}([A-Z]{2,}\\b)\\s*-{1,2}', include = TRUE) %>% | |
lapply(function(x){ | |
textshape::split_match_regex(x, "^(\t){4}([A-Z]{3,}\\s+){0,2}([A-Z]{2,}\\b)", include = TRUE) | |
}) %>% | |
unlist(recursive = FALSE) %>% | |
textshape::combine() %>% | |
stringi::stri_replace_all_regex('\t\t\t\t\t\t\t\tFADE OUT:.+$', '') %>% | |
stringi::stri_replace_all_regex('(^[^\t].+$)', '{{$1}}') %>% | |
stringi::stri_replace_all_regex('LAP DISSOLVE TO:', '') %>% | |
textshape::split_match_regex_to_transcript("^(\t){4}([A-Z']{3,}[ ]+){0,2}([A-Z']{2,}\\b)", skip = 15) %>% | |
dplyr::mutate_each(funs(trimws)) %>% | |
dplyr::mutate_at(vars(Dialogue), textclean::mgsub, pattern = abbreviation_map$abb, replacement = abbreviation_map$rep) %>% | |
tibble::as_tibble() %>% | |
mutate( | |
Dialogue = gsub('-{2,}', '... ', Dialogue), | |
Directions = sapply(stringi::stri_extract_all_regex(Dialogue, '\\{{2}.+?\\}{2}'), function(x) { | |
paste(gsub('(\\{{2})|(\\}{2})', '', unlist(x)), collapse = ' ') | |
}) %>% | |
ifelse(. == "NA", NA, .), | |
Dialogue = gsub('\\{{2}.+?\\}{2}', ' ', Dialogue) | |
) %>% | |
textshape::split_speaker(sep = 'AND') %>% | |
dplyr::rename(original_element_id = element_id) %>% | |
dplyr::select(Person, Dialogue) %>% | |
textshape::split_sentence() | |
dat %>% | |
textreadr::peek(n = 20, 20) | |
dat %>% | |
select(Person, element_id) %>% | |
distinct() %>% | |
count(Person) %>% | |
arrange(desc(n)) | |
dat %>% | |
group_by(Person) %>% | |
summarize(n_words = sum(stringi::stri_count_words(Dialogue), na.rm = TRUE)) %>% | |
arrange(desc(n_words)) | |
hash_sentiment_jockers2 <- update_polarity_table( | |
lexicon::hash_sentiment_jockers, | |
x = data.frame(x = c("witch is dead"), y = c(2), stringsAsFactors = FALSE) | |
) | |
senti_dat <- dat %>% | |
dplyr::rename(original_element_id = element_id) %>% | |
sentimentr::get_sentences() %>% | |
sentimentr::sentiment_by(polarity_dt = hash_sentiment_jockers2) | |
plot(senti_dat, low_pass_size = 10) | |
plot(senti_dat, scale_range = TRUE, low_pass_size = 10) | |
mvg <- function(y, win = 4){ | |
len <- length(y) | |
out <- sapply(seq_along(y), function(i){ | |
i <- (i-win):(i+win) | |
mean(y[i[i > 0 & i < len]]) | |
}) | |
class(out) <- c('mvg', class(out)) | |
attributes(out)[['orig']] <- y | |
out | |
} | |
mvg2 <- function(x, s1 =4, s2=2, ...){ | |
mvg(mvg(x, s1), s2) | |
} | |
event_dat <- dat %>% | |
mutate(id = 1:n()) %>% | |
textclean::keep_row( | |
'Dialogue', | |
c( | |
'Are you a good witch, or a bad witch', | |
'That way is a very nice way.', | |
'I\'m melting!', | |
'Go away and come back tomorrow!', | |
'Why, anybody can have a brain.', | |
'Oh, come back here!', | |
'And think to yourself', | |
'Why, it\'s a man!', | |
'What do you think you\'re doing?', | |
'Put \'em up!', | |
'and there they\'ll stay!', | |
'Who killed my sister?', | |
'soothing to the smell!', | |
'Can\'t you read?', | |
'We want to see the Wizard right away', | |
'We\'ll soon find the', | |
'What an unexpected pleasure', | |
'Do you suppose there is such a place', | |
'I want to see you and your wife right', | |
'her face is careworn' | |
) | |
) %>% | |
mutate( | |
event = c( | |
'Somewhere Over the Rainbow', | |
'Miss Gulch Wants Toto', | |
'Professor Looks into Crystal Ball', | |
'Meets Glinda', | |
'Meets Wicked Witch', | |
'Dorthy Aquires Slippers', | |
'Meets Scarecrow', | |
'Angry Apple Trees', | |
'Meets Tin Man', | |
'Meets Lion', | |
'Poppy Field', | |
'Arrival at Emerald City', | |
'Oz Refuses to See Them', | |
'Meets Oz', | |
'Witch Takes Toto', | |
'Wicked Witch Dies', | |
'Oz Breaks Promise', | |
'Oz Keeps Promise', | |
'Toto Runs Away', | |
'Clicks Red Slippers' | |
) %>% numform::f_wrap(12) | |
) %>% | |
left_join( | |
senti_dat %>% | |
mutate( | |
id = 1:n(), | |
rollave = mvg2(ave_sentiment, 8, 10), | |
smoothed = mvg2(rollave, 20, 60) | |
) %>% | |
select(id, rollave), | |
by = 'id' | |
) | |
senti_dat %>% | |
mutate( | |
rollave = mvg2(ave_sentiment, 8, 10), | |
smoothed = mvg2(rollave, 20, 60) | |
) %>% | |
ggplot() + | |
geom_hline(yintercept = 0, color = 'grey50', linetype = 'dashed', size = .65) + | |
geom_line(color = 'grey60', aes( x= element_id, y = rollave), size = .8, linetype = 'dotted') + | |
geom_line(color = 'purple', aes( x= element_id, y = smoothed), size = 1) + | |
geom_point(data = event_dat, aes(x = id, y = rollave), color = 'blue') + | |
ggrepel::geom_text_repel(data = event_dat, aes(x = id, y = rollave, label = event), | |
size = 3, color = 'grey50', alpha = .6) + | |
theme_minimal() + | |
scale_y_continuous(labels = numform::ff_num(digits = 1)) + | |
theme( | |
panel.grid.major.x = element_blank(), | |
panel.grid.minor.x = element_blank(), | |
axis.text.y = element_text(color = 'gray65', size = 11), | |
axis.title.y = element_text(color = 'gray65', size = 11, angle = 0), | |
axis.text.x = element_blank(), | |
axis.ticks = element_blank() | |
) + | |
labs( | |
x = NULL, y = 'Sentiment', | |
title = 'Wizzard of Oz Sentiment', | |
subtitle = numform::f_wrap(c( | |
"The grey dotted line show less smoothed sentiment scores.", | |
'The purple line shows a more smoothed sentiment across the story.' | |
), 140, collapse = TRUE) | |
) + | |
coord_cartesian(xlim = c(0, 2580)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
trinker commentedOct 31, 2017