Created
December 15, 2017 13:38
-
-
Save giocomai/b15fd8125b19a8dff999a87c8184554c to your computer and use it in GitHub Desktop.
Exploring Putin’s annual news conference 2017
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Exploring Putin's annual news conference" | |
author: "Giorgio Comai" | |
date: '2017-12-15' | |
--- | |
```{r setup, include=FALSE, echo=TRUE, message=FALSE} | |
# Chunk defaults for the rendered document: hide code, messages and warnings.
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)

# pacman takes care of package installation/loading; install it on first use.
if (!require("pacman")) {
  install.packages("pacman")
}

# Install (when missing) and load everything the document relies on.
pacman::p_load(
  "tidyverse", # data processing
  "reshape2",  # data processing
  "rvest",     # extracting from html
  "tidytext",  # transforming text into tidy data frames
  "DT",        # interactive table
  "knitr",     # rendering this document
  "wordcloud"  # you guessed it
)
``` | |
## Downloading the speeches | |
The full transcript of the Russian president's annual news conference is available on [the official website of the Kremlin](http://en.kremlin.ru/), translated into English. Here are the direct links to each of the transcripts:
```{r} | |
# Links to the English transcript of each annual news conference.
links <- tribble(
  ~Year, ~President, ~Link,
  2017, "Putin", "http://en.kremlin.ru/events/president/news/56378",
  2016, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/53573",
  2015, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/50971",
  2014, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/47250",
  2013, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/19859",
  2012, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/17173"
)

# Store html files locally. `recursive = TRUE` also creates the parent folders
# ("static", "static/putin-presser") when they do not exist yet; without it the
# call fails silently (warnings are hidden) and every download below errors.
html_dir <- file.path("static", "putin-presser", "html")
dir.create(path = html_dir, recursive = TRUE, showWarnings = FALSE)

# Add the local path of each transcript ("<Year>-<President>.html") to the table.
links <- links %>%
  dplyr::mutate(
    LocalLink = file.path(html_dir, paste0(Year, "-", President, ".html"))
  )

# Download each page only if it was not previously downloaded, pausing one
# second after every request.
for (i in seq_along(links$Link)) {
  if (!file.exists(links$LocalLink[i])) {
    download.file(url = links$Link[i], destfile = links$LocalLink[i])
    Sys.sleep(time = 1)
  }
}

# Show the source links in the rendered document.
knitr::kable(links %>% select("Year", "President", "Link"))
``` | |
```{r} | |
# extract speeches | |
# Read every locally stored page and keep the text of its transcript container
# (the <div> whose class is exactly "read__content"); one string per presser.
text <- purrr::map_chr(
  .x = links$LocalLink,
  .f = function(html_file) {
    page <- read_html(html_file)
    transcript <- rvest::html_nodes(page, xpath = "//div[@class='read__content']")
    html_text(transcript)
  }
)
``` | |
```{r} | |
# Substitutions applied to the raw transcripts, in this order: two plurals are
# collapsed to their singular form and a few filler tokens are removed.
# Matching is case-sensitive, exactly as written here.
replacements <- c(
  "issues" = "issue",
  "citizens" = "citizen",
  "it’s" = "",
  "don't" = "",
  "don’t" = "",
  "I’m" = "",
  "Vladimir" = "",
  "Putin" = "",
  "question" = "",
  "Question" = ""
)
for (i in seq_along(replacements)) {
  text <- gsub(
    pattern = names(replacements)[i],
    replacement = replacements[[i]],
    x = text
  )
}

# One row per presser: identifier ("<President> <Year>") and cleaned transcript.
# `tibble()` replaces the deprecated `data_frame()`.
speechesOriginal <- tibble(id = paste(links$President, links$Year), txt = text)

# One row per lower-cased token, stop words included (used for word counts).
speechesOriginal_byWord <- speechesOriginal %>%
  unnest_tokens(word, txt, to_lower = TRUE)

# Same tokens without the standard list of stop words; reuses the tokenised
# table instead of tokenising the transcripts a second time.
speeches <- speechesOriginal_byWord %>%
  anti_join(stop_words, by = "word")

# Per-presser word counts (`n`) plus the count of each word across all
# pressers (`total`), ordered by `total`. Stop words are already gone from
# `speeches`, so they are not filtered again here.
speeches_by_speech <- speeches %>%
  count(id, word, sort = TRUE) %>%
  group_by(word) %>%
  mutate(total = sum(n)) %>%
  arrange(total) %>%
  ungroup()

saveRDS(
  object = speeches_by_speech,
  file = file.path("static", "putin-presser", "speeches_by_speech.rds")
)
## this exports the documents as extracted; useful to check if the text has been imported correctly | |
# writeLines(paste("Speech:", unique(speeches$id), "\n", text, " ___ ______ ______ ______ ______ ______ ______ ______ ______ ___\n __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__\n (______)(______)(______)(______)(______)(______)(______)(______)(______)\n", sep = "\n"), file.path("allSpeeches.txt")) | |
``` | |
## Wordcount per presser | |
```{r numberOfWordsPerPresser} | |
# Number of tokens (stop words included) in each presser, computed once and
# used for both the table and the chart.
words_per_presser <- speechesOriginal_byWord %>%
  group_by(id) %>%
  count()

knitr::kable(words_per_presser)

ggplot(data = words_per_presser, mapping = aes(x = id, y = n)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  theme_minimal() +
  labs(title = "Number of words for each of Putin's annual news conference")
``` | |
## What are the most frequent keywords across all pressers | |
Here are different ways to look at the words most frequently found across all annual news conferences.
```{r mostFrequent_barchart_20} | |
# Bar chart of the 30 words with the highest count across all pressers.
#
# `speeches_by_speech` holds one row per presser and word, each carrying the
# same overall `total`. geom_col() stacks rows that share an x value, so the
# table is first reduced to one row per word; otherwise every bar would show
# `total` multiplied by the number of pressers the word appears in.
mostFrequent_barchart_20 <-
  speeches_by_speech %>%
  distinct(word, total) %>%
  top_n(30, wt = total) %>%
  arrange(total) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = total, fill = total)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  scale_fill_continuous(low = "yellow", high = "red") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Most frequent keywords in Putin's pressers",
    fill = "Word frequency",
    caption = "Data processing: Giorgio Comai"
  )

mostFrequent_barchart_20
``` | |
```{r mostFrequent_barchart__2017_20} | |
# Bar chart of the 30 most frequent words in the 2017 presser.
#
# The ranking variable is given explicitly: without `wt`, top_n() ranks by the
# last column of the table (`total`, the count across all pressers), which
# would select words by their overall frequency rather than by the 2017
# frequency `n` that this chart displays.
mostFrequent_barchart__2017_20 <-
  speeches_by_speech %>%
  filter(id == "Putin 2017") %>%
  top_n(30, wt = n) %>%
  arrange(n) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = n, fill = n)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  scale_fill_continuous(low = "yellow", high = "red") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Most frequent keywords in Putin's 2017 presser",
    fill = "Word frequency",
    caption = "Data processing: Giorgio Comai"
  )

mostFrequent_barchart__2017_20
``` | |
```{r mostFrequent_barchart__2017_totalcolor_20} | |
# Bar chart of the 30 most frequent words in the 2017 presser, with bar length
# given by the 2017 count (`n`) and colour by the count across all pressers
# (`total`).
#
# The ranking variable is given explicitly: without `wt`, top_n() ranks by the
# last column of the table (`total`), which would keep only words that are
# frequent overall and drop exactly the terms the subtitle points at (frequent
# in 2017 but not in the other pressers).
mostFrequent_barchart__2017_totalcolor_20 <-
  speeches_by_speech %>%
  filter(id == "Putin 2017") %>%
  top_n(30, wt = n) %>%
  arrange(n) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = n, fill = total)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  scale_fill_continuous(low = "yellow", high = "red") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Most frequent keywords in Putin's 2017 annual presser",
    fill = "Word frequency in\nPutin's pressers",
    subtitle = "Terms are ordered by their frequency in the 2017 speech, but colour coded according\nto their frequency in all pressers (yellow terms on top are unusually frequent in 2017)",
    caption = "Data processing: Giorgio Comai"
  )

mostFrequent_barchart__2017_totalcolor_20
``` | |
```{r keywords_circles_mostFrequent, fig.width = 8, fig.height = 6} | |
# Rows to plot: for every presser, the 10 words ranked highest by top_n().
# No `wt` is given, so top_n() ranks by the last column of the table (`total`).
circle_data <- speeches_by_speech %>%
  group_by(word) %>%
  arrange(desc(total)) %>%
  group_by(id) %>%
  top_n(10) %>%
  arrange(total) %>%
  mutate(word = factor(word, levels = unique(word))) # makes sure order is kept for graph

# One circle per word and presser, labelled with the per-presser count `n`.
keywords_circles_mostFrequent <-
  ggplot(
    data = circle_data,
    mapping = aes(x = word, y = id, size = n, colour = n, label = n)
  ) +
  geom_point(shape = 21, stroke = 3, size = 10) +
  geom_text(mapping = aes(size = 25), colour = "black") +
  scale_colour_continuous(low = "yellow", high = "red") +
  scale_x_discrete(name = "") +
  scale_y_discrete(name = "") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(
    title = "Most frequent keywords in Putin's pressers",
    caption = "Data processing: Giorgio Comai"
  )

keywords_circles_mostFrequent
``` | |
## Wordclouds | |
```{r wordcloud_all_pressers, fig.height=10, fig.width=10} | |
# Count of every non-stop-word token across all pressers.
word_totals <- speeches %>%
  anti_join(stop_words) %>%
  count(word)

# Word cloud of at most 150 words, drawn from the counts above.
wordcloud(
  words = word_totals$word,
  freq = word_totals$n,
  max.words = 150,
  random.order = FALSE,
  random.color = FALSE,
  colors = TRUE,
  vfont = c("serif", "plain")
)
``` | |
## What are the most distinctive words of each presser? | |
The following wordcloud compares the frequency of words across speeches, showing the words that are most frequent in each speech and, at the same time, less frequently found in the other speeches.
```{r wordcloud_all_pressers_byspeech_ss, fig.height=13, fig.width=12} | |
# Count of every non-stop-word token within each presser.
counts_by_presser <- speeches %>%
  anti_join(stop_words) %>%
  group_by(id) %>%
  count(word) %>%
  arrange(desc(id))

# Reshape to one row per word and one column per presser, with the words as
# row names and the presser columns in the order listed in select().
term_matrix <- counts_by_presser %>%
  spread(id, n) %>%
  select(word, `Putin 2012`, `Putin 2017`, `Putin 2016`, `Putin 2015`, `Putin 2014`, `Putin 2013`) %>%
  remove_rownames() %>%
  column_to_rownames("word")

# A word missing from a presser has a count of zero there.
term_matrix[is.na(term_matrix)] <- 0

comparison.cloud(
  term_matrix,
  max.words = 300,
  colors = brewer.pal(6, "Dark2"),
  vfont = c("sans serif", "plain")
)
``` | |
## Selected keywords | |
```{r keywords_circles} | |
# Keywords to compare across pressers. "citizen" is used in its singular form
# because the cleaning step rewrites lower-case "citizens" to "citizen" before
# the text is tokenised, so filtering on the plural would miss those
# occurrences.
selected_keywords <- c(
  "crimea", "ukraine", "europe", "motherland",
  "syria", "citizen", "jobs", "growth"
)

# One circle per selected keyword and presser, labelled with the per-presser
# count `n`.
keywords_circles <- speeches_by_speech %>%
  filter(word %in% selected_keywords) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = id, size = n, colour = n, label = n)) +
  geom_point(shape = 21, stroke = 3, size = 10) +
  geom_text(mapping = aes(size = 25), colour = "black") +
  scale_colour_continuous(low = "yellow", high = "red") +
  scale_x_discrete(name = "") +
  scale_y_discrete(name = "") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Frequency of selected keywords in Putin's annual pressers")

keywords_circles
``` | |
## Words with positive/negative connotation | |
The following wordclouds highlight the frequency of words that have a positive or negative connotation according to the lexicon by [Bing Liu and collaborators](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html).
### All pressers | |
```{r wordcloud_allPressers_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE} | |
# Keep the tokens of all pressers that appear in the "bing" sentiment lexicon
# and count each word per sentiment.
sentiment_counts <- speeches %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)

# Cast to a word-by-sentiment table of counts; missing combinations become 0.
sentiment_matrix <- acast(
  sentiment_counts,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)

comparison.cloud(
  sentiment_matrix,
  colors = c("#F8766D", "#00BFC4"),
  max.words = 100,
  vfont = c("serif", "plain")
)
``` | |
### Putin 2012 | |
```{r wordcloud_Putin2012_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE} | |
# Keep the tokens of the 2012 presser that appear in the "bing" sentiment
# lexicon and count each word per sentiment.
sentiment_counts_2012 <- speeches %>%
  filter(id == "Putin 2012") %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)

# Cast to a word-by-sentiment table of counts; missing combinations become 0.
sentiment_matrix_2012 <- acast(
  sentiment_counts_2012,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)

comparison.cloud(
  sentiment_matrix_2012,
  colors = c("#F8766D", "#00BFC4"),
  max.words = 100,
  vfont = c("serif", "plain")
)
``` | |
### Putin 2013 | |
```{r wordcloud_Putin2013_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE} | |
# Keep the tokens of the 2013 presser that appear in the "bing" sentiment
# lexicon and count each word per sentiment.
sentiment_counts_2013 <- speeches %>%
  filter(id == "Putin 2013") %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)

# Cast to a word-by-sentiment table of counts; missing combinations become 0.
sentiment_matrix_2013 <- acast(
  sentiment_counts_2013,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)

comparison.cloud(
  sentiment_matrix_2013,
  colors = c("#F8766D", "#00BFC4"),
  max.words = 100,
  vfont = c("serif", "plain")
)
``` | |
### Putin 2014 | |
```{r wordcloud_Putin2014_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE} | |
# Keep the tokens of the 2014 presser that appear in the "bing" sentiment
# lexicon and count each word per sentiment.
sentiment_counts_2014 <- speeches %>%
  filter(id == "Putin 2014") %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)

# Cast to a word-by-sentiment table of counts; missing combinations become 0.
sentiment_matrix_2014 <- acast(
  sentiment_counts_2014,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)

comparison.cloud(
  sentiment_matrix_2014,
  colors = c("#F8766D", "#00BFC4"),
  max.words = 100,
  vfont = c("serif", "plain")
)
``` | |
### Putin 2015 | |
```{r wordcloud_Putin2015_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE} | |
# Keep the tokens of the 2015 presser that appear in the "bing" sentiment
# lexicon and count each word per sentiment.
sentiment_counts_2015 <- speeches %>%
  filter(id == "Putin 2015") %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)

# Cast to a word-by-sentiment table of counts; missing combinations become 0.
sentiment_matrix_2015 <- acast(
  sentiment_counts_2015,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)

comparison.cloud(
  sentiment_matrix_2015,
  colors = c("#F8766D", "#00BFC4"),
  max.words = 100,
  vfont = c("serif", "plain")
)
``` | |
### Putin 2016 | |
```{r wordcloud_putin2016_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE} | |
# Keep the tokens of the 2016 presser that appear in the "bing" sentiment
# lexicon and count each word per sentiment.
sentiment_counts_2016 <- speeches %>%
  filter(id == "Putin 2016") %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)

# Cast to a word-by-sentiment table of counts; missing combinations become 0.
sentiment_matrix_2016 <- acast(
  sentiment_counts_2016,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)

comparison.cloud(
  sentiment_matrix_2016,
  colors = c("#F8766D", "#00BFC4"),
  max.words = 100,
  vfont = c("serif", "plain")
)
``` | |
### Putin 2017 | |
```{r wordcloud_putin2017_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE} | |
# Sentiment word cloud for the 2017 presser: keep its tokens that appear in
# the "bing" sentiment lexicon, count each word per sentiment, cast to a
# word-by-sentiment table of counts (missing combinations become 0) and plot.
# The filter selects "Putin 2017" to match this section's heading; it
# previously repeated the 2016 identifier and redrew the 2016 cloud.
speeches %>%
  filter(id == "Putin 2017") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 100,
    vfont = c("serif", "plain")
  )
``` | |
## What about the "people" | |
This wordcloud is based on all words included in the same sentence as "people". | |
```{r wordcloud_crisis} | |
# Presser identifiers and transcripts with a space inserted after every full
# stop (presumably so that sentences lacking one are still split apart).
presser_ids <- paste(links$President, links$Year)
spaced_text <- gsub(pattern = ".", replacement = ". ", x = text, fixed = TRUE)

# One row per lower-cased sentence, keeping the columns of `links`.
speeches_by_sentence <- bind_cols(id = presser_ids, links, txt = spaced_text) %>%
  unnest_tokens(sentence, txt, to_lower = TRUE, token = "sentences")

# storing for next post
saveRDS(
  object = speeches_by_sentence,
  file.path("static", "putin-presser", "speeches_by_sentence.rds")
)

# Sentences containing "people", shared by the word cloud and the table.
people_sentences <- speeches_by_sentence %>%
  filter(stringr::str_detect(string = sentence, pattern = "people"))

# Word cloud of the non-stop-word tokens found in those sentences.
people_word_counts <- people_sentences %>%
  unnest_tokens(input = sentence, output = word) %>%
  anti_join(stop_words) %>%
  count(word)

wordcloud(
  words = people_word_counts$word,
  freq = people_word_counts$n,
  scale = c(3, 1),
  max.words = 50,
  random.order = FALSE,
  vfont = c("sans serif", "plain")
)

# Interactive table of the same sentences by year, with "people" highlighted.
# The highlight is raw HTML, hence `escape = FALSE`.
highlighted_sentences <- people_sentences %>%
  select(Year, sentence) %>%
  arrange(Year) %>%
  mutate(
    sentence = stringr::str_replace_all(
      string = sentence,
      pattern = stringr::regex("people", ignore_case = TRUE),
      replacement = paste0("<span style='background-color: #FFFF00'>", "people", "</span>")
    )
  )

DT::datatable(
  highlighted_sentences,
  escape = FALSE,
  options = list(pageLength = 5, lengthMenu = c(3, 5, 10, 15, 20))
)
``` | |
## About the author | |
Giorgio Comai is a researcher at [OBC Transeuropa](https://twitter.com/BalkansCaucasus), crunching data with [EdjNet](https://twitter.com/EdjNet), previously at Dublin City University. He is | |
[on Twitter](https://twitter.com/giocomai). His website is https://giorgiocomai.eu/. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment