Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save giocomai/b15fd8125b19a8dff999a87c8184554c to your computer and use it in GitHub Desktop.
Save giocomai/b15fd8125b19a8dff999a87c8184554c to your computer and use it in GitHub Desktop.
Exploring Putin’s annual news conference 2017
---
title: "Exploring Putin's annual news conference"
author: "Giorgio Comai"
date: '2017-12-15'
---
```{r setup, include=FALSE, echo=TRUE, message=FALSE}
# Chunk defaults for the rest of the document: code, messages and warnings are
# left out of the rendered output unless a chunk overrides these options.
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)

# pacman takes care of package installation/loading. It is only ever called
# with the `pacman::` prefix below, so it has to be installed but not attached;
# requireNamespace() checks exactly that without attaching anything.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

pacman::p_load(
  "tidyverse", # data processing
  "reshape2",  # data processing
  "rvest",     # extracting from html
  "tidytext",  # transforming text into tidy data frames
  "DT",        # interactive table
  "knitr",     # rendering this document
  "wordcloud"  # you guessed it
)
```

## Downloading the speeches
The full transcript of the Russian president's annual news conference is available on [the official website of the Kremlin](http://en.kremlin.ru/), translated into English. Here are the direct links to each of the speeches:
```{r}
# introduce links to speeches
# NOTE(review): every Link value is an empty string in this copy of the file;
# download.file() below needs the real transcript URLs - fill them in before
# running this chunk on a machine that has no cached html files.
links <- tribble(
  ~Year, ~President, ~Link,
  2017, "Putin", "",
  2016, "Putin", "",
  2015, "Putin", "",
  2014, "Putin", "",
  2013, "Putin", "",
  2012, "Putin", ""
)

# store html files locally; recursive = TRUE also creates the parent folders,
# so the chunk works on a fresh checkout where "static" does not exist yet
html_dir <- file.path("static", "putin-presser", "html")
dir.create(path = html_dir, showWarnings = FALSE, recursive = TRUE)

# add local link to data.frame (one file per speech, named <Year>-<President>)
links <- links %>%
  mutate(LocalLink = file.path(html_dir, paste0(Year, "-", President, ".html")))

for (i in seq_along(links$Link)) {
  if (!file.exists(links$LocalLink[i])) { # download files only if not previously downloaded
    download.file(url = links$Link[i], destfile = links$LocalLink[i])
    # NOTE(review): the closing braces were missing in this copy; the pause is
    # assumed to belong to the download branch (wait only after a real request).
    Sys.sleep(time = 1)
  }
}

knitr::kable(links %>% select("Year", "President", "Link"))
```

```{r}
# extract speeches: one character string per locally stored html file, taken
# from the text of the <div class="read__content"> node of each page
text <- purrr::map_chr(
  .x = links$LocalLink,
  .f = function(x) {
    read_html(x) %>%
      rvest::html_nodes(xpath = "//div[@class='read__content']") %>%
      html_text()
  }
)

# Substitutions applied to the raw text before tokenising. Names are the
# patterns, values the replacements: two plurals are folded into the singular,
# the remaining patterns are removed altogether. Matching is case-sensitive.
substitutions <- c(
  "issues" = "issue",
  "citizens" = "citizen",
  "it’s" = "",
  "don't" = "",
  "don’t" = "",
  "I’m" = "",
  "Vladimir" = "",
  "Putin" = "",
  "question" = "",
  "Question" = ""
)
for (pattern in names(substitutions)) {
  text <- gsub(pattern = pattern, replacement = substitutions[[pattern]], x = text)
}

# one row per speech, identified as "<President> <Year>"
speechesOriginal <- tibble(id = paste(links$President, links$Year), txt = text)

# one row per word, stop words included (used for the word counts per presser)
speechesOriginal_byWord <- speechesOriginal %>%
  unnest_tokens(word, txt, to_lower = TRUE)

# one row per word, without the standard list of stop words
speeches <- speechesOriginal_byWord %>%
  anti_join(stop_words, by = "word")

# Word frequencies: `n` is the count within one speech, `total` the count of
# the same word over all speeches. Stop words are already gone from `speeches`.
# The result stays grouped by `word`, as later chunks regroup it themselves.
speeches_by_speech <- speeches %>%
  count(id, word, sort = TRUE) %>% # wordcount on a per speech basis
  group_by(word) %>%
  mutate(total = sum(n)) %>% # total number of occurrences in all speeches
  arrange(total) # orders data by total number of occurrences

# The pipeline above used to end in a dangling pipe that fed saveRDS() while
# `speeches_by_speech` did not exist yet; saving is now a separate statement.
saveRDS(object = speeches_by_speech, file = file.path("static", "putin-presser", "speeches_by_speech.rds"))

## this exports the documents as extracted; useful to check if the text has been imported correctly
# writeLines(paste("Speech:", unique(speeches$id), "\n", text, " ___ ______ ______ ______ ______ ______ ______ ______ ______ ___\n __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__\n (______)(______)(______)(______)(______)(______)(______)(______)(______)\n", sep = "\n"), file.path("allSpeeches.txt"))
```

## Wordcount per presser
```{r numberOfWordsPerPresser}
# Number of tokens (stop words included) in each presser, computed once and
# shown first as a table and then as a bar chart.
words_per_presser <- speechesOriginal_byWord %>%
  count(id)

knitr::kable(words_per_presser)

words_per_presser %>%
  ggplot(mapping = aes(x = id, y = n)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  theme_minimal() +
  labs(title = "Number of words for each of Putin's annual news conference")
```

## What are the most frequent keywords across all pressers
Here are different ways to look at the words most frequently found across all annual news conferences.
```{r mostFrequent_barchart_20}
# Bar chart of the 30 words with the highest overall frequency (`total`).
# `speeches_by_speech` has one row per speech and word, each carrying the same
# `total`; plotting those rows directly makes geom_col() stack the total once
# per speech. The data is therefore reduced to one row per word first.
mostFrequent_barchart_20 <-
  speeches_by_speech %>%
  ungroup() %>%
  distinct(word, total) %>%
  top_n(30, wt = total) %>% # explicit weight instead of "last column"
  arrange(total) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = total, fill = total)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  scale_fill_continuous(low = "yellow", high = "red") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Most frequent keywords in Putin's pressers", fill = "Word frequency", caption = "Data processing: Giorgio Comai")

# an assignment alone draws nothing; print the plot so the chunk shows it
mostFrequent_barchart_20
```

```{r mostFrequent_barchart__2017_20}
# Bar chart of the 30 most frequent words of the 2017 presser.
# top_n() without `wt` selects by the last column of the data, which here is
# `total` (frequency in all pressers); the chart is about 2017, so the
# per-speech count `n` is passed explicitly.
mostFrequent_barchart__2017_20 <-
  speeches_by_speech %>%
  ungroup() %>%
  filter(id == "Putin 2017") %>%
  top_n(30, wt = n) %>%
  arrange(n) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = n, label = n, fill = n)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  scale_fill_continuous(low = "yellow", high = "red") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Most frequent keywords in Putin's 2017 presser", fill = "Word frequency", caption = "Data processing: Giorgio Comai")

# an assignment alone draws nothing; print the plot so the chunk shows it
mostFrequent_barchart__2017_20
```

```{r mostFrequent_barchart__2017_totalcolor_20}
# Same 30 words as the plain 2017 chart (selected and ordered by their 2017
# count `n`), but filled according to `total`, the frequency in all pressers.
# top_n() without `wt` would select by the last column (`total`) instead of
# the 2017 count, so the weight is given explicitly.
mostFrequent_barchart__2017_totalcolor_20 <-
  speeches_by_speech %>%
  ungroup() %>%
  filter(id == "Putin 2017") %>%
  top_n(30, wt = n) %>%
  arrange(n) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = n, label = n, fill = total)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  scale_fill_continuous(low = "yellow", high = "red") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Most frequent keywords in Putin's 2017 annual presser", fill = "Word frequency in\nPutin's pressers", subtitle = "Terms are ordered by their frequency in the 2017 speech, but colour coded according\nto their frequency in all pressers (yellow terms on top are unusually frequent in 2017)", caption = "Data processing: Giorgio Comai")

# an assignment alone draws nothing; print the plot so the chunk shows it
mostFrequent_barchart__2017_totalcolor_20
```

```{r keywords_circles_mostFrequent, fig.width = 8, fig.height = 6}
# Grid of words (rows after coord_flip) by presser (columns); each cell shows
# the count `n` of the word in that presser. For every presser the 10 words
# with the highest overall frequency (`total`) are kept.
keywords_circles_mostFrequent <- speeches_by_speech %>%
  group_by(id) %>%
  top_n(10, wt = total) %>% # explicit weight; previously implied by column order
  ungroup() %>% # levels must be built on the whole table, not per presser
  arrange(total) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = id, size = n, colour = n, label = n)) +
  geom_point(shape = 21, stroke = 3, size = 10) +
  geom_text(mapping = aes(size = 25), colour = "black") +
  scale_colour_continuous(low = "yellow", high = "red") +
  scale_x_discrete(name = "") +
  scale_y_discrete(name = "") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Most frequent keywords in Putin's pressers", caption = "Data processing: Giorgio Comai")

# an assignment alone draws nothing; print the plot so the chunk shows it
keywords_circles_mostFrequent
```

## Wordclouds
```{r wordcloud_all_pressers, fig.height=10, fig.width=10}
# Wordcloud of up to 150 words counted over all pressers together.
# `speeches` is built without stop words where it is created, so the extra
# anti_join() that used to sit here was a no-op and has been dropped.
# NOTE(review): `colors = TRUE` is kept as found; wordcloud() documents
# `colors` as a vector of colours - confirm what was intended.
speeches %>%
  count(word) %>%
  with(wordcloud(
    word, n,
    max.words = 150,
    random.order = FALSE,
    random.color = FALSE,
    colors = TRUE,
    vfont = c("serif", "plain")
  ))
```

## What are the most distinctive words of each presser?
The following wordcloud compares the frequency of words across speeches, showing the words that are most frequent in each speech and, at the same time, less frequently found in the other speeches.
```{r wordcloud_all_pressers_byspeech_ss, fig.height=13, fig.width=12}
# Comparison cloud with one column per presser: words are counted per speech,
# spread into a word x presser table (words as row names) and missing
# combinations are set to 0 before plotting.
# NOTE(review): in this copy of the file the last step read
# `replace(, 0) %>% = 300, colors=...`; it is reconstructed here as
# replace(is.na(.), 0) followed by comparison.cloud(max.words = 300, ...) -
# confirm against the author's original.
speeches %>%
  count(id, word) %>%
  arrange(desc(id)) %>%
  spread(id, n) %>%
  select(word, `Putin 2012`, `Putin 2017`, `Putin 2016`, `Putin 2015`, `Putin 2014`, `Putin 2013`) %>%
  remove_rownames() %>%
  column_to_rownames("word") %>%
  replace(is.na(.), 0) %>%
  as.matrix() %>% # comparison.cloud() is documented for a term matrix
  comparison.cloud(
    max.words = 300,
    colors = brewer.pal(6, "Dark2"),
    vfont = c("sans serif", "plain")
  )
```

## Selected keywords
```{r keywords_circles}
# Count per presser of a hand-picked set of keywords.
# The text preparation rewrites "citizens" to "citizen" before tokenising
# (case-sensitively), so the token to look for is "citizen"; filtering on
# "citizens" only caught occurrences that were capitalised in the transcript.
selected_keywords <- c(
  "crimea", "ukraine", "europe", "motherland",
  "syria", "citizen", "jobs", "growth"
)

keywords_circles <- speeches_by_speech %>%
  ungroup() %>% # levels must be built on the whole table, not per word
  filter(word %in% selected_keywords) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = id, size = n, colour = n, label = n)) +
  geom_point(shape = 21, stroke = 3, size = 10) +
  geom_text(mapping = aes(size = 25), colour = "black") +
  scale_colour_continuous(low = "yellow", high = "red") +
  scale_x_discrete(name = "") +
  scale_y_discrete(name = "") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Frequency of selected keywords in Putin's annual pressers")

# an assignment alone draws nothing; print the plot so the chunk shows it
keywords_circles
```

## Words with positive/negative connotation
The following wordclouds highlight the frequency of words which have a positive or negative connotation according to the lexicon by [Bing Liu and collaborators](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html).
### All pressers
```{r wordcloud_allPressers_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}
# Comparison cloud of the words from all pressers that appear in the "bing"
# sentiment lexicon, one side per sentiment value.
# NOTE(review): the name of the plotting call was missing in this copy of the
# file (`%>% = c(...)`); reconstructed as comparison.cloud(colors = ...) -
# confirm against the author's original.
speeches %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 100, vfont = c("serif", "plain")
  )
```

### Putin 2012
```{r wordcloud_Putin2012_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}
# Comparison cloud of the words from the 2012 presser that appear in the
# "bing" sentiment lexicon, one side per sentiment value.
# NOTE(review): the name of the plotting call was missing in this copy of the
# file (`%>% = c(...)`); reconstructed as comparison.cloud(colors = ...) -
# confirm against the author's original.
speeches %>%
  filter(id == "Putin 2012") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 100, vfont = c("serif", "plain")
  )
```

### Putin 2013
```{r wordcloud_Putin2013_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}
# Comparison cloud of the words from the 2013 presser that appear in the
# "bing" sentiment lexicon, one side per sentiment value.
# NOTE(review): the name of the plotting call was missing in this copy of the
# file (`%>% = c(...)`); reconstructed as comparison.cloud(colors = ...) -
# confirm against the author's original.
speeches %>%
  filter(id == "Putin 2013") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 100, vfont = c("serif", "plain")
  )
```

### Putin 2014
```{r wordcloud_Putin2014_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}
# Comparison cloud of the words from the 2014 presser that appear in the
# "bing" sentiment lexicon, one side per sentiment value.
# NOTE(review): the name of the plotting call was missing in this copy of the
# file (`%>% = c(...)`); reconstructed as comparison.cloud(colors = ...) -
# confirm against the author's original.
speeches %>%
  filter(id == "Putin 2014") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 100, vfont = c("serif", "plain")
  )
```

### Putin 2015
```{r wordcloud_Putin2015_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}
# Comparison cloud of the words from the 2015 presser that appear in the
# "bing" sentiment lexicon, one side per sentiment value.
# NOTE(review): the name of the plotting call was missing in this copy of the
# file (`%>% = c(...)`); reconstructed as comparison.cloud(colors = ...) -
# confirm against the author's original.
speeches %>%
  filter(id == "Putin 2015") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 100, vfont = c("serif", "plain")
  )
```

### Putin 2016
```{r wordcloud_putin2016_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}
# Comparison cloud of the words from the 2016 presser that appear in the
# "bing" sentiment lexicon, one side per sentiment value.
# NOTE(review): the name of the plotting call was missing in this copy of the
# file (`%>% = c(...)`); reconstructed as comparison.cloud(colors = ...) -
# confirm against the author's original.
speeches %>%
  filter(id == "Putin 2016") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 100, vfont = c("serif", "plain")
  )
```

### Putin 2017
```{r wordcloud_putin2017_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}
# Comparison cloud of the words from the 2017 presser that appear in the
# "bing" sentiment lexicon, one side per sentiment value.
# The filter used to read "Putin 2016" (copied from the previous chunk), so
# this section repeated the 2016 cloud; it now selects the 2017 presser.
# NOTE(review): the name of the plotting call was missing in this copy of the
# file (`%>% = c(...)`); reconstructed as comparison.cloud(colors = ...) -
# confirm against the author's original.
speeches %>%
  filter(id == "Putin 2017") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 100, vfont = c("serif", "plain")
  )
```

## What about the "people"
This wordcloud is based on all words included in the same sentence as "people".
```{r wordcloud_crisis}
# One row per sentence: the speech id, the columns of `links` and the
# lower-cased sentence. A space is inserted after every full stop before
# tokenising (presumably to help the sentence tokenizer where the transcript
# has no whitespace after a period - confirm).
spaced_text <- gsub(pattern = ".", replacement = ". ", x = text, fixed = TRUE)

speeches_by_sentence <- links %>%
  mutate(id = paste(President, Year), txt = spaced_text) %>%
  select(id, everything()) %>% # keeps `id` as the first column
  unnest_tokens(sentence, txt, to_lower = TRUE, token = "sentences")

# storing for next post
saveRDS(object = speeches_by_sentence, file = file.path("static", "putin-presser", "speeches_by_sentence.rds"))

# sentences mentioning "people"; used by both the wordcloud and the table
people_sentences <- speeches_by_sentence %>%
  filter(stringr::str_detect(string = sentence, pattern = "people"))

# wordcloud of the words (stop words removed) found in those sentences
people_sentences %>%
  unnest_tokens(input = sentence, output = word) %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(
    word, n,
    scale = c(3, 1), max.words = 50,
    random.order = FALSE, vfont = c("sans serif", "plain")
  ))

# interactive table of the same sentences, with "people" highlighted;
# escape = FALSE lets the highlighting <span> render as html
DT::datatable(
  people_sentences %>%
    select(Year, sentence) %>%
    arrange(Year) %>%
    mutate(sentence = stringr::str_replace_all(
      string = sentence,
      pattern = stringr::regex("people", ignore_case = TRUE),
      replacement = paste0("<span style='background-color: #FFFF00'>", "people", "</span>")
    )),
  escape = FALSE,
  options = list(pageLength = 5, lengthMenu = c(3, 5, 10, 15, 20))
)
```

## About the author
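Giorgio Comai is a researcher at [OBC Transeuropa](https://www.balcanicaucaso.org/), crunching data with [EdjNet](https://www.europeandatajournalism.eu/), previously at Dublin City University. He is
[on Twitter](https://twitter.com/giocomai). His website is <https://giorgiocomai.eu>.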
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment