giocomai/Exploring Putin’s annual news conference.Rmd

## Exploring Putin’s annual news conference.Rmd
---
title: "Exploring Putin's annual news conference"
author: "Giorgio Comai"
date: '2017-12-15'
---


```{r setup, include=FALSE, echo=TRUE, message=FALSE}
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)

if (!require("pacman")) install.packages("pacman") # for taking care of package installation/loading

pacman::p_load("tidyverse")  # data processing
pacman::p_load("reshape2") # data processing
pacman::p_load("rvest") # extracting from html
pacman::p_load("tidytext") # transforming text into tidy data frames
pacman::p_load("DT") # interactive table
pacman::p_load("knitr") # rendering this document
pacman::p_load("wordcloud") # you guessed it
```

## Downloading the speeches

The full transcript of each of the Russian president's annual news conferences is available on [the official website of the Kremlin](http://en.kremlin.ru/), translated into English. Here are the direct links to each of the speeches:

```{r}
# Table of the transcripts to analyse: one row per annual news conference,
# with the year, the speaker and the URL of the English transcript.
links <- tribble(~Year, ~President, ~Link,
                 2017, "Putin", "http://en.kremlin.ru/events/president/news/56378",
                 2016, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/53573",
                 2015, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/50971",
                 2014, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/47250",
                 2013, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/19859",
                 2012, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/17173"
                 )

# Folder where the html files are stored locally. `recursive = TRUE` also
# creates the parent folders ("static", "static/putin-presser") when they are
# missing, so the downloads below do not fail on a fresh checkout.
html_dir <- file.path("static", "putin-presser", "html")
dir.create(path = html_dir, recursive = TRUE, showWarnings = FALSE)

# add the path of the local copy of each page to the table
links <- links %>%
  mutate(LocalLink = file.path(html_dir, paste0(Year, "-", President, ".html")))

# download files only if not previously downloaded
for (i in seq_along(links$Link)) {
  if (!file.exists(links$LocalLink[i])) {
    download.file(url = links$Link[i], destfile = links$LocalLink[i])
    Sys.sleep(time = 1) # wait one second after each download
  }
}
knitr::kable(links %>% select("Year", "President", "Link"))
```


```{r}
# Read a locally stored page and return the text of its
# <div class="read__content"> element.
extract_transcript <- function(html_file) {
  page <- read_html(html_file)
  content_nodes <- rvest::html_nodes(page, xpath = "//div[@class='read__content']")
  html_text(content_nodes)
}

# one character string per speech, in the same order as `links`
text <- purrr::map_chr(.x = links$LocalLink, .f = extract_transcript)

```

```{r}

# Normalise two plurals and strip a few strings from the transcripts before
# counting words. Patterns are applied one after the other, in the order
# listed, and match substrings rather than whole words.
replacements <- c(
  "issues" = "issue",
  "citizens" = "citizen",
  "it’s" = "",
  "don't" = "",
  "don’t" = "",
  "I’m" = "",
  "Vladimir" = "",
  "Putin" = "",
  "question" = "",
  "Question" = ""
)
for (pattern in names(replacements)) {
  text <- gsub(pattern = pattern, replacement = replacements[[pattern]], x = text)
}

# one row per speech, identified as "<President> <Year>"
speechesOriginal <- tibble(id = paste(links$President, links$Year), txt = text)

# one row per word, stop words included
speechesOriginal_byWord <- speechesOriginal %>%
  unnest_tokens(word, txt, to_lower = TRUE)

# same tokens, after removing a standard list of stopwords
speeches <- speechesOriginal_byWord %>%
  anti_join(stop_words, by = "word")

# n     = occurrences of a word within one speech
# total = occurrences of that word in all speeches
speeches_by_speech <- speeches %>%
  count(id, word, sort = TRUE) %>%
  group_by(word) %>%
  mutate(total = sum(n)) %>%
  ungroup() %>%
  arrange(total) # orders data by total number of occurrences


saveRDS(object = speeches_by_speech, file.path("static", "putin-presser", "speeches_by_speech.rds"))

## this exports the documents as extracted; useful to check if the text has been imported correctly

# writeLines(paste("Speech:", unique(speeches$id), "\n", text, " ___  ______  ______  ______  ______  ______  ______  ______  ______  ___\n  __)(__  __)(__  __)(__  __)(__  __)(__  __)(__  __)(__  __)(__  __)(__\n (______)(______)(______)(______)(______)(______)(______)(______)(______)\n", sep = "\n"), file.path("allSpeeches.txt"))

```

## Wordcount per presser


```{r numberOfWordsPerPresser}
# number of rows (words) of speechesOriginal_byWord for each presser
words_per_presser <- count(speechesOriginal_byWord, id)

knitr::kable(words_per_presser)

ggplot(data = words_per_presser, mapping = aes(x = id, y = n)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  theme_minimal() +
  labs(title = "Number of words for each of Putin's annual news conference")

```

## What are the most frequent keywords across all pressers

Here are different ways to look at the words most frequently found across all annual news conferences.

```{r mostFrequent_barchart_20}
# Bar chart of the 30 words with the highest count across all pressers.
# speeches_by_speech has one row per word and speech, each carrying the same
# `total`; it is reduced to one row per word first, because geom_col() stacks
# repeated rows and the bars would then overstate the totals.
mostFrequent_barchart_20 <-
  speeches_by_speech %>%
  distinct(word, total) %>%
  top_n(30, wt = total) %>%
  arrange(total) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = total, fill = total)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  scale_fill_continuous(low = "yellow", high = "red") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Most frequent keywords in Putin's pressers", fill = "Word frequency", caption = "Data processing:  Giorgio Comai")
mostFrequent_barchart_20

```


```{r mostFrequent_barchart__2017_20}
# Bar chart of the 30 most frequent words of the 2017 presser.
# top_n() ranks by the last column of the table (here `total`) unless `wt` is
# given, so `wt = n` is needed to rank by the frequency within the 2017 speech.
mostFrequent_barchart__2017_20 <-
  speeches_by_speech %>%
  filter(id == "Putin 2017") %>%
  top_n(30, wt = n) %>%
  arrange(n) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = n, fill = n)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  scale_fill_continuous(low = "yellow", high = "red") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Most frequent keywords in Putin's 2017 presser", fill = "Word frequency", caption = "Data processing:  Giorgio Comai")
mostFrequent_barchart__2017_20

```


```{r mostFrequent_barchart__2017_totalcolor_20}
# Bar chart of the 30 most frequent words of the 2017 presser, with bars
# coloured by the frequency of each word in all pressers.
# top_n() ranks by the last column of the table (here `total`) unless `wt` is
# given; `wt = n` selects by the 2017 frequency, as the subtitle describes.
mostFrequent_barchart__2017_totalcolor_20 <-
  speeches_by_speech %>%
  filter(id == "Putin 2017") %>%
  top_n(30, wt = n) %>%
  arrange(n) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = n, fill = total)) +
  geom_col() +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "") +
  scale_fill_continuous(low = "yellow", high = "red") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Most frequent keywords in Putin's 2017 annual presser", fill = "Word frequency in\nPutin's pressers", subtitle = "Terms are ordered by their frequency in the 2017 speech, but colour coded according\nto their frequency in all pressers (yellow terms on top are unusually frequent in 2017)", caption = "Data processing:  Giorgio Comai")
mostFrequent_barchart__2017_totalcolor_20

```


```{r keywords_circles_mostFrequent, fig.width = 8, fig.height = 6}
# Grid of words (rows) by pressers (columns) showing, for each presser, the
# count `n` of the 10 words of that presser with the highest overall `total`.
# ungroup() comes before the factor is built so that its levels are computed
# once over the whole table, not separately within each presser.
keywords_circles_mostFrequent <- speeches_by_speech %>%
  group_by(id) %>%
  top_n(10, wt = total) %>%
  ungroup() %>%
  arrange(total) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = id, size = n, colour = n, label = n)) +
  geom_point(shape = 21, stroke = 3, size = 10) +
  geom_text(mapping = aes(size = 25), colour = "black") +
  scale_colour_continuous(low = "yellow", high = "red") +
  scale_x_discrete(name = "") +
  scale_y_discrete(name = "") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Most frequent keywords in Putin's pressers", caption = "Data processing:  Giorgio Comai")

keywords_circles_mostFrequent

```

## Wordclouds

```{r wordcloud_all_pressers, fig.height=10, fig.width=10}

# count of each word over all pressers
word_counts <- speeches %>%
  anti_join(stop_words, by = "word") %>%
  count(word)

# word cloud limited to 150 words
wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  max.words = 150,
  random.order = FALSE,
  random.color = FALSE,
  colors = TRUE,
  vfont = c("serif", "plain")
)
```


## What are the most distinctive words of each presser?

The following wordcloud compares the frequency of words across speeches, showing the words that are most frequent in each speech and are at the same time less frequently found in the other speeches.

```{r wordcloud_all_pressers_byspeech_ss, fig.height=13, fig.width=12}
# word counts in wide format: one row per word, one column per presser
counts_by_speech <- speeches %>%
  anti_join(stop_words, by = "word") %>%
  count(id, word) %>%
  arrange(desc(id)) %>%
  spread(id, n)

# words become row names; words missing from a presser get a count of 0
term_matrix <- counts_by_speech %>%
  select(word, `Putin 2012`, `Putin 2017`, `Putin 2016`, `Putin 2015`, `Putin 2014`, `Putin 2013`) %>%
  remove_rownames() %>%
  column_to_rownames("word") %>%
  replace(is.na(.), 0)

comparison.cloud(term_matrix,
                 max.words = 300,
                 colors = brewer.pal(6, "Dark2"),
                 vfont = c("sans serif", "plain"))

```


## Selected keywords

```{r keywords_circles}
# Keywords to compare across pressers. "citizen" is singular because the text
# cleaning step rewrites "citizens" to "citizen" before the words are counted,
# so the plural never occurs in speeches_by_speech.
selected_keywords <- c("crimea", "ukraine", "europe", "motherland",
                       "syria", "citizen", "jobs", "growth")

# grid of keywords by pressers, labelled with the count `n` in each presser
keywords_circles <- speeches_by_speech %>%
  filter(word %in% selected_keywords) %>%
  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
  ggplot(mapping = aes(x = word, y = id, size = n, colour = n, label = n)) +
  geom_point(shape = 21, stroke = 3, size = 10) +
  geom_text(mapping = aes(size = 25), colour = "black") +
  scale_colour_continuous(low = "yellow", high = "red") +
  scale_x_discrete(name = "") +
  scale_y_discrete(name = "") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Frequency of selected keywords in Putin's annual pressers")

keywords_circles
```


## Words with positive/negative connotation

The following wordclouds highlight the frequency of words which have a positive and negative connotation according to the lexicon by [Bing Liu and collaborators](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html).


### All pressers
```{r wordcloud_allPressers_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

# counts of the words found in the Bing lexicon, with their sentiment
sentiment_counts <- speeches %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# word x sentiment matrix of counts, 0 where a combination does not occur
sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

comparison.cloud(sentiment_matrix,
                 colors = c("#F8766D", "#00BFC4"),
                 max.words = 100,
                 vfont = c("serif", "plain"))
```


### Putin 2012
```{r wordcloud_Putin2012_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}
# counts of the words of the 2012 presser found in the Bing lexicon
sentiment_counts <- speeches %>%
  filter(id == "Putin 2012") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# word x sentiment matrix of counts, 0 where a combination does not occur
sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

comparison.cloud(sentiment_matrix,
                 colors = c("#F8766D", "#00BFC4"),
                 max.words = 100,
                 vfont = c("serif", "plain"))
```


### Putin 2013

```{r wordcloud_Putin2013_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

# counts of the words of the 2013 presser found in the Bing lexicon
sentiment_counts <- speeches %>%
  filter(id == "Putin 2013") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# word x sentiment matrix of counts, 0 where a combination does not occur
sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

comparison.cloud(sentiment_matrix,
                 colors = c("#F8766D", "#00BFC4"),
                 max.words = 100,
                 vfont = c("serif", "plain"))

```

### Putin 2014

```{r wordcloud_Putin2014_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

# counts of the words of the 2014 presser found in the Bing lexicon
sentiment_counts <- speeches %>%
  filter(id == "Putin 2014") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# word x sentiment matrix of counts, 0 where a combination does not occur
sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

comparison.cloud(sentiment_matrix,
                 colors = c("#F8766D", "#00BFC4"),
                 max.words = 100,
                 vfont = c("serif", "plain"))
```

### Putin 2015

```{r wordcloud_Putin2015_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

# counts of the words of the 2015 presser found in the Bing lexicon
sentiment_counts <- speeches %>%
  filter(id == "Putin 2015") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# word x sentiment matrix of counts, 0 where a combination does not occur
sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

comparison.cloud(sentiment_matrix,
                 colors = c("#F8766D", "#00BFC4"),
                 max.words = 100,
                 vfont = c("serif", "plain"))

```

### Putin 2016

```{r wordcloud_putin2016_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

# counts of the words of the 2016 presser found in the Bing lexicon
sentiment_counts <- speeches %>%
  filter(id == "Putin 2016") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# word x sentiment matrix of counts, 0 where a combination does not occur
sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

comparison.cloud(sentiment_matrix,
                 colors = c("#F8766D", "#00BFC4"),
                 max.words = 100,
                 vfont = c("serif", "plain"))
```


### Putin 2017

```{r wordcloud_putin2017_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

# Sentiment word cloud for the 2017 presser: words found in the Bing lexicon,
# counted by sentiment. The filter selects "Putin 2017", matching the section
# heading and the chunk label.
speeches %>%
  filter(id == "Putin 2017") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>% # word x sentiment matrix, 0 where absent
  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                   max.words = 100, vfont = c("serif", "plain"))
```


## What about the "people"

This wordcloud is based on all words included in the same sentence as "people".

```{r wordcloud_crisis}
# One row per sentence, keeping the columns of `links`. A space is inserted
# after every full stop before the text is split into sentences.
speeches_by_sentence <- bind_cols(
  id = paste(links$President, links$Year),
  links,
  txt = gsub(pattern = ".", replacement = ". ", x = text, fixed = TRUE)
) %>%
  unnest_tokens(sentence, txt, to_lower = TRUE, token = "sentences")

# storing for next post
saveRDS(object = speeches_by_sentence, file.path("static", "putin-presser", "speeches_by_sentence.rds"))

# sentences containing "people"
people_sentences <- speeches_by_sentence %>%
  filter(stringr::str_detect(string = sentence, pattern = "people"))

# word cloud of the words found in those sentences, stop words excluded
people_word_counts <- people_sentences %>%
  unnest_tokens(input = sentence, output = word) %>%
  anti_join(stop_words, by = "word") %>%
  count(word)

wordcloud(
  words = people_word_counts$word,
  freq = people_word_counts$n,
  scale = c(3, 1),
  max.words = 50,
  random.order = FALSE,
  vfont = c("sans serif", "plain")
)

# interactive table of the same sentences, with "people" highlighted in yellow
highlighted <- paste0("<span style='background-color: #FFFF00'>", "people", "</span>")

people_table <- people_sentences %>%
  select(Year, sentence) %>%
  arrange(Year) %>%
  mutate(sentence = stringr::str_replace_all(
    string = sentence,
    pattern = stringr::regex("people", ignore_case = TRUE),
    replacement = highlighted
  ))

DT::datatable(people_table,
              escape = FALSE, # the highlight is inserted as html
              options = list(pageLength = 5, lengthMenu = c(3, 5, 10, 15, 20)))
```


## About the author

Giorgio Comai is a researcher at [OBC Transeuropa](https://twitter.com/BalkansCaucasus), crunching data with [EdjNet](https://twitter.com/EdjNet), previously at Dublin City University. He is
[on Twitter](https://twitter.com/giocomai). His website is https://giorgiocomai.eu/.
	---
	title: "Exploring Putin's annual news conference"
	author: "Giorgio Comai"
	date: '2017-12-15'
	---


	```{r setup, include=FALSE, echo=TRUE, message=FALSE}
	knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)

	if (!require("pacman")) install.packages("pacman") # for taking care of package installation/loading

	pacman::p_load("tidyverse") # data processing
	pacman::p_load("reshape2") # data processing
	pacman::p_load("rvest") # extracting from html
	pacman::p_load("tidytext") # transforming text into tidy data frames
	pacman::p_load("DT") # interactive table
	pacman::p_load("knitr") # rendering this document
	pacman::p_load("wordcloud") # you guessed it
	```

	## Downloading the speeches

	The full transcript of each of the Russian president's annual news conferences is available on [the official website of the Kremlin](http://en.kremlin.ru/), translated into English. Here are the direct links to each of the speeches:

	```{r}
	# Table of the transcripts to analyse: one row per annual news conference,
	# with the year, the speaker and the URL of the English transcript.
	links <- tribble(~Year, ~President, ~Link,
	  2017, "Putin", "http://en.kremlin.ru/events/president/news/56378",
	  2016, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/53573",
	  2015, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/50971",
	  2014, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/47250",
	  2013, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/19859",
	  2012, "Putin", "http://en.kremlin.ru/events/president/transcripts/press_conferences/17173"
	)

	# Folder where the html files are stored locally. `recursive = TRUE` also
	# creates the parent folders ("static", "static/putin-presser") when they are
	# missing, so the downloads below do not fail on a fresh checkout.
	html_dir <- file.path("static", "putin-presser", "html")
	dir.create(path = html_dir, recursive = TRUE, showWarnings = FALSE)

	# add the path of the local copy of each page to the table
	links <- links %>%
	  mutate(LocalLink = file.path(html_dir, paste0(Year, "-", President, ".html")))

	# download files only if not previously downloaded
	for (i in seq_along(links$Link)) {
	  if (!file.exists(links$LocalLink[i])) {
	    download.file(url = links$Link[i], destfile = links$LocalLink[i])
	    Sys.sleep(time = 1) # wait one second after each download
	  }
	}
	knitr::kable(links %>% select("Year", "President", "Link"))
	```


	```{r}
	# Read a locally stored page and return the text of its
	# <div class="read__content"> element.
	extract_transcript <- function(html_file) {
	  page <- read_html(html_file)
	  content_nodes <- rvest::html_nodes(page, xpath = "//div[@class='read__content']")
	  html_text(content_nodes)
	}

	# one character string per speech, in the same order as `links`
	text <- purrr::map_chr(.x = links$LocalLink, .f = extract_transcript)

	```

	```{r}

	# Normalise two plurals and strip a few strings from the transcripts before
	# counting words. Patterns are applied one after the other, in the order
	# listed, and match substrings rather than whole words.
	replacements <- c(
	  "issues" = "issue",
	  "citizens" = "citizen",
	  "it’s" = "",
	  "don't" = "",
	  "don’t" = "",
	  "I’m" = "",
	  "Vladimir" = "",
	  "Putin" = "",
	  "question" = "",
	  "Question" = ""
	)
	for (pattern in names(replacements)) {
	  text <- gsub(pattern = pattern, replacement = replacements[[pattern]], x = text)
	}

	# one row per speech, identified as "<President> <Year>"
	speechesOriginal <- tibble(id = paste(links$President, links$Year), txt = text)

	# one row per word, stop words included
	speechesOriginal_byWord <- speechesOriginal %>%
	  unnest_tokens(word, txt, to_lower = TRUE)

	# same tokens, after removing a standard list of stopwords
	speeches <- speechesOriginal_byWord %>%
	  anti_join(stop_words, by = "word")

	# n     = occurrences of a word within one speech
	# total = occurrences of that word in all speeches
	speeches_by_speech <- speeches %>%
	  count(id, word, sort = TRUE) %>%
	  group_by(word) %>%
	  mutate(total = sum(n)) %>%
	  ungroup() %>%
	  arrange(total) # orders data by total number of occurrences


	saveRDS(object = speeches_by_speech, file.path("static", "putin-presser", "speeches_by_speech.rds"))

	## this exports the documents as extracted; useful to check if the text has been imported correctly

	# writeLines(paste("Speech:", unique(speeches$id), "\n", text, " ___ ______ ______ ______ ______ ______ ______ ______ ______ ___\n __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__\n (______)(______)(______)(______)(______)(______)(______)(______)(______)\n", sep = "\n"), file.path("allSpeeches.txt"))

	```

	## Wordcount per presser


	```{r numberOfWordsPerPresser}
	# number of rows (words) of speechesOriginal_byWord for each presser
	words_per_presser <- count(speechesOriginal_byWord, id)

	knitr::kable(words_per_presser)

	ggplot(data = words_per_presser, mapping = aes(x = id, y = n)) +
	  geom_col() +
	  scale_x_discrete(name = "") +
	  scale_y_continuous(name = "") +
	  theme_minimal() +
	  labs(title = "Number of words for each of Putin's annual news conference")

	```

	## What are the most frequent keywords across all pressers

	Here are different ways to look at the words most frequently found across all annual news conferences.

	```{r mostFrequent_barchart_20}
	# Bar chart of the 30 words with the highest count across all pressers.
	# speeches_by_speech has one row per word and speech, each carrying the same
	# `total`; it is reduced to one row per word first, because geom_col() stacks
	# repeated rows and the bars would then overstate the totals.
	mostFrequent_barchart_20 <-
	  speeches_by_speech %>%
	  distinct(word, total) %>%
	  top_n(30, wt = total) %>%
	  arrange(total) %>%
	  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
	  ggplot(mapping = aes(x = word, y = total, fill = total)) +
	  geom_col() +
	  scale_x_discrete(name = "") +
	  scale_y_continuous(name = "") +
	  scale_fill_continuous(low = "yellow", high = "red") +
	  coord_flip() +
	  theme_minimal() +
	  labs(title = "Most frequent keywords in Putin's pressers", fill = "Word frequency", caption = "Data processing: Giorgio Comai")
	mostFrequent_barchart_20

	```


	```{r mostFrequent_barchart__2017_20}
	# Bar chart of the 30 most frequent words of the 2017 presser.
	# top_n() ranks by the last column of the table (here `total`) unless `wt` is
	# given, so `wt = n` is needed to rank by the frequency within the 2017 speech.
	mostFrequent_barchart__2017_20 <-
	  speeches_by_speech %>%
	  filter(id == "Putin 2017") %>%
	  top_n(30, wt = n) %>%
	  arrange(n) %>%
	  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
	  ggplot(mapping = aes(x = word, y = n, fill = n)) +
	  geom_col() +
	  scale_x_discrete(name = "") +
	  scale_y_continuous(name = "") +
	  scale_fill_continuous(low = "yellow", high = "red") +
	  coord_flip() +
	  theme_minimal() +
	  labs(title = "Most frequent keywords in Putin's 2017 presser", fill = "Word frequency", caption = "Data processing: Giorgio Comai")
	mostFrequent_barchart__2017_20

	```


	```{r mostFrequent_barchart__2017_totalcolor_20}
	# Bar chart of the 30 most frequent words of the 2017 presser, with bars
	# coloured by the frequency of each word in all pressers.
	# top_n() ranks by the last column of the table (here `total`) unless `wt` is
	# given; `wt = n` selects by the 2017 frequency, as the subtitle describes.
	mostFrequent_barchart__2017_totalcolor_20 <-
	  speeches_by_speech %>%
	  filter(id == "Putin 2017") %>%
	  top_n(30, wt = n) %>%
	  arrange(n) %>%
	  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
	  ggplot(mapping = aes(x = word, y = n, fill = total)) +
	  geom_col() +
	  scale_x_discrete(name = "") +
	  scale_y_continuous(name = "") +
	  scale_fill_continuous(low = "yellow", high = "red") +
	  coord_flip() +
	  theme_minimal() +
	  labs(title = "Most frequent keywords in Putin's 2017 annual presser", fill = "Word frequency in\nPutin's pressers", subtitle = "Terms are ordered by their frequency in the 2017 speech, but colour coded according\nto their frequency in all pressers (yellow terms on top are unusually frequent in 2017)", caption = "Data processing: Giorgio Comai")
	mostFrequent_barchart__2017_totalcolor_20

	```


	```{r keywords_circles_mostFrequent, fig.width = 8, fig.height = 6}
	# Grid of words (rows) by pressers (columns) showing, for each presser, the
	# count `n` of the 10 words of that presser with the highest overall `total`.
	# ungroup() comes before the factor is built so that its levels are computed
	# once over the whole table, not separately within each presser.
	keywords_circles_mostFrequent <- speeches_by_speech %>%
	  group_by(id) %>%
	  top_n(10, wt = total) %>%
	  ungroup() %>%
	  arrange(total) %>%
	  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
	  ggplot(mapping = aes(x = word, y = id, size = n, colour = n, label = n)) +
	  geom_point(shape = 21, stroke = 3, size = 10) +
	  geom_text(mapping = aes(size = 25), colour = "black") +
	  scale_colour_continuous(low = "yellow", high = "red") +
	  scale_x_discrete(name = "") +
	  scale_y_discrete(name = "") +
	  coord_flip() +
	  theme_minimal() +
	  theme(legend.position = "none") +
	  labs(title = "Most frequent keywords in Putin's pressers", caption = "Data processing: Giorgio Comai")

	keywords_circles_mostFrequent

	```

	## Wordclouds

	```{r wordcloud_all_pressers, fig.height=10, fig.width=10}

	# count of each word over all pressers
	word_counts <- speeches %>%
	  anti_join(stop_words, by = "word") %>%
	  count(word)

	# word cloud limited to 150 words
	wordcloud(
	  words = word_counts$word,
	  freq = word_counts$n,
	  max.words = 150,
	  random.order = FALSE,
	  random.color = FALSE,
	  colors = TRUE,
	  vfont = c("serif", "plain")
	)
	```


	## What are the most distinctive words of each presser?

	The following wordcloud compares the frequency of words across speeches, showing the words that are most frequent in each speech and are at the same time less frequently found in the other speeches.

	```{r wordcloud_all_pressers_byspeech_ss, fig.height=13, fig.width=12}
	# word counts in wide format: one row per word, one column per presser
	counts_by_speech <- speeches %>%
	  anti_join(stop_words, by = "word") %>%
	  count(id, word) %>%
	  arrange(desc(id)) %>%
	  spread(id, n)

	# words become row names; words missing from a presser get a count of 0
	term_matrix <- counts_by_speech %>%
	  select(word, `Putin 2012`, `Putin 2017`, `Putin 2016`, `Putin 2015`, `Putin 2014`, `Putin 2013`) %>%
	  remove_rownames() %>%
	  column_to_rownames("word") %>%
	  replace(is.na(.), 0)

	comparison.cloud(term_matrix,
	  max.words = 300,
	  colors = brewer.pal(6, "Dark2"),
	  vfont = c("sans serif", "plain"))

	```


	## Selected keywords

	```{r keywords_circles}
	# Keywords to compare across pressers. "citizen" is singular because the text
	# cleaning step rewrites "citizens" to "citizen" before the words are counted,
	# so the plural never occurs in speeches_by_speech.
	selected_keywords <- c("crimea", "ukraine", "europe", "motherland",
	                       "syria", "citizen", "jobs", "growth")

	# grid of keywords by pressers, labelled with the count `n` in each presser
	# (%in% replaces the chain of `\|`, which is not valid R)
	keywords_circles <- speeches_by_speech %>%
	  filter(word %in% selected_keywords) %>%
	  mutate(word = factor(word, levels = unique(word))) %>% # makes sure order is kept for graph
	  ggplot(mapping = aes(x = word, y = id, size = n, colour = n, label = n)) +
	  geom_point(shape = 21, stroke = 3, size = 10) +
	  geom_text(mapping = aes(size = 25), colour = "black") +
	  scale_colour_continuous(low = "yellow", high = "red") +
	  scale_x_discrete(name = "") +
	  scale_y_discrete(name = "") +
	  coord_flip() +
	  theme_minimal() +
	  theme(legend.position = "none") +
	  labs(title = "Frequency of selected keywords in Putin's annual pressers")

	keywords_circles
	```


	## Words with positive/negative connotation

	The following wordclouds highlight the frequency of words which have a positive and negative connotation according to the lexicon by [Bing Liu and collaborators](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html).


	### All pressers
	```{r wordcloud_allPressers_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

	# counts of the words found in the Bing lexicon, with their sentiment
	sentiment_counts <- speeches %>%
	  inner_join(get_sentiments("bing"), by = "word") %>%
	  count(word, sentiment, sort = TRUE)

	# word x sentiment matrix of counts, 0 where a combination does not occur
	sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

	comparison.cloud(sentiment_matrix,
	  colors = c("#F8766D", "#00BFC4"),
	  max.words = 100,
	  vfont = c("serif", "plain"))
	```


	### Putin 2012
	```{r wordcloud_Putin2012_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}
	# counts of the words of the 2012 presser found in the Bing lexicon
	sentiment_counts <- speeches %>%
	  filter(id == "Putin 2012") %>%
	  inner_join(get_sentiments("bing"), by = "word") %>%
	  count(word, sentiment, sort = TRUE)

	# word x sentiment matrix of counts, 0 where a combination does not occur
	sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

	comparison.cloud(sentiment_matrix,
	  colors = c("#F8766D", "#00BFC4"),
	  max.words = 100,
	  vfont = c("serif", "plain"))
	```


	### Putin 2013

	```{r wordcloud_Putin2013_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

	# counts of the words of the 2013 presser found in the Bing lexicon
	sentiment_counts <- speeches %>%
	  filter(id == "Putin 2013") %>%
	  inner_join(get_sentiments("bing"), by = "word") %>%
	  count(word, sentiment, sort = TRUE)

	# word x sentiment matrix of counts, 0 where a combination does not occur
	sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

	comparison.cloud(sentiment_matrix,
	  colors = c("#F8766D", "#00BFC4"),
	  max.words = 100,
	  vfont = c("serif", "plain"))

	```

	### Putin 2014

	```{r wordcloud_Putin2014_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

	# counts of the words of the 2014 presser found in the Bing lexicon
	sentiment_counts <- speeches %>%
	  filter(id == "Putin 2014") %>%
	  inner_join(get_sentiments("bing"), by = "word") %>%
	  count(word, sentiment, sort = TRUE)

	# word x sentiment matrix of counts, 0 where a combination does not occur
	sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

	comparison.cloud(sentiment_matrix,
	  colors = c("#F8766D", "#00BFC4"),
	  max.words = 100,
	  vfont = c("serif", "plain"))
	```

	### Putin 2015

	```{r wordcloud_Putin2015_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

	# counts of the words of the 2015 presser found in the Bing lexicon
	sentiment_counts <- speeches %>%
	  filter(id == "Putin 2015") %>%
	  inner_join(get_sentiments("bing"), by = "word") %>%
	  count(word, sentiment, sort = TRUE)

	# word x sentiment matrix of counts, 0 where a combination does not occur
	sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

	comparison.cloud(sentiment_matrix,
	  colors = c("#F8766D", "#00BFC4"),
	  max.words = 100,
	  vfont = c("serif", "plain"))

	```

	### Putin 2016

	```{r wordcloud_putin2016_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

	# counts of the words of the 2016 presser found in the Bing lexicon
	sentiment_counts <- speeches %>%
	  filter(id == "Putin 2016") %>%
	  inner_join(get_sentiments("bing"), by = "word") %>%
	  count(word, sentiment, sort = TRUE)

	# word x sentiment matrix of counts, 0 where a combination does not occur
	sentiment_matrix <- acast(sentiment_counts, word ~ sentiment, value.var = "n", fill = 0)

	comparison.cloud(sentiment_matrix,
	  colors = c("#F8766D", "#00BFC4"),
	  max.words = 100,
	  vfont = c("serif", "plain"))
	```


	### Putin 2017

	```{r wordcloud_putin2017_sentiment, fig.height=5.8, fig.width=5.8, include=TRUE}

	# Sentiment word cloud for the 2017 presser: words found in the Bing lexicon,
	# counted by sentiment. The filter selects "Putin 2017", matching the section
	# heading and the chunk label.
	speeches %>%
	  filter(id == "Putin 2017") %>%
	  inner_join(get_sentiments("bing"), by = "word") %>%
	  count(word, sentiment, sort = TRUE) %>%
	  acast(word ~ sentiment, value.var = "n", fill = 0) %>% # word x sentiment matrix, 0 where absent
	  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
	    max.words = 100, vfont = c("serif", "plain"))
	```


	## What about the "people"

	This wordcloud is based on all words included in the same sentence as "people".

	```{r wordcloud_crisis}
	# One row per sentence, keeping the columns of `links`. A space is inserted
	# after every full stop before the text is split into sentences.
	speeches_by_sentence <- bind_cols(
	  id = paste(links$President, links$Year),
	  links,
	  txt = gsub(pattern = ".", replacement = ". ", x = text, fixed = TRUE)
	) %>%
	  unnest_tokens(sentence, txt, to_lower = TRUE, token = "sentences")

	# storing for next post
	saveRDS(object = speeches_by_sentence, file.path("static", "putin-presser", "speeches_by_sentence.rds"))

	# sentences containing "people"
	people_sentences <- speeches_by_sentence %>%
	  filter(stringr::str_detect(string = sentence, pattern = "people"))

	# word cloud of the words found in those sentences, stop words excluded
	people_word_counts <- people_sentences %>%
	  unnest_tokens(input = sentence, output = word) %>%
	  anti_join(stop_words, by = "word") %>%
	  count(word)

	wordcloud(
	  words = people_word_counts$word,
	  freq = people_word_counts$n,
	  scale = c(3, 1),
	  max.words = 50,
	  random.order = FALSE,
	  vfont = c("sans serif", "plain")
	)

	# interactive table of the same sentences, with "people" highlighted in yellow
	highlighted <- paste0("<span style='background-color: #FFFF00'>", "people", "</span>")

	people_table <- people_sentences %>%
	  select(Year, sentence) %>%
	  arrange(Year) %>%
	  mutate(sentence = stringr::str_replace_all(
	    string = sentence,
	    pattern = stringr::regex("people", ignore_case = TRUE),
	    replacement = highlighted
	  ))

	DT::datatable(people_table,
	  escape = FALSE, # the highlight is inserted as html
	  options = list(pageLength = 5, lengthMenu = c(3, 5, 10, 15, 20)))
	```



	## About the author

	Giorgio Comai is a researcher at [OBC Transeuropa](https://twitter.com/BalkansCaucasus), crunching data with [EdjNet](https://twitter.com/EdjNet), previously at Dublin City University. He is
	[on Twitter](https://twitter.com/giocomai). His website is https://giorgiocomai.eu/.