# earino/motivating_example_weinstein.R

## motivating_example_weinstein.R
# From the blog post on the Weinstein Effect
# https://www.gokhanciflikli.com/post/weinstein-effect/

library(GuardianR)
library(stringr)
library(tidyverse)
library(tidytext)
library(lubridate)
library(rvest)
library(ggplot2)
library(ggrepel)
library(scales)

# Pin the session time zone before any date is computed. The original author
# flags this as important; NOTE(review): presumably because the date
# comparison further down depends on the session time zone - confirm.
Sys.setenv(TZ = "Europe/Budapest") # IMPORTANT

# Query window: the five years up to today, as "YYYY-MM-DD" strings.
# today() takes the date in the session time zone, whereas as.Date() on a
# date-time defaults to UTC and can be a day off around midnight.
todays_date <- as.character(today())
# %m-% rolls back to the last valid day, so running on Feb 29 does not give NA.
start_date <- as.character(today() %m-% years(5))

# API key comes from the environment; Sys.getenv() yields "" when it is unset.
guardian_access_key <- Sys.getenv("GUARDIAN_ACCESS_KEY")

# Download the articles once and cache them on disk; when articles.csv
# already exists the API is not called at all.
if (!file.exists("articles.csv")) {
  # Fail with a clear message instead of sending a request with an empty key.
  if (!nzchar(guardian_access_key)) {
    stop("GUARDIAN_ACCESS_KEY is not set; cannot download articles.",
         call. = FALSE)
  }

  articles <- get_guardian(keywords = "sexual+harassment",
                           section = "world",
                           from.date = start_date,
                           to.date = todays_date,
                           api.key = guardian_access_key)

  write_csv(articles, "articles.csv")
}

# Second words such as "hasn" that get a "t" appended further down.
# NOTE(review): presumably contractions ("hasn't") whose apostrophe part was
# lost during cleaning/tokenising - confirm against the raw text.
fix_apos <- c(
  "hasn", "hadn", "doesn", "didn",
  "isn", "wasn", "couldn", "wouldn"
)

# Build, separately for articles before and from 2017-10-05, a table with one
# row per word that follows "he" or "she", its smoothed share after each
# pronoun and the log2 ratio between the two shares.
articles <- read_csv("articles.csv") %>%
  select(webPublicationDate, body) %>%
  mutate(
    # sub = "byte" turns every non-ASCII byte into a "<xx>" escape, which the
    # tag-stripping regex on the next line removes along with the HTML markup.
    body = iconv(body, "", "ASCII", "byte"),
    body = gsub("<.*?>", "", body),
    before = webPublicationDate < "2017-10-05"
  ) %>%
  unnest_tokens(bigram, body, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ", remove = FALSE) %>%
  filter(word1 %in% c("he", "she")) %>%
  mutate(word2 = ifelse(word2 %in% fix_apos, str_c(word2, "t"), word2)) %>%
  count(before, word1, word2) %>%
  group_by(before) %>%
  # One column per pronoun; a word never seen after one of them counts as 0.
  spread(key = word1, value = n, fill = 0) %>%
  mutate(
    total = he + she,
    # Add-one smoothing, normalised within each `before` group.
    he = (he + 1) / sum(he + 1),
    she = (she + 1) / sum(she + 1),
    log.ratio = log2(she / he),
    abs.ratio = abs(log.ratio)
  ) %>%
  arrange(desc(log.ratio))

# Bar chart for articles published before 2017-10-05: the 15 most skewed
# words in each direction, among words seen at least 5 times after he/she.
articles %>%
  filter(before == TRUE) %>%
  filter(!word2 %in% c("himself", "herself", "ever", "quickly",
                       "actually", "sexually", "allegedly", "have"),
         total >= 5) %>%
  # Rank inside each direction so both sides of the chart get their own top 15.
  group_by(direction = ifelse(log.ratio > 0, 'More "she"', "More 'he'")) %>%
  top_n(15, abs.ratio) %>%
  ungroup() %>%
  mutate(word2 = reorder(word2, log.ratio)) %>%
  ggplot(aes(word2, log.ratio, fill = direction)) +
  geom_col() +
  coord_flip() +
  labs(x = "",
       y = 'Relative appearance after "she" compared to "he"',
       fill = "",
       title = "Pre Weinstein: 2012-17 The Guardian Articles on Sexual Harassment",
       subtitle = "Top 15 Most Gendered (Skewed) Verbs after he/she; at least 5 occurrences.") +
  # log.ratio is a log2 ratio, so a break at k stands for a 2^|k|-fold
  # difference: 1 -> 2X, 2 -> 4X, 3 -> 8X, 4 -> 16X.
  scale_y_continuous(breaks = seq(-4, 4),
                     labels = c("16X", "8X", "4X", "2X", "Same",
                                "2X", "4X", "8X", "16X")) +
  guides(fill = guide_legend(reverse = TRUE)) +
  expand_limits(y = c(-4, 4))

# Scatter plot for articles published before 2017-10-05: total uses of a word
# after he/she against its log2 she/he ratio, for words seen at least 10 times.
articles %>%
  filter(before == TRUE) %>%
  filter(!word2 %in% c("himself", "herself", "she", "too", "later", "apos", "just", "says"),
         total >= 10) %>%
  (function(words) {
    # Only the most skewed words are drawn; the subtitle reports the counts
    # from the data actually loaded instead of hard-coded figures.
    shown <- top_n(words, 100, abs.ratio)
    counts_line <- sprintf(
      "%s unique words (%s displayed) | %s occurrences in total",
      format(nrow(words), big.mark = ","),
      format(nrow(shown), big.mark = ","),
      format(sum(words$total), big.mark = ",")
    )

    ggplot(shown, aes(total, log.ratio)) +
      geom_point() +
      # Colour "NA" draws nothing; NOTE(review): presumably kept only so the
      # x-axis range includes 5 - confirm.
      geom_vline(xintercept = 5, color = "NA") +
      geom_hline(yintercept = 0, color = "red") +
      scale_x_log10(breaks = c(10, 100, 1000)) +
      geom_text_repel(aes(label = word2), segment.alpha = .1, force = 2) +
      # log.ratio is log2, so a break at k stands for a 2^|k|-fold difference.
      scale_y_continuous(breaks = seq(-4, 4),
                         labels = c('16X "he"', '8X "he"', '4X "he"', '2X "he"', "Same",
                                    '2X "she"', '4X "she"', '8X "she"', '16X "she"')) +
      labs(x = 'Total uses after "he" or "she" (Logarithmic scale)',
           y = 'Relative uses after "she" to after "he"',
           title = "Gendered Reporting: Pre Weinstein, The Guardian",
           subtitle = paste0("Words occurring at least 10 times after he/she:\n",
                             counts_line)) +
      expand_limits(y = c(4, -4))
  })


# Bar chart for articles published on or after 2017-10-05: the 15 most skewed
# words in each direction, among words seen at least 5 times after he/she.
articles %>%
  filter(before == FALSE) %>%
  filter(!word2 %in% c("himself", "herself", "ever", "quickly",
                       "actually", "sexually", "allegedly", "have"),
         total >= 5) %>%
  # Rank inside each direction so both sides of the chart get their own top 15.
  group_by(direction = ifelse(log.ratio > 0, 'More "she"', "More 'he'")) %>%
  top_n(15, abs.ratio) %>%
  ungroup() %>%
  mutate(word2 = reorder(word2, log.ratio)) %>%
  ggplot(aes(word2, log.ratio, fill = direction)) +
  geom_col() +
  coord_flip() +
  # The title names the post-Weinstein period, matching the filter above.
  labs(x = "",
       y = 'Relative appearance after "she" compared to "he"',
       fill = "",
       title = "Post Weinstein: The Guardian Articles on Sexual Harassment",
       subtitle = "Top 15 Most Gendered (Skewed) Verbs after he/she; at least 5 occurrences.") +
  # log.ratio is a log2 ratio, so a break at k stands for a 2^|k|-fold
  # difference: 1 -> 2X, 2 -> 4X, 3 -> 8X, 4 -> 16X.
  scale_y_continuous(breaks = seq(-4, 4),
                     labels = c("16X", "8X", "4X", "2X", "Same",
                                "2X", "4X", "8X", "16X")) +
  guides(fill = guide_legend(reverse = TRUE)) +
  expand_limits(y = c(-4, 4))

# Scatter plot for articles published on or after 2017-10-05: total uses of a
# word after he/she against its log2 she/he ratio, for words seen at least
# twice.
articles %>%
  filter(before == FALSE) %>%
  filter(!word2 %in% c("himself", "herself", "she", "too", "later", "apos", "just", "says"),
         total >= 2) %>%
  (function(words) {
    # Only the most skewed words are drawn; the subtitle reports the counts
    # from the data actually loaded instead of figures copied from the
    # pre-Weinstein plot.
    shown <- top_n(words, 100, abs.ratio)
    counts_line <- sprintf(
      "%s unique words (%s displayed) | %s occurrences in total",
      format(nrow(words), big.mark = ","),
      format(nrow(shown), big.mark = ","),
      format(sum(words$total), big.mark = ",")
    )

    ggplot(shown, aes(total, log.ratio)) +
      geom_point() +
      # Colour "NA" draws nothing; NOTE(review): presumably kept only so the
      # x-axis range includes 5 - confirm.
      geom_vline(xintercept = 5, color = "NA") +
      geom_hline(yintercept = 0, color = "red") +
      scale_x_log10(breaks = c(10, 100, 1000)) +
      geom_text_repel(aes(label = word2), segment.alpha = .1, force = 2) +
      # log.ratio is log2, so a break at k stands for a 2^|k|-fold difference.
      scale_y_continuous(breaks = seq(-4, 4),
                         labels = c('16X "he"', '8X "he"', '4X "he"', '2X "he"', "Same",
                                    '2X "she"', '4X "she"', '8X "she"', '16X "she"')) +
      # Title and threshold text match the filters above (post period, >= 2).
      labs(x = 'Total uses after "he" or "she" (Logarithmic scale)',
           y = 'Relative uses after "she" to after "he"',
           title = "Gendered Reporting: Post Weinstein, The Guardian",
           subtitle = paste0("Words occurring at least 2 times after he/she:\n",
                             counts_line)) +
      expand_limits(y = c(4, -4))
  })
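	# NOTE(review): from here to the end of the file the script above is
	# repeated (tab-indented), so running the file executes the analysis twice.
	# From the blog post on the Weinstein Effect
	# https://www.gokhanciflikli.com/post/weinstein-effect/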

	library(GuardianR)
	library(stringr)
	library(tidyverse)
	library(tidytext)
	library(lubridate)
	library(rvest)
	library(ggplot2)
	library(ggrepel)
	library(scales)

	# Pin the session time zone before any date is computed. The original author
	# flags this as important; NOTE(review): presumably because the date
	# comparison further down depends on the session time zone - confirm.
	Sys.setenv(TZ = "Europe/Budapest") # IMPORTANT

	# Query window: the five years up to today, as "YYYY-MM-DD" strings.
	# today() takes the date in the session time zone, whereas as.Date() on a
	# date-time defaults to UTC and can be a day off around midnight.
	todays_date <- as.character(today())
	# %m-% rolls back to the last valid day, so running on Feb 29 does not give NA.
	start_date <- as.character(today() %m-% years(5))

	# API key comes from the environment; Sys.getenv() yields "" when it is unset.
	guardian_access_key <- Sys.getenv("GUARDIAN_ACCESS_KEY")

	# Download the articles once and cache them on disk; when articles.csv
	# already exists the API is not called at all.
	if (!file.exists("articles.csv")) {
	  # Fail with a clear message instead of sending a request with an empty key.
	  if (!nzchar(guardian_access_key)) {
	    stop("GUARDIAN_ACCESS_KEY is not set; cannot download articles.",
	         call. = FALSE)
	  }

	  articles <- get_guardian(keywords = "sexual+harassment",
	                           section = "world",
	                           from.date = start_date,
	                           to.date = todays_date,
	                           api.key = guardian_access_key)

	  write_csv(articles, "articles.csv")
	}

	# Second words such as "hasn" that get a "t" appended further down.
	# NOTE(review): presumably contractions ("hasn't") whose apostrophe part was
	# lost during cleaning/tokenising - confirm against the raw text.
	fix_apos <- c(
	  "hasn", "hadn", "doesn", "didn",
	  "isn", "wasn", "couldn", "wouldn"
	)

	# Build, separately for articles before and from 2017-10-05, a table with one
	# row per word that follows "he" or "she", its smoothed share after each
	# pronoun and the log2 ratio between the two shares.
	articles <- read_csv("articles.csv") %>%
	  select(webPublicationDate, body) %>%
	  mutate(
	    # sub = "byte" turns every non-ASCII byte into a "<xx>" escape, which the
	    # tag-stripping regex on the next line removes along with the HTML markup.
	    body = iconv(body, "", "ASCII", "byte"),
	    body = gsub("<.*?>", "", body),
	    before = webPublicationDate < "2017-10-05"
	  ) %>%
	  unnest_tokens(bigram, body, token = "ngrams", n = 2) %>%
	  separate(bigram, into = c("word1", "word2"), sep = " ", remove = FALSE) %>%
	  filter(word1 %in% c("he", "she")) %>%
	  mutate(word2 = ifelse(word2 %in% fix_apos, str_c(word2, "t"), word2)) %>%
	  count(before, word1, word2) %>%
	  group_by(before) %>%
	  # One column per pronoun; a word never seen after one of them counts as 0.
	  spread(key = word1, value = n, fill = 0) %>%
	  mutate(
	    total = he + she,
	    # Add-one smoothing, normalised within each `before` group.
	    he = (he + 1) / sum(he + 1),
	    she = (she + 1) / sum(she + 1),
	    log.ratio = log2(she / he),
	    abs.ratio = abs(log.ratio)
	  ) %>%
	  arrange(desc(log.ratio))

	# Bar chart for articles published before 2017-10-05: the 15 most skewed
	# words in each direction, among words seen at least 5 times after he/she.
	articles %>%
	  filter(before == TRUE) %>%
	  filter(!word2 %in% c("himself", "herself", "ever", "quickly",
	                       "actually", "sexually", "allegedly", "have"),
	         total >= 5) %>%
	  # Rank inside each direction so both sides of the chart get their own top 15.
	  group_by(direction = ifelse(log.ratio > 0, 'More "she"', "More 'he'")) %>%
	  top_n(15, abs.ratio) %>%
	  ungroup() %>%
	  mutate(word2 = reorder(word2, log.ratio)) %>%
	  ggplot(aes(word2, log.ratio, fill = direction)) +
	  geom_col() +
	  coord_flip() +
	  labs(x = "",
	       y = 'Relative appearance after "she" compared to "he"',
	       fill = "",
	       title = "Pre Weinstein: 2012-17 The Guardian Articles on Sexual Harassment",
	       subtitle = "Top 15 Most Gendered (Skewed) Verbs after he/she; at least 5 occurrences.") +
	  # log.ratio is a log2 ratio, so a break at k stands for a 2^|k|-fold
	  # difference: 1 -> 2X, 2 -> 4X, 3 -> 8X, 4 -> 16X.
	  scale_y_continuous(breaks = seq(-4, 4),
	                     labels = c("16X", "8X", "4X", "2X", "Same",
	                                "2X", "4X", "8X", "16X")) +
	  guides(fill = guide_legend(reverse = TRUE)) +
	  expand_limits(y = c(-4, 4))

	# Scatter plot for articles published before 2017-10-05: total uses of a word
	# after he/she against its log2 she/he ratio, for words seen at least 10 times.
	articles %>%
	  filter(before == TRUE) %>%
	  filter(!word2 %in% c("himself", "herself", "she", "too", "later", "apos", "just", "says"),
	         total >= 10) %>%
	  (function(words) {
	    # Only the most skewed words are drawn; the subtitle reports the counts
	    # from the data actually loaded instead of hard-coded figures.
	    shown <- top_n(words, 100, abs.ratio)
	    # Plain "|" here: "\|" is not a valid escape in an R string.
	    counts_line <- sprintf(
	      "%s unique words (%s displayed) | %s occurrences in total",
	      format(nrow(words), big.mark = ","),
	      format(nrow(shown), big.mark = ","),
	      format(sum(words$total), big.mark = ",")
	    )

	    ggplot(shown, aes(total, log.ratio)) +
	      geom_point() +
	      # Colour "NA" draws nothing; NOTE(review): presumably kept only so the
	      # x-axis range includes 5 - confirm.
	      geom_vline(xintercept = 5, color = "NA") +
	      geom_hline(yintercept = 0, color = "red") +
	      scale_x_log10(breaks = c(10, 100, 1000)) +
	      geom_text_repel(aes(label = word2), segment.alpha = .1, force = 2) +
	      # log.ratio is log2, so a break at k stands for a 2^|k|-fold difference.
	      scale_y_continuous(breaks = seq(-4, 4),
	                         labels = c('16X "he"', '8X "he"', '4X "he"', '2X "he"', "Same",
	                                    '2X "she"', '4X "she"', '8X "she"', '16X "she"')) +
	      labs(x = 'Total uses after "he" or "she" (Logarithmic scale)',
	           y = 'Relative uses after "she" to after "he"',
	           title = "Gendered Reporting: Pre Weinstein, The Guardian",
	           subtitle = paste0("Words occurring at least 10 times after he/she:\n",
	                             counts_line)) +
	      expand_limits(y = c(4, -4))
	  })


	# Bar chart for articles published on or after 2017-10-05: the 15 most skewed
	# words in each direction, among words seen at least 5 times after he/she.
	articles %>%
	  filter(before == FALSE) %>%
	  filter(!word2 %in% c("himself", "herself", "ever", "quickly",
	                       "actually", "sexually", "allegedly", "have"),
	         total >= 5) %>%
	  # Rank inside each direction so both sides of the chart get their own top 15.
	  group_by(direction = ifelse(log.ratio > 0, 'More "she"', "More 'he'")) %>%
	  top_n(15, abs.ratio) %>%
	  ungroup() %>%
	  mutate(word2 = reorder(word2, log.ratio)) %>%
	  ggplot(aes(word2, log.ratio, fill = direction)) +
	  geom_col() +
	  coord_flip() +
	  # The title names the post-Weinstein period, matching the filter above.
	  labs(x = "",
	       y = 'Relative appearance after "she" compared to "he"',
	       fill = "",
	       title = "Post Weinstein: The Guardian Articles on Sexual Harassment",
	       subtitle = "Top 15 Most Gendered (Skewed) Verbs after he/she; at least 5 occurrences.") +
	  # log.ratio is a log2 ratio, so a break at k stands for a 2^|k|-fold
	  # difference: 1 -> 2X, 2 -> 4X, 3 -> 8X, 4 -> 16X.
	  scale_y_continuous(breaks = seq(-4, 4),
	                     labels = c("16X", "8X", "4X", "2X", "Same",
	                                "2X", "4X", "8X", "16X")) +
	  guides(fill = guide_legend(reverse = TRUE)) +
	  expand_limits(y = c(-4, 4))

	# Scatter plot for articles published on or after 2017-10-05: total uses of a
	# word after he/she against its log2 she/he ratio, for words seen at least
	# twice.
	articles %>%
	  filter(before == FALSE) %>%
	  filter(!word2 %in% c("himself", "herself", "she", "too", "later", "apos", "just", "says"),
	         total >= 2) %>%
	  (function(words) {
	    # Only the most skewed words are drawn; the subtitle reports the counts
	    # from the data actually loaded instead of figures copied from the
	    # pre-Weinstein plot.
	    shown <- top_n(words, 100, abs.ratio)
	    # Plain "|" here: "\|" is not a valid escape in an R string.
	    counts_line <- sprintf(
	      "%s unique words (%s displayed) | %s occurrences in total",
	      format(nrow(words), big.mark = ","),
	      format(nrow(shown), big.mark = ","),
	      format(sum(words$total), big.mark = ",")
	    )

	    ggplot(shown, aes(total, log.ratio)) +
	      geom_point() +
	      # Colour "NA" draws nothing; NOTE(review): presumably kept only so the
	      # x-axis range includes 5 - confirm.
	      geom_vline(xintercept = 5, color = "NA") +
	      geom_hline(yintercept = 0, color = "red") +
	      scale_x_log10(breaks = c(10, 100, 1000)) +
	      geom_text_repel(aes(label = word2), segment.alpha = .1, force = 2) +
	      # log.ratio is log2, so a break at k stands for a 2^|k|-fold difference.
	      scale_y_continuous(breaks = seq(-4, 4),
	                         labels = c('16X "he"', '8X "he"', '4X "he"', '2X "he"', "Same",
	                                    '2X "she"', '4X "she"', '8X "she"', '16X "she"')) +
	      # Title and threshold text match the filters above (post period, >= 2).
	      labs(x = 'Total uses after "he" or "she" (Logarithmic scale)',
	           y = 'Relative uses after "she" to after "he"',
	           title = "Gendered Reporting: Post Weinstein, The Guardian",
	           subtitle = paste0("Words occurring at least 2 times after he/she:\n",
	                             counts_line)) +
	      expand_limits(y = c(4, -4))
	  })