Skip to content

Instantly share code, notes, and snippets.

@earino
Created February 16, 2018 16:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save earino/9d15c361d0c3b5523989463c02d80a3e to your computer and use it in GitHub Desktop.
Save earino/9d15c361d0c3b5523989463c02d80a3e to your computer and use it in GitHub Desktop.
# From the blog post on the Weinstein Effect
# https://www.gokhanciflikli.com/post/weinstein-effect/
library(GuardianR)
library(stringr)
library(tidyverse)
library(tidytext)
library(lubridate)
library(rvest)
library(ggplot2)
library(ggrepel)
library(scales)
# Session setup: time zone, query window and API credentials.
#
# The time zone is pinned for the whole session (flagged IMPORTANT by the
# original author). NOTE(review): presumably this keeps the date handling
# below, and later comparisons of timestamps against date strings,
# independent of the machine's locale -- confirm.
Sys.setenv(TZ = "Europe/Budapest") #IMPORTANT

# Read the clock once so both ends of the query window derive from the same
# instant.
run_time <- now()
todays_date <- as.character(as.Date(run_time))

# `%m-%` rolls back to the last valid day of the month. Plain `- years(5)`
# yields NA when the script runs on 29 February, because the same calendar
# day five years earlier does not exist.
start_date <- as.character(as.Date(run_time %m-% years(5)))

# The API key comes from the environment; an unset variable yields "".
guardian_access_key <- Sys.getenv("GUARDIAN_ACCESS_KEY")
# Download the Guardian articles once and cache them on disk. Later runs skip
# the API call entirely for as long as articles.csv exists.
if (!file.exists("articles.csv")) {
  # Sys.getenv() returns "" for an unset variable. Fail early with a clear
  # message instead of sending a request without a key.
  if (!nzchar(guardian_access_key)) {
    stop("GUARDIAN_ACCESS_KEY is not set; cannot download articles.",
         call. = FALSE)
  }

  # Query window and key are the values prepared earlier in the script.
  articles <- get_guardian(keywords = "sexual+harassment",
                           section = "world",
                           from.date = start_date,
                           to.date = todays_date,
                           api.key = guardian_access_key)
  write_csv(articles, "articles.csv")
}
# Contraction stems that get a trailing "t" re-attached further down.
# NOTE(review): presumably these are what remains of "hasn't", "didn't", ...
# once the apostrophe has been split off during tokenisation -- confirm.
fix_apos <- c("hasn", "hadn", "doesn", "didn", "isn", "wasn", "couldn", "wouldn")

# Build the per-period table of words that follow "he" / "she".
# The result has one row per (before, word2) and stays grouped by `before`.
articles <- local({
  # Stage 1: load the cached articles, keeping the timestamp and the text.
  raw_articles <- read_csv("articles.csv") %>%
    select(webPublicationDate, body)

  # Stage 2: clean the text and flag the period.
  # iconv(..., "byte") turns every non-ASCII byte into a "<xx>" escape, so the
  # tag-stripping regex removes those escapes together with the HTML markup.
  cleaned_articles <- raw_articles %>%
    mutate(
      body = iconv(body, "", "ASCII", "byte"),
      body = gsub("<.*?>", "", body),
      # TRUE for articles published before the 2017-10-05 cut-off.
      before = webPublicationDate < "2017-10-05"
    )

  # Stage 3: two-word sequences whose first word is a gendered pronoun.
  pronoun_pairs <- cleaned_articles %>%
    unnest_tokens(bigram, body, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), remove = FALSE, sep = " ") %>%
    filter(word1 %in% c("he", "she")) %>%
    mutate(word2 = ifelse(word2 %in% fix_apos, str_c(word2, "t"), word2))

  # Stage 4: count per period, spread he/she into columns and compute the
  # smoothed (+1) shares within each period plus their base-2 log ratio.
  pronoun_pairs %>%
    group_by(before) %>%
    count(word1, word2) %>%
    spread(word1, n, fill = 0) %>%
    mutate(
      total = he + she,
      he = (he + 1) / sum(he + 1),
      she = (she + 1) / sum(she + 1),
      log.ratio = log2(she / he),
      abs.ratio = abs(log.ratio)
    ) %>%
    arrange(desc(log.ratio))
})
# Bar chart: the 15 most skewed words on each side ("she" vs "he") among
# articles flagged `before`, restricted to words with at least 5 uses.
articles %>%
  filter(before == TRUE) %>%
  filter(!word2 %in% c("himself", "herself", "ever", "quickly",
                       "actually", "sexually", "allegedly", "have"),
         total >= 5) %>%
  # Grouping by the sign of the log ratio lets top_n() pick 15 per side.
  group_by(direction = ifelse(log.ratio > 0, 'More "she"', "More 'he'")) %>%
  top_n(15, abs.ratio) %>%
  ungroup() %>%
  mutate(word2 = reorder(word2, log.ratio)) %>%
  ggplot(aes(word2, log.ratio, fill = direction)) +
  geom_col() +
  coord_flip() +
  labs(x = "",
       y = 'Relative appearance after "she" compared to "he"',
       fill = "",
       title = "Pre Weinstein: 2012-17 The Guardian Articles on Sexual Harassment",
       subtitle = "Top 15 Most Gendered (Skewed) Verbs after he/she; at least 5 occurrences.") +
  # log.ratio is a base-2 log, so a tick at k means a 2^k-fold difference:
  # +/-3 is 8X and +/-4 is 16X (the previous labels read "6X" and "8X").
  scale_y_continuous(labels = c("16X", "8X", "4X", "2X", "Same",
                                "2X", "4X", "8X", "16X"),
                     breaks = seq(-4, 4)) +
  guides(fill = guide_legend(reverse = TRUE)) +
  expand_limits(y = c(-4, 4))
# Scatter plot for articles flagged `before`: total uses of a word after
# "he"/"she" (log x axis) against its she/he log ratio, labelling the 100 most
# skewed words. Wrapped in local() so the helper variables stay out of the
# global environment; the plot is the value of the block and is printed as
# before.
local({
  # Minimum number of uses for a word to be considered.
  min_uses <- 10

  eligible <- articles %>%
    filter(before == TRUE) %>%
    filter(!word2 %in% c("himself", "herself", "she", "too", "later", "apos", "just", "says"),
           total >= min_uses)
  shown <- eligible %>%
    top_n(100, abs.ratio)

  # The subtitle figures are derived from the data instead of being
  # hard-coded, because the query window moves with the run date.
  # NOTE(review): counts are taken after the word exclusions above; confirm
  # this matches how the original figures were obtained.
  plot_subtitle <- paste0(
    "Words occurring at least ", min_uses, " times after he/she:\n",
    format(nrow(eligible), big.mark = ","), " unique words (",
    format(nrow(shown), big.mark = ","), " displayed) | ",
    format(sum(eligible$total), big.mark = ",", scientific = FALSE),
    " occurrences in total"
  )

  ggplot(shown, aes(total, log.ratio)) +
    geom_point() +
    # Colour string "NA": presumably an invisible line that only stretches the
    # x axis down to 5 -- confirm.
    geom_vline(xintercept = 5, color = "NA") +
    geom_hline(yintercept = 0, color = "red") +
    scale_x_log10(breaks = c(10, 100, 1000)) +
    geom_text_repel(aes(label = word2), segment.alpha = .1, force = 2) +
    # log.ratio is a base-2 log, so a tick at k means a 2^k-fold difference:
    # +/-3 is 8X and +/-4 is 16X (the previous labels read "6X" and "8X").
    scale_y_continuous(breaks = seq(-4, 4),
                       labels = c('16X "he"', '8X "he"', '4X "he"', '2X "he"', "Same",
                                  '2X "she"', '4X "she"', '8X "she"', '16X "she"')) +
    labs(x = 'Total uses after "he" or "she" (Logarithmic scale)',
         y = 'Relative uses after "she" to after "he"',
         title = "Gendered Reporting: Pre Weinstein, The Guardian",
         subtitle = plot_subtitle) +
    expand_limits(y = c(4, -4))
})
# Bar chart: the 15 most skewed words on each side ("she" vs "he") among
# articles NOT flagged `before` (the post-Weinstein period), restricted to
# words with at least 5 uses.
articles %>%
  filter(before == FALSE) %>%
  filter(!word2 %in% c("himself", "herself", "ever", "quickly",
                       "actually", "sexually", "allegedly", "have"),
         total >= 5) %>%
  # Grouping by the sign of the log ratio lets top_n() pick 15 per side.
  group_by(direction = ifelse(log.ratio > 0, 'More "she"', "More 'he'")) %>%
  top_n(15, abs.ratio) %>%
  ungroup() %>%
  mutate(word2 = reorder(word2, log.ratio)) %>%
  ggplot(aes(word2, log.ratio, fill = direction)) +
  geom_col() +
  coord_flip() +
  labs(x = "",
       y = 'Relative appearance after "she" compared to "he"',
       fill = "",
       # The data is the post-period subset; the title previously still read
       # "Pre Weinstein: 2012-17", copied from the pre-period chart.
       title = "Post Weinstein: The Guardian Articles on Sexual Harassment",
       subtitle = "Top 15 Most Gendered (Skewed) Verbs after he/she; at least 5 occurrences.") +
  # log.ratio is a base-2 log, so a tick at k means a 2^k-fold difference:
  # +/-3 is 8X and +/-4 is 16X (the previous labels read "6X" and "8X").
  scale_y_continuous(labels = c("16X", "8X", "4X", "2X", "Same",
                                "2X", "4X", "8X", "16X"),
                     breaks = seq(-4, 4)) +
  guides(fill = guide_legend(reverse = TRUE)) +
  expand_limits(y = c(-4, 4))
# Scatter plot for articles NOT flagged `before` (the post-Weinstein period):
# total uses of a word after "he"/"she" (log x axis) against its she/he log
# ratio, labelling the 100 most skewed words. Wrapped in local() so the helper
# variables stay out of the global environment; the plot is the value of the
# block and is printed as before.
local({
  # Minimum number of uses for a word to be considered. The filter has always
  # used 2 here; the subtitle previously claimed 10.
  min_uses <- 2

  eligible <- articles %>%
    filter(before == FALSE) %>%
    filter(!word2 %in% c("himself", "herself", "she", "too", "later", "apos", "just", "says"),
           total >= min_uses)
  shown <- eligible %>%
    top_n(100, abs.ratio)

  # The subtitle figures are derived from the data. The previous hard-coded
  # values (160 words / 11,013 occurrences) were copied from the pre-period
  # plot and did not describe this subset.
  # NOTE(review): counts are taken after the word exclusions above; confirm
  # this is the intended definition.
  plot_subtitle <- paste0(
    "Words occurring at least ", min_uses, " times after he/she:\n",
    format(nrow(eligible), big.mark = ","), " unique words (",
    format(nrow(shown), big.mark = ","), " displayed) | ",
    format(sum(eligible$total), big.mark = ",", scientific = FALSE),
    " occurrences in total"
  )

  ggplot(shown, aes(total, log.ratio)) +
    geom_point() +
    # Colour string "NA": presumably an invisible line that only stretches the
    # x axis down to 5 -- confirm.
    geom_vline(xintercept = 5, color = "NA") +
    geom_hline(yintercept = 0, color = "red") +
    scale_x_log10(breaks = c(10, 100, 1000)) +
    geom_text_repel(aes(label = word2), segment.alpha = .1, force = 2) +
    # log.ratio is a base-2 log, so a tick at k means a 2^k-fold difference:
    # +/-3 is 8X and +/-4 is 16X (the previous labels read "6X" and "8X").
    scale_y_continuous(breaks = seq(-4, 4),
                       labels = c('16X "he"', '8X "he"', '4X "he"', '2X "he"', "Same",
                                  '2X "she"', '4X "she"', '8X "she"', '16X "she"')) +
    labs(x = 'Total uses after "he" or "she" (Logarithmic scale)',
         y = 'Relative uses after "she" to after "he"',
         # Post-period subset; the title previously read "Pre Weinstein".
         title = "Gendered Reporting: Post Weinstein, The Guardian",
         subtitle = plot_subtitle) +
    expand_limits(y = c(4, -4))
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment