Created
May 13, 2017 15:50
-
-
Save tcash21/826e167d56e17b505cce519162c7028b to your computer and use it in GitHub Desktop.
OpenVisConf Transcript Analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(tidytext) | |
library(stringr) | |
library(purrr) | |
library(ggplot2) | |
library(tidyr) | |
options(stringsAsFactors = FALSE) | |
## use phantomjs to download the webpage content (javascript-generated) | |
system("phantomjs scrape.js") | |
h <- html("openviz.txt") | |
## get all <a> tags | |
links <- h %>% html_nodes('a') | |
## map them to a data.frame using purrr | |
links_df <- | |
links %>% | |
map(xml_attrs) %>% | |
map_df(~as.list(.)) | |
## grep out just the transcript URLs | |
transcript_urls <- links_df[grep("transcripts", links_df$href),]$href | |
## store them into a list | |
base_url <- 'https://openvisconf.com' | |
transcripts <- paste0(base_url, transcript_urls) | |
transcript_txts<-sapply(transcripts, function(x) readLines(x, encoding = "UTF-8")) | |
## Assign genders to the speakers | |
genders<-data.frame(transcript = transcripts, gender="M") | |
genders[grep("Shirley|Amanda|Lisa|Julia|Amy|Amelia", genders$transcript),]$gender <- 'F' | |
## Remove this talk because it has male and female speakers and no way to differentiate | |
i <- grep("Ignazio", genders$transcript) | |
genders <- genders[-i,] | |
transcript_txts <- transcript_txts[-i] | |
## separate transcripts out by gender | |
female_dfs <- lapply(transcript_txts[genders$gender == 'F'], function(x) data_frame(txt=x)) | |
male_dfs <- lapply(transcript_txts[genders$gender == 'M'], function(x) data_frame(txt=x)) | |
female_dfs_all <- do.call('rbind', female_dfs) | |
male_dfs_all <- do.call('rbind', male_dfs) | |
data(stop_words) | |
## 1grams for women | |
f_1grams <- | |
female_dfs_all %>% | |
unnest_tokens(word, txt) %>% | |
anti_join(stop_words) | |
## top 10 1grams for women | |
f_top10 <- f_1grams %>% | |
count(word, sort = TRUE) %>% | |
mutate(freq = n/as.integer(count(f_1grams))) %>% | |
head(10) | |
## Plot most commonly used words by women speakers | |
f_1grams %>% | |
count(word, sort = TRUE) %>% | |
filter(n > 25) %>% | |
mutate(word = reorder(word, n)) %>% | |
ggplot(aes(word, n)) + | |
geom_col() + | |
xlab(NULL) + | |
coord_flip() | |
## 1grams for men | |
m_1grams <- | |
male_dfs_all %>% | |
unnest_tokens(word, txt) %>% | |
anti_join(stop_words) | |
## top 10 1grams by men | |
m_top10 <- m_1grams %>% | |
count(word, sort = TRUE) %>% | |
mutate(freq = n/as.integer(count(m_1grams))) %>% | |
head(10) | |
## Plot most commonly used words by male speakers | |
m_1grams %>% | |
count(word, sort = TRUE) %>% | |
filter(n > 50) %>% | |
mutate(word = reorder(word, n)) %>% | |
ggplot(aes(word, n)) + | |
geom_col() + | |
xlab(NULL) + | |
coord_flip() | |
## Genders UNITE! | |
genders_united <- bind_rows(mutate(f_1grams, gender = "F"), | |
mutate(m_1grams, gender = "M")) | |
genders_united %>% | |
count(word, sort = TRUE) %>% | |
filter(n > 50) %>% | |
mutate(word = reorder(word, n)) %>% | |
ggplot(aes(word, n)) + | |
geom_col() + | |
xlab(NULL) + | |
coord_flip() | |
## calculate tf_idf | |
gu_tf_idf <- genders_united %>% | |
count(gender, word) %>% | |
bind_tf_idf(word, gender, n) %>% | |
arrange(desc(tf_idf)) | |
## arrange by tf_idf | |
gu_tf_idf <- gu_tf_idf %>% | |
arrange(desc(tf_idf)) %>% | |
mutate(word = factor(word, levels = rev(unique(word)))) | |
## plot tf_idf by gender | |
gu_tf_idf %>% | |
group_by(gender) %>% | |
top_n(15) %>% | |
ungroup %>% | |
ggplot(aes(word, tf_idf, fill = gender)) + | |
geom_col(show.legend = FALSE) + | |
labs(x = NULL, y = "tf-idf") + | |
facet_wrap(~gender, ncol = 2, scales = "free") + | |
coord_flip() | |
## Sentiment | |
nrcjoy <- get_sentiments("nrc") %>% | |
filter(sentiment == "joy") | |
genders_united %>% | |
inner_join(nrcjoy) %>% | |
group_by(gender) %>% | |
count(word, sort = TRUE) | |
both_gender_word_counts <- genders_united %>% | |
group_by(gender) %>% | |
inner_join(get_sentiments("bing")) %>% | |
count(word, sentiment, sort = TRUE) | |
both_gender_word_counts %>% | |
filter(gender == 'F') %>% | |
group_by(sentiment) %>% | |
top_n(10) %>% | |
mutate(word = reorder(word, n)) %>% | |
ggplot(aes(word, n, fill = sentiment)) + | |
geom_col(show.legend = FALSE) + | |
facet_wrap(~sentiment, scales = "free_y") + | |
labs(y = "Contribution to sentiment", | |
x = NULL) + | |
coord_flip() | |
both_gender_word_counts %>% | |
filter(gender == 'M') %>% | |
group_by(sentiment) %>% | |
top_n(10) %>% | |
mutate(word = reorder(word, n)) %>% | |
ggplot(aes(word, n, fill = sentiment)) + | |
geom_col(show.legend = FALSE) + | |
facet_wrap(~sentiment, scales = "free_y") + | |
labs(y = "Contribution to sentiment", | |
x = NULL) + | |
coord_flip() | |
library(igraph) | |
frequency <- genders_united %>% | |
mutate(word = str_extract(word, "[a-z']+")) %>% | |
count(gender, word) %>% | |
mutate(proportion = n / sum(n)) | |
freq_for_graph <- frequency %>% | |
filter(proportion >= .002) %>% | |
graph_from_data_frame() | |
## Visualize a network graph of words by gender | |
library(ggraph) | |
set.seed(2121) | |
a <- grid::arrow(type = "closed", length = unit(.15, "inches")) | |
ggraph(freq_for_graph, layout = "fr") + | |
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, | |
arrow = a, end_cap = circle(.07, 'inches')) + | |
geom_node_point(color = "lightblue", size = 5) + | |
geom_node_text(aes(label = name), vjust = 1, hjust = 1) + | |
theme_void() | |
#### RANDOM STUFF #### | |
gender_sentiment <- genders_united %>% | |
inner_join(get_sentiments("bing")) %>% | |
count(gender, sentiment) %>% | |
spread(sentiment, n, fill = 0) %>% | |
mutate(sentiment = positive - negative) | |
frequency <- bind_rows(mutate(f_1grams, gender = "F"), | |
mutate(m_1grams, gender = "M"), | |
mutate(genders_united, gender="B")) %>% | |
mutate(word = str_extract(word, "[a-z']+")) %>% | |
count(gender, word) %>% | |
group_by(gender) %>% | |
mutate(proportion = n / sum(n)) %>% | |
select(-n) %>% | |
spread(gender, proportion) %>% | |
gather(gender, proportion, M:F) | |
library(scales) | |
# expect a warning about rows with missing values being removed | |
ggplot(frequency, aes(x = proportion, y = B, color = abs(B - proportion))) + | |
geom_abline(color = "gray40", lty = 2) + | |
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) + | |
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) + | |
scale_x_log10(labels = percent_format()) + | |
scale_y_log10(labels = percent_format()) + | |
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") + | |
facet_wrap(~gender, ncol = 2) + | |
theme(legend.position="none") + | |
labs(y = "Both Genders", x = NULL) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment