Skip to content

Instantly share code, notes, and snippets.

@tcash21
Created May 13, 2017 15:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tcash21/826e167d56e17b505cce519162c7028b to your computer and use it in GitHub Desktop.
Save tcash21/826e167d56e17b505cce519162c7028b to your computer and use it in GitHub Desktop.
OpenVisConf Transcript Analysis
library(rvest)
library(tidytext)
library(stringr)
library(purrr)
library(ggplot2)
library(tidyr)
options(stringsAsFactors = FALSE)
## use phantomjs to download the webpage content (javascript-generated)
system("phantomjs scrape.js")
h <- html("openviz.txt")
## get all <a> tags
links <- h %>% html_nodes('a')
## map them to a data.frame using purrr
links_df <-
links %>%
map(xml_attrs) %>%
map_df(~as.list(.))
## grep out just the transcript URLs
transcript_urls <- links_df[grep("transcripts", links_df$href),]$href
## store them into a list
base_url <- 'https://openvisconf.com'
transcripts <- paste0(base_url, transcript_urls)
transcript_txts<-sapply(transcripts, function(x) readLines(x, encoding = "UTF-8"))
## Assign genders to the speakers
genders<-data.frame(transcript = transcripts, gender="M")
genders[grep("Shirley|Amanda|Lisa|Julia|Amy|Amelia", genders$transcript),]$gender <- 'F'
## Remove this talk because it has male and female speakers and no way to differentiate
i <- grep("Ignazio", genders$transcript)
genders <- genders[-i,]
transcript_txts <- transcript_txts[-i]
## separate transcripts out by gender
female_dfs <- lapply(transcript_txts[genders$gender == 'F'], function(x) data_frame(txt=x))
male_dfs <- lapply(transcript_txts[genders$gender == 'M'], function(x) data_frame(txt=x))
female_dfs_all <- do.call('rbind', female_dfs)
male_dfs_all <- do.call('rbind', male_dfs)
data(stop_words)
## 1grams for women
f_1grams <-
female_dfs_all %>%
unnest_tokens(word, txt) %>%
anti_join(stop_words)
## top 10 1grams for women
f_top10 <- f_1grams %>%
count(word, sort = TRUE) %>%
mutate(freq = n/as.integer(count(f_1grams))) %>%
head(10)
## Plot most commonly used words by women speakers
f_1grams %>%
count(word, sort = TRUE) %>%
filter(n > 25) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
## 1grams for men
m_1grams <-
male_dfs_all %>%
unnest_tokens(word, txt) %>%
anti_join(stop_words)
## top 10 1grams by men
m_top10 <- m_1grams %>%
count(word, sort = TRUE) %>%
mutate(freq = n/as.integer(count(m_1grams))) %>%
head(10)
## Plot most commonly used words by male speakers
m_1grams %>%
count(word, sort = TRUE) %>%
filter(n > 50) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
## Genders UNITE!
genders_united <- bind_rows(mutate(f_1grams, gender = "F"),
mutate(m_1grams, gender = "M"))
genders_united %>%
count(word, sort = TRUE) %>%
filter(n > 50) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
## calculate tf_idf
gu_tf_idf <- genders_united %>%
count(gender, word) %>%
bind_tf_idf(word, gender, n) %>%
arrange(desc(tf_idf))
## arrange by tf_idf
gu_tf_idf <- gu_tf_idf %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word))))
## plot tf_idf by gender
gu_tf_idf %>%
group_by(gender) %>%
top_n(15) %>%
ungroup %>%
ggplot(aes(word, tf_idf, fill = gender)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~gender, ncol = 2, scales = "free") +
coord_flip()
## Sentiment
nrcjoy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
genders_united %>%
inner_join(nrcjoy) %>%
group_by(gender) %>%
count(word, sort = TRUE)
both_gender_word_counts <- genders_united %>%
group_by(gender) %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE)
both_gender_word_counts %>%
filter(gender == 'F') %>%
group_by(sentiment) %>%
top_n(10) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
both_gender_word_counts %>%
filter(gender == 'M') %>%
group_by(sentiment) %>%
top_n(10) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
library(igraph)
frequency <- genders_united %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(gender, word) %>%
mutate(proportion = n / sum(n))
freq_for_graph <- frequency %>%
filter(proportion >= .002) %>%
graph_from_data_frame()
## Visualize a network graph of words by gender
library(ggraph)
set.seed(2121)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(freq_for_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
#### RANDOM STUFF ####
gender_sentiment <- genders_united %>%
inner_join(get_sentiments("bing")) %>%
count(gender, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
frequency <- bind_rows(mutate(f_1grams, gender = "F"),
mutate(m_1grams, gender = "M"),
mutate(genders_united, gender="B")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(gender, word) %>%
group_by(gender) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(gender, proportion) %>%
gather(gender, proportion, M:F)
library(scales)
# expect a warning about rows with missing values being removed
ggplot(frequency, aes(x = proportion, y = B, color = abs(B - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~gender, ncol = 2) +
theme(legend.position="none") +
labs(y = "Both Genders", x = NULL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment