tcash21/full_openvis_analysis.R

## full_openvis_analysis.R
library(rvest)
library(tidytext)
library(stringr)
library(purrr)
library(ggplot2)
library(tidyr)

options(stringsAsFactors = FALSE)

## use phantomjs to download the webpage content (javascript-generated)
system("phantomjs scrape.js")
h <- html("openviz.txt")

## get all <a> tags
links <- h %>% html_nodes('a')

## map them to a data.frame using purrr
links_df <-
  links %>%
  map(xml_attrs) %>%
  map_df(~as.list(.))

## grep out just the transcript URLs
transcript_urls <- links_df[grep("transcripts", links_df$href),]$href

## store them into a list
base_url <- 'https://openvisconf.com'
transcripts <- paste0(base_url, transcript_urls)
transcript_txts<-sapply(transcripts, function(x) readLines(x, encoding = "UTF-8"))

## Assign genders to the speakers
genders<-data.frame(transcript = transcripts, gender="M")
genders[grep("Shirley|Amanda|Lisa|Julia|Amy|Amelia", genders$transcript),]$gender <- 'F'

## Remove this talk because it has male and female speakers and no way to differentiate
i <- grep("Ignazio", genders$transcript)
genders <- genders[-i,]
transcript_txts <- transcript_txts[-i]

## separate transcripts out by gender
female_dfs <- lapply(transcript_txts[genders$gender == 'F'], function(x) data_frame(txt=x))
male_dfs <- lapply(transcript_txts[genders$gender == 'M'], function(x) data_frame(txt=x))

female_dfs_all <- do.call('rbind', female_dfs)
male_dfs_all <- do.call('rbind', male_dfs)

data(stop_words)

## 1grams for women
f_1grams <-
  female_dfs_all %>%
  unnest_tokens(word, txt) %>%
  anti_join(stop_words)

## top 10 1grams for women
f_top10 <- f_1grams %>%
  count(word, sort = TRUE) %>%
  mutate(freq = n/as.integer(count(f_1grams))) %>%
  head(10)

## Plot most commonly used words by women speakers
f_1grams %>%
  count(word, sort = TRUE) %>%
  filter(n > 25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

## 1grams for men
m_1grams <-
  male_dfs_all %>%
  unnest_tokens(word, txt) %>%
  anti_join(stop_words)

## top 10 1grams by men
m_top10 <- m_1grams %>%
  count(word, sort = TRUE) %>%
  mutate(freq = n/as.integer(count(m_1grams))) %>%
  head(10)

## Plot most commonly used words by male speakers
m_1grams %>%
  count(word, sort = TRUE) %>%
  filter(n > 50) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

## Genders UNITE!
genders_united <- bind_rows(mutate(f_1grams, gender = "F"),
                            mutate(m_1grams, gender = "M"))

genders_united %>%
  count(word, sort = TRUE) %>%
  filter(n > 50) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

## calculate tf_idf
gu_tf_idf <- genders_united %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))

## arrange by tf_idf
gu_tf_idf <- gu_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word))))

## plot tf_idf by gender
gu_tf_idf %>%
  group_by(gender) %>%
  top_n(15) %>%
  ungroup %>%
  ggplot(aes(word, tf_idf, fill = gender)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~gender, ncol = 2, scales = "free") +
  coord_flip()


## Sentiment
nrcjoy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

genders_united %>%
  inner_join(nrcjoy) %>%
  group_by(gender) %>%
  count(word, sort = TRUE)

both_gender_word_counts <- genders_united %>%
  group_by(gender) %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)

both_gender_word_counts %>%
  filter(gender == 'F') %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

both_gender_word_counts %>%
  filter(gender == 'M') %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

library(igraph)

frequency <- genders_united %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(gender, word) %>%
  mutate(proportion = n / sum(n))

freq_for_graph <- frequency %>%
  filter(proportion >= .002) %>%
  graph_from_data_frame()

## Visualize a network graph of words by gender
library(ggraph)
set.seed(2121)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(freq_for_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()


#### RANDOM STUFF ####

gender_sentiment <- genders_united %>%
  inner_join(get_sentiments("bing")) %>%
  count(gender, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)


frequency <- bind_rows(mutate(f_1grams, gender = "F"),
                       mutate(m_1grams, gender = "M"),
                       mutate(genders_united, gender="B")) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  spread(gender, proportion) %>%
  gather(gender, proportion, M:F)

library(scales)

# expect a warning about rows with missing values being removed
ggplot(frequency, aes(x = proportion, y = B, color = abs(B - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  facet_wrap(~gender, ncol = 2) +
  theme(legend.position="none") +
  labs(y = "Both Genders", x = NULL)
	library(rvest)
	library(tidytext)
	library(stringr)
	library(purrr)
	library(ggplot2)
	library(tidyr)

	options(stringsAsFactors = FALSE)

	## use phantomjs to download the webpage content (javascript-generated)
	system("phantomjs scrape.js")
	h <- html("openviz.txt")

	## get all <a> tags
	links <- h %>% html_nodes('a')

	## map them to a data.frame using purrr
	links_df <-
	links %>%
	map(xml_attrs) %>%
	map_df(~as.list(.))

	## grep out just the transcript URLs
	transcript_urls <- links_df[grep("transcripts", links_df$href),]$href

	## store them into a list
	base_url <- 'https://openvisconf.com'
	transcripts <- paste0(base_url, transcript_urls)
	transcript_txts<-sapply(transcripts, function(x) readLines(x, encoding = "UTF-8"))

	## Assign genders to the speakers
	genders<-data.frame(transcript = transcripts, gender="M")
	genders[grep("Shirley\|Amanda\|Lisa\|Julia\|Amy\|Amelia", genders$transcript),]$gender <- 'F'

	## Remove this talk because it has male and female speakers and no way to differentiate
	i <- grep("Ignazio", genders$transcript)
	genders <- genders[-i,]
	transcript_txts <- transcript_txts[-i]

	## separate transcripts out by gender
	female_dfs <- lapply(transcript_txts[genders$gender == 'F'], function(x) data_frame(txt=x))
	male_dfs <- lapply(transcript_txts[genders$gender == 'M'], function(x) data_frame(txt=x))

	female_dfs_all <- do.call('rbind', female_dfs)
	male_dfs_all <- do.call('rbind', male_dfs)

	data(stop_words)

	## 1grams for women
	f_1grams <-
	female_dfs_all %>%
	unnest_tokens(word, txt) %>%
	anti_join(stop_words)

	## top 10 1grams for women
	f_top10 <- f_1grams %>%
	count(word, sort = TRUE) %>%
	mutate(freq = n/as.integer(count(f_1grams))) %>%
	head(10)

	## Plot most commonly used words by women speakers
	f_1grams %>%
	count(word, sort = TRUE) %>%
	filter(n > 25) %>%
	mutate(word = reorder(word, n)) %>%
	ggplot(aes(word, n)) +
	geom_col() +
	xlab(NULL) +
	coord_flip()

	## 1grams for men
	m_1grams <-
	male_dfs_all %>%
	unnest_tokens(word, txt) %>%
	anti_join(stop_words)

	## top 10 1grams by men
	m_top10 <- m_1grams %>%
	count(word, sort = TRUE) %>%
	mutate(freq = n/as.integer(count(m_1grams))) %>%
	head(10)

	## Plot most commonly used words by male speakers
	m_1grams %>%
	count(word, sort = TRUE) %>%
	filter(n > 50) %>%
	mutate(word = reorder(word, n)) %>%
	ggplot(aes(word, n)) +
	geom_col() +
	xlab(NULL) +
	coord_flip()

	## Genders UNITE!
	genders_united <- bind_rows(mutate(f_1grams, gender = "F"),
	mutate(m_1grams, gender = "M"))

	genders_united %>%
	count(word, sort = TRUE) %>%
	filter(n > 50) %>%
	mutate(word = reorder(word, n)) %>%
	ggplot(aes(word, n)) +
	geom_col() +
	xlab(NULL) +
	coord_flip()

	## calculate tf_idf
	gu_tf_idf <- genders_united %>%
	count(gender, word) %>%
	bind_tf_idf(word, gender, n) %>%
	arrange(desc(tf_idf))

	## arrange by tf_idf
	gu_tf_idf <- gu_tf_idf %>%
	arrange(desc(tf_idf)) %>%
	mutate(word = factor(word, levels = rev(unique(word))))

	## plot tf_idf by gender
	gu_tf_idf %>%
	group_by(gender) %>%
	top_n(15) %>%
	ungroup %>%
	ggplot(aes(word, tf_idf, fill = gender)) +
	geom_col(show.legend = FALSE) +
	labs(x = NULL, y = "tf-idf") +
	facet_wrap(~gender, ncol = 2, scales = "free") +
	coord_flip()


	## Sentiment
	nrcjoy <- get_sentiments("nrc") %>%
	filter(sentiment == "joy")

	genders_united %>%
	inner_join(nrcjoy) %>%
	group_by(gender) %>%
	count(word, sort = TRUE)

	both_gender_word_counts <- genders_united %>%
	group_by(gender) %>%
	inner_join(get_sentiments("bing")) %>%
	count(word, sentiment, sort = TRUE)

	both_gender_word_counts %>%
	filter(gender == 'F') %>%
	group_by(sentiment) %>%
	top_n(10) %>%
	mutate(word = reorder(word, n)) %>%
	ggplot(aes(word, n, fill = sentiment)) +
	geom_col(show.legend = FALSE) +
	facet_wrap(~sentiment, scales = "free_y") +
	labs(y = "Contribution to sentiment",
	x = NULL) +
	coord_flip()

	both_gender_word_counts %>%
	filter(gender == 'M') %>%
	group_by(sentiment) %>%
	top_n(10) %>%
	mutate(word = reorder(word, n)) %>%
	ggplot(aes(word, n, fill = sentiment)) +
	geom_col(show.legend = FALSE) +
	facet_wrap(~sentiment, scales = "free_y") +
	labs(y = "Contribution to sentiment",
	x = NULL) +
	coord_flip()

	library(igraph)

	frequency <- genders_united %>%
	mutate(word = str_extract(word, "[a-z']+")) %>%
	count(gender, word) %>%
	mutate(proportion = n / sum(n))

	freq_for_graph <- frequency %>%
	filter(proportion >= .002) %>%
	graph_from_data_frame()

	## Visualize a network graph of words by gender
	library(ggraph)
	set.seed(2121)

	a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

	ggraph(freq_for_graph, layout = "fr") +
	geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
	arrow = a, end_cap = circle(.07, 'inches')) +
	geom_node_point(color = "lightblue", size = 5) +
	geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
	theme_void()






	#### RANDOM STUFF ####

	gender_sentiment <- genders_united %>%
	inner_join(get_sentiments("bing")) %>%
	count(gender, sentiment) %>%
	spread(sentiment, n, fill = 0) %>%
	mutate(sentiment = positive - negative)



	frequency <- bind_rows(mutate(f_1grams, gender = "F"),
	mutate(m_1grams, gender = "M"),
	mutate(genders_united, gender="B")) %>%
	mutate(word = str_extract(word, "[a-z']+")) %>%
	count(gender, word) %>%
	group_by(gender) %>%
	mutate(proportion = n / sum(n)) %>%
	select(-n) %>%
	spread(gender, proportion) %>%
	gather(gender, proportion, M:F)

	library(scales)

	# expect a warning about rows with missing values being removed
	ggplot(frequency, aes(x = proportion, y = B, color = abs(B - proportion))) +
	geom_abline(color = "gray40", lty = 2) +
	geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
	geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
	scale_x_log10(labels = percent_format()) +
	scale_y_log10(labels = percent_format()) +
	scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
	facet_wrap(~gender, ncol = 2) +
	theme(legend.position="none") +
	labs(y = "Both Genders", x = NULL)