# earino/example_aoc.R

## example_aoc.R
library(twitteR)
library(tidyverse)
library(tidytext)


# Authenticate against the Twitter API. All four credentials are read from
# environment variables so that no secret is stored in the script.
twitter_credentials <- Sys.getenv(c(
  "TWITTER_CONSUMER_KEY",
  "TWITTER_CONSUMER_SECRET",
  "TWITTER_ACCESS_TOKEN",
  "TWITTER_ACCESS_SECRET"
))

# Sys.getenv() returns "" for variables that are not set. Stop here with a
# message naming the missing ones instead of letting the OAuth handshake fail
# later with a less helpful error.
missing_credentials <- names(twitter_credentials)[!nzchar(twitter_credentials)]
if (length(missing_credentials) > 0) {
  stop(
    "Missing Twitter credentials in environment: ",
    paste(missing_credentials, collapse = ", "),
    call. = FALSE
  )
}

setup_twitter_oauth(
  consumer_key = twitter_credentials[["TWITTER_CONSUMER_KEY"]],
  consumer_secret = twitter_credentials[["TWITTER_CONSUMER_SECRET"]],
  access_token = twitter_credentials[["TWITTER_ACCESS_TOKEN"]],
  access_secret = twitter_credentials[["TWITTER_ACCESS_SECRET"]]
)

# Request the timelines of both accounts, asking for 3200 tweets each.
trump <- userTimeline("realDonaldTrump", n = 3200)
aoc <- userTimeline("AOC", n = 3200)

# Convert each list of statuses to a data frame and stack the two tables.
raw_tweets <- bind_rows(lapply(list(trump, aoc), twListToDF))

# Make tidytext's stop-word lexicon available as `stop_words`.
data("stop_words")

# Extra tokens to discard; presumably what is left over from tweet URLs
# ("https", "t.co") and from the HTML entity "&amp;".
# tibble() replaces the deprecated data_frame().
words_to_ignore <- tibble(word = c("https", "amp", "t.co"))

# One row per token of every tweet (token stored in column `word`), minus
# stop words, tokens containing a digit, and the Twitter artefacts above.
words <- raw_tweets %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, "\\d")) %>%
  anti_join(words_to_ignore, by = "word")

# Build the modelling table: one row per tweet, one 0/1 indicator column per
# word (1 = the tweet contains the word) and the response `tweet_by_trump`
# (1 = tweeted by realDonaldTrump, 0 = otherwise).
tweets <- words %>%
  # A word counts once per tweet, however often it is repeated.
  distinct(screenName, id, word) %>%
  # Derive the response and park the tweet id under a dotted name *before*
  # widening. Every word becomes a column name, so a tweet containing the
  # token "id" (e.g. "voter id") would otherwise clash with the `id` column.
  transmute(
    .tweet_id = id,
    tweet_by_trump = as.integer(screenName == "realDonaldTrump"),
    word,
    present = 1
  ) %>%
  # Sorting first makes the word columns come out in sorted order,
  # independent of the order in which the tweets were downloaded.
  arrange(word) %>%
  pivot_wider(
    names_from = word,
    values_from = present,
    values_fill = list(present = 0)
  ) %>%
  # The id only served to tell tweets apart; it is not a predictor.
  select(-.tweet_id)

library(glmnet)

# Cross-validated glmnet fit of a binomial model: every word-indicator column
# of `tweets` is a predictor, `tweet_by_trump` is the 0/1 response.
fit <- cv.glmnet(
  x = as.matrix(select(tweets, -tweet_by_trump)),
  y = tweets[["tweet_by_trump"]],
  family = "binomial"
)

# Read the coefficients off at a fixed penalty, lambda = exp(-3), and convert
# the result to an ordinary one-column matrix whose row names are the terms.
temp <- as.matrix(coef(fit, s = exp(-3)))
coefficients <- data.frame(word = rownames(temp), beta = temp[, 1])

# Keep only the words the model actually uses (non-zero coefficient, no
# intercept), order them from largest to smallest beta, and number them so `i`
# can serve as the plotting position.
data <- coefficients %>%
  filter(beta != 0, word != "(Intercept)") %>%
  arrange(desc(beta)) %>%
  mutate(i = seq_len(n()))

# Horizontal bar chart of the selected words: bar length is the coefficient,
# colour tells which account a positive / negative coefficient points to.
ggplot(data, aes(x = i, y = beta, fill = ifelse(beta > 0, "Trump", "AOC"))) +
  geom_col(alpha = 0.75) +
  # `i` is only a position; label every bar with its word instead.
  scale_x_continuous(breaks = data$i, labels = data$word, minor_breaks = NULL) +
  xlab("") +
  ylab("Coefficient Estimate") +
  coord_flip() +
  scale_fill_manual(
    guide = guide_legend(title = "Word typically used by:"),
    # Named values tie each colour to its account. An unnamed vector is
    # matched by level order, so a plot with only "Trump" bars would be drawn
    # in the AOC colour.
    values = c(AOC = "#446093", Trump = "#bc3939")
  ) +
  theme_bw() +
  theme(legend.position = "top")

library(wordcloud)

# Word cloud of realDonaldTrump's word frequencies, limited to 20 words.
trump_word_counts <- count(filter(words, screenName == "realDonaldTrump"), word)
wordcloud(trump_word_counts$word, trump_word_counts$n, max.words = 20)

# Word cloud of AOC's word frequencies, limited to 10 words.
aoc_word_counts <- count(filter(words, screenName == "AOC"), word)
wordcloud(aoc_word_counts$word, aoc_word_counts$n, max.words = 10)

# Tweet timeline: one point per tweet at its creation time, one band per
# account. width = 0 switches off horizontal jitter so points keep their true
# time; only the vertical position is jittered to reduce overplotting.
ggplot(raw_tweets, aes(created, screenName)) +
  geom_jitter(width = 0) +
  labs(x = "", y = "") +
  theme_bw()
	library(twitteR)
	library(tidyverse)
	library(tidytext)


	# NOTE(review): from the library() calls just above, the file repeats the
	# script that precedes it statement for statement (only tab-indented).
	# Running the file end to end therefore performs every step twice.
	#
	# Authenticate against the Twitter API. All four credentials are read from
	# environment variables so that no secret is stored in the script.
	twitter_credentials <- Sys.getenv(c(
	  "TWITTER_CONSUMER_KEY",
	  "TWITTER_CONSUMER_SECRET",
	  "TWITTER_ACCESS_TOKEN",
	  "TWITTER_ACCESS_SECRET"
	))

	# Sys.getenv() returns "" for variables that are not set. Stop here with a
	# message naming the missing ones instead of letting the OAuth handshake
	# fail later with a less helpful error.
	missing_credentials <- names(twitter_credentials)[!nzchar(twitter_credentials)]
	if (length(missing_credentials) > 0) {
	  stop(
	    "Missing Twitter credentials in environment: ",
	    paste(missing_credentials, collapse = ", "),
	    call. = FALSE
	  )
	}

	setup_twitter_oauth(
	  consumer_key = twitter_credentials[["TWITTER_CONSUMER_KEY"]],
	  consumer_secret = twitter_credentials[["TWITTER_CONSUMER_SECRET"]],
	  access_token = twitter_credentials[["TWITTER_ACCESS_TOKEN"]],
	  access_secret = twitter_credentials[["TWITTER_ACCESS_SECRET"]]
	)

	# Request the timelines of both accounts, asking for 3200 tweets each.
	trump <- userTimeline("realDonaldTrump", n = 3200)
	aoc <- userTimeline("AOC", n = 3200)

	# Convert each list of statuses to a data frame and stack the two tables.
	raw_tweets <- bind_rows(lapply(list(trump, aoc), twListToDF))

	# Make tidytext's stop-word lexicon available as `stop_words`.
	data("stop_words")

	# Extra tokens to discard; presumably what is left over from tweet URLs
	# ("https", "t.co") and from the HTML entity "&amp;".
	# tibble() replaces the deprecated data_frame().
	words_to_ignore <- tibble(word = c("https", "amp", "t.co"))

	# One row per token of every tweet (token stored in column `word`), minus
	# stop words, tokens containing a digit, and the Twitter artefacts above.
	words <- raw_tweets %>%
	  unnest_tokens(word, text) %>%
	  anti_join(stop_words, by = "word") %>%
	  filter(!str_detect(word, "\\d")) %>%
	  anti_join(words_to_ignore, by = "word")

	# Build the modelling table: one row per tweet, one 0/1 indicator column
	# per word (1 = the tweet contains the word) and the response
	# `tweet_by_trump` (1 = tweeted by realDonaldTrump, 0 = otherwise).
	tweets <- words %>%
	  # A word counts once per tweet, however often it is repeated.
	  distinct(screenName, id, word) %>%
	  # Derive the response and park the tweet id under a dotted name *before*
	  # widening. Every word becomes a column name, so a tweet containing the
	  # token "id" (e.g. "voter id") would otherwise clash with the `id` column.
	  transmute(
	    .tweet_id = id,
	    tweet_by_trump = as.integer(screenName == "realDonaldTrump"),
	    word,
	    present = 1
	  ) %>%
	  # Sorting first makes the word columns come out in sorted order,
	  # independent of the order in which the tweets were downloaded.
	  arrange(word) %>%
	  pivot_wider(
	    names_from = word,
	    values_from = present,
	    values_fill = list(present = 0)
	  ) %>%
	  # The id only served to tell tweets apart; it is not a predictor.
	  select(-.tweet_id)

	library(glmnet)

	# Cross-validated glmnet fit of a binomial model: every word-indicator
	# column of `tweets` is a predictor, `tweet_by_trump` is the 0/1 response.
	fit <- cv.glmnet(
	  x = as.matrix(select(tweets, -tweet_by_trump)),
	  y = tweets[["tweet_by_trump"]],
	  family = "binomial"
	)

	# Read the coefficients off at a fixed penalty, lambda = exp(-3), and
	# convert the result to an ordinary one-column matrix whose row names are
	# the terms.
	temp <- as.matrix(coef(fit, s = exp(-3)))
	coefficients <- data.frame(word = rownames(temp), beta = temp[, 1])

	# Keep only the words the model actually uses (non-zero coefficient, no
	# intercept), order them from largest to smallest beta, and number them so
	# `i` can serve as the plotting position.
	data <- coefficients %>%
	  filter(beta != 0, word != "(Intercept)") %>%
	  arrange(desc(beta)) %>%
	  mutate(i = seq_len(n()))

	# Horizontal bar chart of the selected words: bar length is the
	# coefficient, colour tells which account a positive / negative coefficient
	# points to.
	ggplot(data, aes(x = i, y = beta, fill = ifelse(beta > 0, "Trump", "AOC"))) +
	  geom_col(alpha = 0.75) +
	  # `i` is only a position; label every bar with its word instead.
	  scale_x_continuous(breaks = data$i, labels = data$word, minor_breaks = NULL) +
	  xlab("") +
	  ylab("Coefficient Estimate") +
	  coord_flip() +
	  scale_fill_manual(
	    guide = guide_legend(title = "Word typically used by:"),
	    # Named values tie each colour to its account. An unnamed vector is
	    # matched by level order, so a plot with only "Trump" bars would be
	    # drawn in the AOC colour.
	    values = c(AOC = "#446093", Trump = "#bc3939")
	  ) +
	  theme_bw() +
	  theme(legend.position = "top")

	library(wordcloud)

	# Word cloud of realDonaldTrump's word frequencies, limited to 20 words.
	trump_word_counts <- count(filter(words, screenName == "realDonaldTrump"), word)
	wordcloud(trump_word_counts$word, trump_word_counts$n, max.words = 20)

	# Word cloud of AOC's word frequencies, limited to 10 words.
	aoc_word_counts <- count(filter(words, screenName == "AOC"), word)
	wordcloud(aoc_word_counts$word, aoc_word_counts$n, max.words = 10)

	# Tweet timeline: one point per tweet at its creation time, one band per
	# account. width = 0 switches off horizontal jitter so points keep their
	# true time; only the vertical position is jittered to reduce overplotting.
	ggplot(raw_tweets, aes(created, screenName)) +
	  geom_jitter(width = 0) +
	  labs(x = "", y = "") +
	  theme_bw()