Last active: March 11, 2019 04:51
Save erikgregorywebb/50156c5f9eccb733cf5d4aeb6efc233f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# JPM shareholder letter text analysis
# Dependencies: scraping (rvest), wrangling (dplyr/tidyr/stringr),
# tokenization (tidytext), plotting (ggplot2/scales/plotly).
library(rvest)
library(dplyr)
library(tidytext)
library(ggplot2)
library(tidyr)
library(stringr)
library(scales)
library(plotly)

# Fixed seed for reproducible layout/sampling downstream.
set.seed(2016)
# Extract the letter body text from a JPM annual-report page.
#
# url: character scalar, the URL of the page to scrape.
# Returns a single character string: the text of the '.full-copyarea'
# node (NA if the selector matches nothing, per rvest's html_node()).
extract_text <- function(url) {
  page <- read_html(url)
  # Last expression is the return value; no explicit return() needed.
  page %>%
    html_node('.full-copyarea') %>%
    html_text()
}
# Corporate palette: rgb() builds "#RRGGBB" hex strings from 0-255
# components.
corp_color_1 <- rgb(109, 110, 106, maxColorValue = 255) # grey
corp_color_2 <- rgb(71, 143, 191, maxColorValue = 255)  # light blue
corp_color_3 <- rgb(166, 182, 64, maxColorValue = 255)  # green
# (0, 105, 163) is a dark blue; the original comment mislabeled it
# as "green".
corp_color_4 <- rgb(0, 105, 163, maxColorValue = 255)   # dark blue
# Scrape the shareholder letters. 2015-2017 each live on a single
# page; the 2014 letter is split across six section pages that are
# fetched and concatenated into one string.
letter_2017 <- extract_text('https://reports.jpmorganchase.com/investor-relations/2017/ar-ceo-letters.htm')
letter_2016 <- extract_text('https://www.jpmorganchase.com/corporate/annual-report/2016/')
letter_2015 <- extract_text('https://www.jpmorganchase.com/corporate/annual-report/2015/')

# Build the six 2014 section URLs from their slugs instead of six
# near-identical assignments.
slugs_2014 <- c('ar-introduction',
                'ar-outstanding-franchise',
                'ar-built-for-the-long-term',
                'ar-global-financial-architecture',
                'ar-solid-strategy',
                'ar-strong-corporate-culture')
urls_2014 <- paste0('https://www.jpmorganchase.com/corporate/annual-report/2014/',
                    slugs_2014, '.htm')
# vapply() guarantees exactly one character string per page.
letter_2014 <- paste(vapply(urls_2014, extract_text, character(1)),
                     collapse = ' ')
# Combine the four letters into tidy frames: one row per letter
# (`letters`) and one row per sentence (`letters_sentences`).
# NOTE(review): as_data_frame() is deprecated in tibble; tibble()
# (re-exported by dplyr) produces the same `value` column.
letter_list <- list('2017' = letter_2017,
                    '2016' = letter_2016,
                    '2015' = letter_2015,
                    '2014' = letter_2014)

letters <- bind_rows(lapply(names(letter_list), function(yr) {
  tibble(value = letter_list[[yr]], year = yr)
}))

# Sentence-tokenize each letter and number its sentences so that
# sentence_no restarts at 1 for every year.
letters_sentences <- bind_rows(lapply(names(letter_list), function(yr) {
  tibble(value = letter_list[[yr]], year = yr) %>%
    unnest_tokens(sentence, value, token = 'sentences') %>%
    mutate(sentence_no = row_number())
}))
# Tokenize sentences into single words, then drop stop words and any
# token that begins with a digit. Keeps year, sentence number, word.
letters_words = letters_sentences %>%
  unnest_tokens(word, sentence) %>%
  filter(!word %in% stop_words$word) %>%
  filter(!str_detect(word, '^[0-9]')) %>%
  select(year, sentence = sentence_no, word)
# Total vs. unique word counts per letter, plotted over time.
# summarise() with n()/n_distinct() replaces the original double
# count()/`nn` dance.
bind_rows(
  letters_words %>% group_by(year) %>%
    summarise(n = n()) %>% mutate(type = 'Total Words'),
  letters_words %>% group_by(year) %>%
    summarise(n = n_distinct(word)) %>% mutate(type = 'Unique Words')
) %>%
  ungroup() %>%
  mutate(year = as.numeric(year)) %>%
  ggplot(aes(x = year, y = n, color = type, text = year)) +
  geom_line(size = 3, linetype = 1) +
  # Plotly tooltip text; the original labeled both fields "letter".
  geom_point(aes(text = sprintf("Year: %s<br>Words: %s", year, n))) +
  scale_color_manual(values = c(corp_color_2, corp_color_3)) +
  # `labels` spelled out (original relied on partial matching of
  # `label`).
  scale_y_continuous(labels = comma, limits = c(0, 10000)) +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        legend.title = element_blank()) +
  labs(title = 'Total & Unique Number of Words',
       subtitle = 'Jamie Dimon Shareholder Letters, 2014-2017')
# Top 10 most common words overall (original comment said "top 20"
# but the code and title both use 10).
letters_words %>%
  count(word, sort = TRUE) %>%
  # Explicit wt avoids top_n()'s implicit-column message.
  top_n(n = 10, wt = n) %>%
  arrange(n) %>%
  # Freeze factor levels in ascending-count order so coord_flip()
  # shows the most common word on top.
  mutate(word = factor(word, unique(word))) %>%
  ggplot(aes(word, n)) +
  geom_col(show.legend = FALSE, fill = corp_color_2) +
  coord_flip() +
  theme_minimal() +
  theme(axis.title.y = element_blank()) +
  labs(y = 'Count',
       title = 'Top 10 Most Common Words',
       subtitle = 'Jamie Dimon Shareholder Letters, 2014-2017')
# Net (summed) sentiment per block of 20 sentences, faceted by year.
# NOTE(review): newer tidytext releases name the AFINN column `value`,
# not `score` — confirm against the installed version.
letters_words %>%
  inner_join(get_sentiments('afinn')) %>%
  group_by(year, index = sentence %/% 20) %>%
  summarise(sentiment = sum(score)) %>%
  ggplot(aes(index, sentiment, fill = year)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(corp_color_1, corp_color_2, corp_color_3, corp_color_4)) +
  # NOTE(review): the fixed y limits below override scales = "free_y",
  # so all panels share the same range.
  facet_wrap(~ year, ncol = 2, scales = "free_y") +
  scale_y_continuous(limits = c(-50, 75)) +
  theme_minimal() +
  labs(x = 'Text Block',
       y = 'Sentiment Score',
       # Title corrected: the value plotted is a sum, not an average.
       title = 'Net Sentiment by Text Block and Year',
       subtitle = 'Jamie Dimon Shareholder Letters, 2014-2017')
# Pull out sample text from sentences with extreme sentiment.
# Compute per-sentence scores once and reuse them for the lookups
# below (the original repeated this pipeline three times).
sentence_sentiment <- letters_words %>%
  inner_join(get_sentiments('afinn')) %>%
  group_by(year, sentence) %>%
  summarise(sentiment = sum(score)) %>%
  ungroup()

# BUG FIX: the original called bottom_n(5), which does not exist in
# dplyr. top_n() on the negated score returns the five lowest-scoring
# sentences, which matches the "bottom" intent — TODO confirm the
# author did not mean the five highest.
sentence_sentiment %>% top_n(5, -sentiment)

# Show the text of a flagged 2014 sentence.
letters_sentences %>%
  filter(year == '2014') %>%
  filter(sentence_no == 155) %>%
  select(sentence) %>% as.character()

# Three most negative sentences.
sentence_sentiment %>% top_n(3, -sentiment)

# Show the text of another flagged 2014 sentence.
letters_sentences %>%
  filter(year == '2014') %>%
  filter(sentence_no == 545) %>%
  select(sentence) %>% as.character()

# Sentiment score of that specific sentence.
sentence_sentiment %>%
  filter(year == '2014') %>%
  filter(sentence == 545)
# Top 10 positive and negative contributing words, 2017 letter.
letters_words %>%
  # year is stored as character; compare against a string for
  # consistency with every other filter in this script.
  filter(year == '2017') %>%
  inner_join(get_sentiments('bing')) %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(corp_color_2, corp_color_3)) +
  # Only two sentiment facets exist, so ncol = 2 is sufficient
  # (the original requested 4 columns).
  facet_wrap(~sentiment, scales = "free_y", ncol = 2) +
  coord_flip() +
  theme_minimal() +
  labs(x = NULL,
       y = 'Contribution to Sentiment',
       title = 'Top 10 Words by Contribution to Sentiment',
       subtitle = 'Jamie Dimon Shareholder 2017 Letter')
# Top contributing words to sentiment, faceted by year x polarity.
letters_words %>%
  inner_join(get_sentiments('bing')) %>%
  count(year, word, sentiment, sort = TRUE) %>%
  group_by(year, sentiment) %>%
  top_n(10, n) %>%
  ungroup() %>%
  # reorder_within() orders bars per facet; the original's plain
  # reorder() produced one global ordering that misorders most panels.
  mutate(word = reorder_within(word, n, list(year, sentiment))) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(corp_color_2, corp_color_3)) +
  # Strips the "___facet" suffix reorder_within() appends to labels.
  scale_x_reordered() +
  facet_wrap(year ~ sentiment, scales = "free_y", ncol = 4) +
  coord_flip() +
  theme_minimal() +
  labs(x = NULL,
       y = 'Contribution to Sentiment',
       title = 'Top 10 Words by Contribution to Sentiment by Year',
       subtitle = 'Jamie Dimon Shareholder Letters, 2014-2017')
# Single-word tf-idf, shown for the 2016 letter only.
letters_words %>%
  count(year, word, sort = TRUE) %>%
  bind_tf_idf(word, year, n) %>%
  arrange(desc(tf_idf)) %>%
  # Freeze factor levels so the flipped bars stay in tf-idf order.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(year) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  filter(year == '2016') %>%
  ggplot(aes(word, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(corp_color_4)) +
  facet_wrap(~year, ncol = 2, scales = "free") +
  coord_flip() +
  theme_minimal() +
  # Single labs() call (the original's first labs() was silently
  # overwritten by the second) and the 'td-idf' typo fixed.
  labs(x = NULL,
       y = 'tf-idf',
       title = 'Top 10 TF-IDF Scores',
       subtitle = 'Jamie Dimon Shareholder Letters, 2016')
# Bigram tf-idf for the 2017 letter.
letters_sentences %>%
  unnest_tokens(bigram, sentence, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  # Drop bigrams containing a stop word or a digit-leading token.
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  filter(!str_detect(word1, '^[0-9]'),
         !str_detect(word2, '^[0-9]')) %>%
  unite(bigram, word1, word2, sep = ' ') %>%
  count(year, bigram) %>%
  bind_tf_idf(bigram, year, n) %>%
  arrange(desc(tf_idf)) %>%
  filter(year == '2017') %>%
  ungroup() %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(year) %>%
  top_n(9, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(corp_color_3)) +
  facet_wrap(~year, ncol = 2, scales = "free") +
  coord_flip() +
  theme_minimal() +
  # Single labs() call; y label fixed from 'td-idf'.
  labs(x = NULL,
       y = 'tf-idf',
       title = 'Top 9 Bigram TF-IDF Scores',
       subtitle = 'Jamie Dimon Shareholder Letters, 2017')
# Trigram tf-idf across all four letters, faceted by year.
letters_sentences %>%
  unnest_tokens(trigram, sentence, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  # Drop trigrams containing a stop word or a digit-leading token.
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  filter(!str_detect(word1, '^[0-9]'),
         !str_detect(word2, '^[0-9]'),
         !str_detect(word3, '^[0-9]')) %>%
  unite(trigram, word1, word2, word3, sep = ' ') %>%
  count(year, trigram) %>%
  bind_tf_idf(trigram, year, n) %>%
  arrange(desc(tf_idf)) %>%
  ungroup() %>%
  # Fixed level order keeps the flipped bars sorted by tf-idf.
  mutate(trigram = factor(trigram, levels = rev(unique(trigram)))) %>%
  group_by(year) %>%
  top_n(8) %>%
  ungroup() %>%
  ggplot(aes(trigram, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~year, ncol = 2, scales = "free") +
  coord_flip()
# Most frequent bigrams by year.
letters_sentences %>%
  unnest_tokens(bigram, sentence, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  filter(!str_detect(word1, '^[0-9]'),
         !str_detect(word2, '^[0-9]')) %>%
  # "jpmorgan chase" would dominate every panel; exclude it.
  filter(word1 != 'jpmorgan', word2 != 'chase') %>%
  unite(bigram, word1, word2, sep = ' ') %>%
  count(year, bigram) %>%
  arrange(desc(n)) %>%
  ungroup() %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(year) %>%
  top_n(8, n) %>%
  ungroup() %>%
  ggplot(aes(bigram, n, fill = year)) +
  geom_col(show.legend = FALSE) +
  # Axis label fixed: this chart plots raw counts, not tf-idf.
  labs(x = NULL, y = "Count") +
  facet_wrap(~year, ncol = 2, scales = "free") +
  coord_flip()
# Most frequent trigrams by year.
letters_sentences %>%
  unnest_tokens(trigram, sentence, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  filter(!str_detect(word1, '^[0-9]'),
         !str_detect(word2, '^[0-9]'),
         !str_detect(word3, '^[0-9]')) %>%
  unite(trigram, word1, word2, word3, sep = ' ') %>%
  count(year, trigram) %>%
  arrange(desc(n)) %>%
  ungroup() %>%
  mutate(trigram = factor(trigram, levels = rev(unique(trigram)))) %>%
  group_by(year) %>%
  top_n(8, n) %>%
  ungroup() %>%
  ggplot(aes(trigram, n, fill = year)) +
  geom_col(show.legend = FALSE) +
  # Axis label fixed: this chart plots raw counts, not tf-idf.
  labs(x = NULL, y = "Count") +
  facet_wrap(~year, ncol = 2, scales = "free") +
  coord_flip()
library(igraph) | |
library(ggraph) | |
# Bigram co-occurrence network (final figure): nodes are words,
# edges are bigrams seen more than eight times, with edge opacity
# and width encoding frequency.
p10 = letters_sentences %>%
  unnest_tokens(bigram, sentence, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  # Same cleaning as the count plots: no stop words, no tokens that
  # start with a digit, and the firm's own name excluded.
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  filter(!str_detect(word1, '^[0-9]'),
         !str_detect(word2, '^[0-9]')) %>%
  filter(word1 != 'jpmorgan', word2 != 'chase') %>%
  count(word1, word2, sort = TRUE) %>%
  filter(n > 8) %>%
  graph_from_data_frame() %>%
  # Fruchterman-Reingold force-directed layout.
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = corp_color_3) +
  geom_node_point(size = 2) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment