Skip to content

Instantly share code, notes, and snippets.

@erikgregorywebb
Last active March 11, 2019 04:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erikgregorywebb/50156c5f9eccb733cf5d4aeb6efc233f to your computer and use it in GitHub Desktop.
# jpm shareholder letter text analysis
library(rvest)
library(dplyr)
library(tidytext)
library(ggplot2)
library(tidyr)
library(stringr)
library(scales)
library(plotly)
set.seed(2016)

# Fetch a page and return the text content of its main copy area
# (the '.full-copyarea' node used by jpmorganchase.com annual-report pages).
extract_text <- function(url) {
  read_html(url) %>%
    html_node('.full-copyarea') %>%
    html_text()
}
# colors
corp_color_1 = rgb(109, 110, 106, maxColorValue = 255) # grey
corp_color_2 = rgb(71, 143, 191, maxColorValue = 255) # light blue
corp_color_3 = rgb(166, 182, 64, maxColorValue = 255) # green
corp_color_4 = rgb(0, 105, 163, maxColorValue = 255) # green
# extract text, 2015-2017: each of these letters lives on a single page
# NOTE(review): every call below performs a live HTTP request; the URLs date
# from 2019 and may have moved — verify before rerunning
letter_2017 = extract_text('https://reports.jpmorganchase.com/investor-relations/2017/ar-ceo-letters.htm')
letter_2016 = extract_text('https://www.jpmorganchase.com/corporate/annual-report/2016/')
letter_2015 = extract_text('https://www.jpmorganchase.com/corporate/annual-report/2015/')
# extract text, 2014: this letter is split across six pages, fetched separately
letter_2014_1 = extract_text('https://www.jpmorganchase.com/corporate/annual-report/2014/ar-introduction.htm')
letter_2014_2 = extract_text('https://www.jpmorganchase.com/corporate/annual-report/2014/ar-outstanding-franchise.htm')
letter_2014_3 = extract_text('https://www.jpmorganchase.com/corporate/annual-report/2014/ar-built-for-the-long-term.htm')
letter_2014_4 = extract_text('https://www.jpmorganchase.com/corporate/annual-report/2014/ar-global-financial-architecture.htm')
letter_2014_5 = extract_text('https://www.jpmorganchase.com/corporate/annual-report/2014/ar-solid-strategy.htm')
letter_2014_6 = extract_text('https://www.jpmorganchase.com/corporate/annual-report/2014/ar-strong-corporate-culture.htm')
# stitch the six 2014 sections back into one document string
letter_2014 = c(letter_2014_1, letter_2014_2, letter_2014_3, letter_2014_4, letter_2014_5, letter_2014_6)
letter_2014 = paste(letter_2014, collapse = ' ')
# combine the four letters into one data frame, one row per letter, with a
# 'value' text column and a character 'year' column.
# tibble(value = x, year = y) replaces the deprecated as_data_frame() %>% mutate()
# while keeping the same column names downstream code relies on.
# NOTE(review): 'letters' shadows base R's built-in letters constant.
letters <- bind_rows(
  tibble(value = letter_2017, year = '2017'),
  tibble(value = letter_2016, year = '2016'),
  tibble(value = letter_2015, year = '2015'),
  tibble(value = letter_2014, year = '2014'))
# split each letter into sentences, numbered within each letter
# (sentence_no restarts at 1 per year so sentences can be looked up later).
# tibble(value = x, year = y) replaces the deprecated as_data_frame() %>% mutate().
letters_sentences <- bind_rows(
  tibble(value = letter_2017, year = '2017') %>%
    unnest_tokens(sentence, value, token = 'sentences') %>%
    mutate(sentence_no = row_number()),
  tibble(value = letter_2016, year = '2016') %>%
    unnest_tokens(sentence, value, token = 'sentences') %>%
    mutate(sentence_no = row_number()),
  tibble(value = letter_2015, year = '2015') %>%
    unnest_tokens(sentence, value, token = 'sentences') %>%
    mutate(sentence_no = row_number()),
  tibble(value = letter_2014, year = '2014') %>%
    unnest_tokens(sentence, value, token = 'sentences') %>%
    mutate(sentence_no = row_number()))
# tokenize sentences into words: one row per (year, sentence, word),
# with stop words and tokens starting with a digit removed
letters_words <- letters_sentences %>%
  unnest_tokens(word, sentence) %>%
  anti_join(stop_words) %>%
  filter(!grepl('^[0-9]', word)) %>%
  select(year, sentence = sentence_no, word)
# number of words and unique words by year, plotted as two lines.
# n()/n_distinct() replace the original double-count(), which depended on
# dplyr's fragile n -> nn auto-renaming when counting an existing 'n' column.
bind_rows(
  letters_words %>% group_by(year) %>% summarise(n = n()) %>% mutate(type = 'Total Words'),
  letters_words %>% group_by(year) %>% summarise(n = n_distinct(word)) %>% mutate(type = 'Unique Words')
) %>%
  ungroup() %>%
  mutate(year = as.numeric(year)) %>%
  ggplot(., aes(x = year, y = n, color = type, text = year)) +
  geom_line(size = 3, linetype = 1) +
  # tooltip previously read 'letter: ... <br> Letter: ...'; label year and count correctly
  geom_point(aes(text = sprintf("Year: %s<br>Words: %s", year, n))) +
  scale_color_manual(values = c(corp_color_2, corp_color_3)) +
  scale_y_continuous(label = comma, limits = c(0, 10000)) +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        legend.title = element_blank()) +
  labs(title = 'Total & Unique Number of Words',
       subtitle = 'Jamie Dimon Shareholder Letters, 2014-2017')
# top 10 most common words across all letters (original header said "top 20",
# but the code and chart title are top 10)
letters_words %>%
  count(word, sort = TRUE) %>%
  top_n(n = 10) %>%
  arrange(n) %>%
  mutate(word = factor(word, unique(word))) %>%
  ggplot(aes(word, n)) +
  geom_col(show.legend = FALSE, fill = corp_color_2) +
  coord_flip() +
  theme_minimal() +
  theme(axis.title.y = element_blank()) +
  labs(y = 'Count',
       title = 'Top 10 Most Common Words',
       subtitle = 'Jamie Dimon Shareholder Letters, 2014-2017')
# sentence-level sentiment: AFINN score summed per block of 20 sentences,
# faceted by year
# NOTE(review): this plots a SUM per block, not an average, despite the chart
# title — confirm which metric was intended
# NOTE(review): 'score' is the AFINN column name in older tidytext; newer
# releases renamed it to 'value' — verify against the installed version
letters_words %>%
inner_join(get_sentiments('afinn')) %>%
group_by(year, index = sentence %/% 20) %>%
summarise(sentiment = sum(score)) %>%
ggplot(., aes(index, sentiment, fill = year)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c(corp_color_1, corp_color_2, corp_color_3, corp_color_4)) +
facet_wrap(~ year, ncol = 2, scales = "free_y") +
scale_y_continuous(limits = c(-50, 75)) +
theme_minimal() +
labs(x = 'Text Block',
y = 'Sentiment Score',
title = 'Average Sentiment over Block by Year',
subtitle = 'Jamie Dimon Shareholder Letters, 2014-2017')
# pull out sample text from sections with very high or very low sentiment:
# the 5 lowest-sentiment sentences overall.
# bottom_n() does not exist in dplyr (the original line errored at runtime);
# top_n() with a negative n selects the bottom rows instead.
letters_words %>%
  inner_join(get_sentiments('afinn')) %>%
  group_by(year, sentence) %>%
  summarise(sentiment = sum(score)) %>%
  ungroup() %>%
  top_n(-5, sentiment)
# inspect one of the flagged sentences: 2014 letter, sentence 155
letters_sentences %>%
  filter(year == '2014', sentence_no == 155) %>%
  select(sentence) %>%
  as.character()
# the 3 sentences with the lowest summed AFINN score
# (top_n with negative n keeps the bottom of the ranking, equivalent to the
# original top_n(3, -sentiment))
# NOTE(review): if the HIGHEST-sentiment sentences were intended here, the
# sign needs flipping — confirm intent
letters_words %>%
  inner_join(get_sentiments('afinn')) %>%
  group_by(year, sentence) %>%
  summarise(sentiment = sum(score)) %>%
  ungroup() %>%
  top_n(-3, sentiment)
# inspect another flagged sentence: 2014 letter, sentence 545
letters_sentences %>%
  filter(year == '2014', sentence_no == 545) %>%
  select(sentence) %>%
  as.character()
# look up the summed AFINN score for that specific sentence (2014, #545)
letters_words %>%
  inner_join(get_sentiments('afinn')) %>%
  group_by(year, sentence) %>%
  summarise(sentiment = sum(score)) %>%
  filter(year == '2014', sentence == 545)
# top 10 positive and negative words (bing lexicon) in the 2017 letter.
# year is stored as a character column throughout this script, so compare
# against '2017' (the original compared against numeric 2017, which only
# worked via implicit coercion).
letters_words %>%
  filter(year == '2017') %>%
  inner_join(get_sentiments('bing')) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup() %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(corp_color_2, corp_color_3)) +
  facet_wrap(~sentiment, scales = "free_y", ncol = 4) +
  coord_flip() +
  theme_minimal() +
  labs(x = NULL,
       y = 'Contribution to Sentiment',
       title = 'Top 10 Words by Contribution to Sentiment',
       subtitle = 'Jamie Dimon Shareholder 2017 Letter')
# top 10 positive and negative words (bing lexicon), faceted by year x polarity
# (arrange(desc(n)) after count() is equivalent to count(..., sort = TRUE))
letters_words %>%
  inner_join(get_sentiments('bing')) %>%
  count(year, word, sentiment) %>%
  arrange(desc(n)) %>%
  group_by(year, sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(corp_color_2, corp_color_3)) +
  facet_wrap(year ~ sentiment, scales = "free_y", ncol = 4) +
  coord_flip() +
  theme_minimal() +
  labs(x = NULL,
       y = 'Contribution to Sentiment',
       title = 'Top 10 Words by Contribution to Sentiment by Year',
       subtitle = 'Jamie Dimon Shareholder Letters, 2014-2017')
# tf-idf: words most distinctive to the 2016 letter relative to the others
letters_words %>%
  group_by(year, word) %>%
  count(sort = TRUE) %>%
  bind_tf_idf(word, year, n) %>%
  arrange(desc(tf_idf)) %>%
  ungroup() %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(year) %>%
  top_n(10) %>%
  ungroup %>%
  arrange(year, desc(tf_idf)) %>%
  filter(year == '2016') %>%
  ggplot(aes(word, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(corp_color_4)) +
  facet_wrap(~year, ncol = 2, scales = "free") +
  coord_flip() +
  theme_minimal() +
  # single labs() call: the original had a second labs() that overrode the
  # first, and its y label had a typo ('td-idf')
  labs(x = NULL,
       y = 'tf-idf',
       title = 'Top 10 TF-IDF Scores',
       subtitle = 'Jamie Dimon Shareholder Letters, 2016')
# bigram tf-idf: two-word phrases most distinctive to the 2017 letter
letters_sentences %>%
  unnest_tokens(bigram, sentence, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  # drop bigrams containing stop words or numeric tokens
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  filter(!str_detect(word1, '^[0-9]'),
         !str_detect(word2, '^[0-9]')) %>%
  unite(bigram, word1, word2, sep = ' ') %>%
  count(year, bigram) %>%
  bind_tf_idf(bigram, year, n) %>%
  arrange(desc(tf_idf)) %>%
  filter(year == '2017') %>%
  ungroup() %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(year) %>%
  top_n(9) %>%
  ungroup %>%
  ggplot(aes(bigram, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(corp_color_3)) +
  facet_wrap(~year, ncol = 2, scales = "free") +
  coord_flip() +
  theme_minimal() +
  # single labs() call; fixes the 'td-idf' typo from the original duplicate labs
  labs(x = NULL,
       y = 'tf-idf',
       title = 'Top 9 Bigram TF-IDF Scores',
       subtitle = 'Jamie Dimon Shareholder Letters, 2017')
# trigram tf-idf by year: three-word phrases most distinctive to each letter
letters_sentences %>%
  unnest_tokens(trigram, sentence, token = "ngrams", n = 3) %>%
  separate(trigram, c("w1", "w2", "w3"), sep = " ") %>%
  # drop trigrams containing stop words or numeric tokens
  filter(!w1 %in% stop_words$word,
         !w2 %in% stop_words$word,
         !w3 %in% stop_words$word) %>%
  filter(!str_detect(w1, '^[0-9]'),
         !str_detect(w2, '^[0-9]'),
         !str_detect(w3, '^[0-9]')) %>%
  unite(trigram, w1, w2, w3, sep = ' ') %>%
  count(year, trigram) %>%
  bind_tf_idf(trigram, year, n) %>%
  arrange(desc(tf_idf)) %>%
  ungroup() %>%
  mutate(trigram = factor(trigram, levels = rev(unique(trigram)))) %>%
  group_by(year) %>%
  top_n(8) %>%
  ungroup() %>%
  ggplot(aes(trigram, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~year, ncol = 2, scales = "free") +
  coord_flip()
# bigram counts by year (raw frequencies, not tf-idf)
letters_sentences %>%
  unnest_tokens(bigram, sentence, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  filter(!str_detect(word1, '^[0-9]'),
         !str_detect(word2, '^[0-9]')) %>%
  # NOTE(review): this drops ANY bigram whose first word is 'jpmorgan' or whose
  # second word is 'chase', not only the exact phrase 'jpmorgan chase' — confirm intent
  filter(word1 != 'jpmorgan', word2 != 'chase') %>%
  unite(bigram, word1, word2, sep = ' ') %>%
  count(year, bigram) %>%
  arrange(desc(n)) %>%
  ungroup() %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(year) %>%
  top_n(8) %>%
  ungroup() %>%
  ggplot(aes(bigram, n, fill = year)) +
  geom_col(show.legend = FALSE) +
  # y shows raw counts; the original 'tf-idf' label was a copy-paste error
  labs(x = NULL, y = "Count") +
  facet_wrap(~year, ncol = 2, scales = "free") +
  coord_flip()
# trigram counts by year (raw frequencies, not tf-idf)
letters_sentences %>%
  unnest_tokens(trigram, sentence, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  filter(!str_detect(word1, '^[0-9]'),
         !str_detect(word2, '^[0-9]'),
         !str_detect(word3, '^[0-9]')) %>%
  unite(trigram, word1, word2, word3, sep = ' ') %>%
  count(year, trigram) %>%
  arrange(desc(n)) %>%
  ungroup() %>%
  mutate(trigram = factor(trigram, levels = rev(unique(trigram)))) %>%
  group_by(year) %>%
  top_n(8) %>%
  ungroup() %>%
  ggplot(aes(trigram, n, fill = year)) +
  geom_col(show.legend = FALSE) +
  # y shows raw counts; the original 'tf-idf' label was a copy-paste error
  labs(x = NULL, y = "Count") +
  facet_wrap(~year, ncol = 2, scales = "free") +
  coord_flip()
library(igraph)
library(ggraph)
# bigram network (final)
# Nodes are words; edges connect words that co-occur as a bigram, with edge
# alpha/width scaled by the co-occurrence count.
# NOTE(review): filter(word1 != 'jpmorgan', word2 != 'chase') removes ANY
# bigram starting with 'jpmorgan' or ending with 'chase', not just the exact
# phrase 'jpmorgan chase' — confirm that is the intent
p10 = letters_sentences %>%
unnest_tokens(bigram, sentence, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word) %>%
filter(!str_detect(word1, '^[0-9]'),
!str_detect(word2, '^[0-9]')) %>%
filter(word1 != 'jpmorgan', word2 != 'chase') %>%
count(word1, word2, sort = TRUE) %>%
filter(n > 8) %>% # keep only frequent pairs so the graph stays readable
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = corp_color_3) +
geom_node_point(size = 2) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment