Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created November 27, 2017 14:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/d9519e14e6f93c98ffaccc6c32b19b18 to your computer and use it in GitHub Desktop.
Save bayesball/d9519e14e6f93c98ffaccc6c32b19b18 to your computer and use it in GitHub Desktop.
Text mining the words of a book by Christy Mathewson
# load in some packages
library(tidytext)
library(tidyverse)
library(gutenbergr)
library(wordcloud)
library(Lahman)
# Load the Christy Mathewson book (Project Gutenberg ID 33291).
cm <- gutenberg_download(33291)
# Build a data frame with one row per line of the book.
# gutenberg_download() returns a data frame with one row per line of text, so
# rows are counted with nrow() and the text comes from its `text` column.
# (length(cm) and as.character(cm) operate on the *columns*: they yield one
# row per column and deparse each whole column into a single string, which
# leaks the repeated ID "33291" into the tokens.)
text_df <- data.frame(
  line = seq_len(nrow(cm)),
  text = cm$text,
  stringsAsFactors = FALSE
)
# Convert to a token data frame: one row per lower-cased word.
tidy_text <- text_df %>%
  unnest_tokens(word, text, to_lower = TRUE)
# Remove stop words using tidytext's `stop_words` lexicon.
data(stop_words)
# Name the join key explicitly so the match does not depend on whichever
# column names the two tables happen to share (and no join message is printed).
tidy_text <- tidy_text %>%
  anti_join(stop_words, by = "word")
# Frequency table of words, most common first (printed to the console).
count(tidy_text, word, sort = TRUE)
# Horizontal bar chart of the words that occur more than 50 times.
tidy_text %>%
  count(word, sort = TRUE) %>%
  # Keep frequent words only and exclude the stray token "33291".
  filter(n > 50, !(word %in% c("33291"))) %>%
  # Order the factor levels by count so the bars are drawn in frequency order.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()
# Word cloud of the book's words, limited to 40 words.
tidy_text %>%
  count(word) %>%
  # Exclude the stray token "33291".
  filter(!(word %in% c("33291"))) %>%
  mutate(word = reorder(word, n)) %>%
  {
    wordcloud(.$word, .$n, max.words = 40)
  }
########### long words?
# Store each token's character count in a `Length` column of tidy_text.
tidy_text <- mutate(tidy_text, Length = str_length(word))
### find popular long words
# One row per distinct word with its count and length; print the 20 longest.
tidy_text %>%
  count(word) %>%
  mutate(Length = str_length(word)) %>%
  arrange(desc(Length)) %>%
  head(20)
### what players are mentioned?
# Lahman's player table is named `People` in newer releases of the package and
# `Master` in older ones; use whichever the installed version provides.
players <- if (exists("People", where = "package:Lahman", inherits = FALSE)) {
  Lahman::People
} else {
  Lahman::Master
}
# Distinct last names found in the Lahman player table.
MS <- players %>%
  distinct(nameLast)
# Re-tokenize the book with the original letter case preserved (so tokens are
# compared to `nameLast` case-sensitively) and count each token.
S <- text_df %>%
  unnest_tokens(word, text, to_lower = FALSE) %>%
  count(word)
# Keep only the tokens that are also a player's last name, most frequent first.
S2 <- inner_join(S, MS, by = c("word" = "nameLast")) %>%
  arrange(desc(n))
# Write a word cloud of the matched names (at most 30 words) to myplot.png.
png("myplot.png")
# slice(S2, -(1:2)) drops the two most frequent matches before plotting.
# NOTE(review): presumably these two are not wanted in the cloud -- confirm
# which rows they are for this book.
with(
  slice(S2, -(1:2)),
  wordcloud(word, n, max.words = 30)
)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment