# bayesball/christy.R

## christy.R
# load in some packages

library(tidytext)
library(tidyverse)
library(gutenbergr)
library(wordcloud)
library(Lahman)

# Download the Christy book (Project Gutenberg ebook 33291).
# gutenberg_download() returns a data frame with one row per line of the
# book: the ebook id in `gutenberg_id` and the line itself in `text`.

cm <- gutenberg_download(33291)

# Build a data frame with one row per line of the book.
# `cm` is a data frame, so length(cm) is its number of columns and
# as.character(cm) deparses each whole column into a single string; that
# squeezed the entire book into one row and turned the ebook id "33291"
# into a token.  Index the `text` column instead.

text_df <- data.frame(line = seq_along(cm$text),
                      text = cm$text,
                      stringsAsFactors = FALSE)

# Tokenise the text: one lower-cased token per row in column `word`.

tidy_text <- text_df %>%
  unnest_tokens(word, text, to_lower = TRUE)

# Drop every token that appears in tidytext's `stop_words` data set.
# The join key is spelled out so the result does not depend on which
# columns the two tables happen to share, and so dplyr does not emit its
# "Joining with `by = ...`" message.

data(stop_words)
tidy_text <- tidy_text %>%
  anti_join(stop_words, by = "word")

# Word counts, most frequent first (printed when run at top level).

count(tidy_text, word, sort = TRUE)

# Horizontal bar chart of the words that occur more than 50 times,
# excluding the token "33291"; bars are ordered by count.

tidy_text %>%
  count(word, sort = TRUE) %>%
  filter(n > 50, !(word %in% "33291")) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()

# Word cloud of at most 40 words, sized by count; the token "33291" is
# excluded.

tidy_text %>%
  count(word) %>%
  filter(!(word %in% "33291")) %>%
  mutate(word = reorder(word, n)) %>%
  with(wordcloud(words = word, freq = n, max.words = 40))

########### long words?

# Store each token's character count in a `Length` column.

tidy_text <- mutate(tidy_text, Length = str_length(word))

### the 20 longest distinct words, with their counts

tidy_text %>%
  count(word) %>%
  mutate(Length = str_length(word)) %>%
  arrange(desc(Length)) %>%
  slice(seq_len(20))


### what players are mentioned?

# One row per distinct last name in the Lahman `Master` table.
# NOTE(review): newer Lahman releases appear to name this table `People`;
# confirm against the installed package version.
MS <- Master %>%
  select(nameLast) %>%
  count(nameLast) %>%
  select(nameLast)

# Token counts with the original capitalisation kept, so tokens can be
# matched against the last names exactly as they are stored.
S <- text_df %>%
  unnest_tokens(word, text, to_lower = FALSE) %>%
  count(word)

# Tokens that equal some last name, most frequent first.
S2 <- S %>%
  inner_join(MS, by = c("word" = "nameLast")) %>%
  arrange(desc(n))

# Write a word cloud of the matches (at most 30 words) to myplot.png.
# The first two rows of S2 are left out; presumably they are matches the
# author did not want in the picture (verify).
png("myplot.png")
S2 %>%
  slice(-(1:2)) %>%
  with(wordcloud(word, n, max.words = 30))
dev.off()
	# A verbatim, tab-indented repeat of the whole analysis above used to
	# follow here.  Running it downloaded the book a second time, rebuilt
	# every object and overwrote myplot.png, so the duplicate was removed.