imjakedaniels/Tokenization.rmd

## Tokenization.rmd
---
title: "StarWars"
output: html_document
---

```{r }
#https://kkulma.github.io/2017-12-16-star-wars-vs-star-trek-word-battle/
```


```{r}
#packages

# Install only the packages that are not available yet, so that knitting the
# document does not re-download every dependency on each run.
required_packages <- c(
  "rvest", "dplyr", "tm", "tidytext", "ggthemes", "ggplot2", "DT"
)
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  # install.packages() cannot prompt for a mirror while knitting, so supply
  # one when the session has no CRAN mirror configured.
  repos <- getOption("repos")
  if (is.null(repos) || identical(unname(repos["CRAN"]), "@CRAN@")) {
    repos["CRAN"] <- "https://cloud.r-project.org"
  }
  install.packages(required_packages[!is_installed], repos = repos)
}

library(rvest)
library(dplyr)
library(tm)
library(tidytext)
library(ggthemes)
library(ggplot2)
library(DT)
```


```{r}
# Locate both screenplays online and extract their text from the HTML.

swIV_url <- "http://www.imsdb.com/scripts/Star-Wars-A-New-Hope.html"
startrek_url <- "http://www.dailyscript.com/scripts/startrek01.html"

# Star Wars: take the text of every <td> cell and keep element 88.
star_wars <- html_text(html_nodes(read_html(swIV_url), "td"))[[88]]

# Star Trek: take the text of every <pre> node.
star_trek <- html_text(html_nodes(read_html(startrek_url), "pre"))
```


```{r}
#clean

# Normalise raw script text before tokenization.
#
# x: character vector of raw text.
# Returns a character vector of the same length: lower-cased, with line
# breaks, tabs and punctuation turned into spaces, digits removed and every
# run of whitespace collapsed to a single space.
clean_text <- function(x) {
  # Line feeds, carriage returns and tabs left over from the HTML conversion.
  # A single character class handles all three control characters.
  x <- gsub(pattern = "[\n\r\t]", replacement = " ", x)

  # Punctuation becomes a space so that joined words (e.g. "a-b") stay apart.
  x <- gsub(pattern = "[[:punct:]]", replacement = " ", x)

  # Lower-case, drop digits, then collapse whitespace runs. Base regexes are
  # used so the function does not depend on any attached package.
  x <- tolower(x)
  x <- gsub(pattern = "[[:digit:]]+", replacement = "", x)
  gsub(pattern = "[[:space:]]+", replacement = " ", x)
}
# Run both raw scripts through the same cleaning function.
clean_star_wars <- clean_text(star_wars)
clean_star_trek <- clean_text(star_trek)
```

```{r}
# Tokenization SW: one row per distinct word with its Star Wars count (sw_n).

# Build the one-column tibble directly with the column already named.
sw_tokens <- tibble(sw_text = clean_star_wars) %>%
  # Drop exact duplicate text blocks before counting.
  unique() %>%
  # Split the text column into tokens: one word per row.
  unnest_tokens(word, sw_text) %>%
  # anti_join() is a filtering join: it keeps only the rows whose word has no
  # match in stop_words, instead of adding columns like inner/full joins do
  # (see ?anti_join).
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  rename(sw_n = n)

sw_tokens
```

```{r}
# Tokenization ST: one row per distinct word with its Star Trek count (st_n).

# Build the one-column tibble directly with the column already named.
st_tokens <- tibble(st_text = clean_star_trek) %>%
  # Drop exact duplicate text blocks before counting.
  unique() %>%
  # Split the text column into tokens: one word per row.
  unnest_tokens(word, st_text) %>%
  # Keep only the words that have no match in stop_words.
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  rename(st_n = n)

st_tokens
```

```{r}
# Combine the two word counts. The inner join keeps only the words present in
# both tables, so words missing from either script are removed.

final_tokens <- sw_tokens %>%
  inner_join(st_tokens, by = "word") %>%
  mutate(
    log_word_prop = round(log(sw_n / st_n), 3),
    # Ties (log ratio of exactly 0) favour neither script, so they get their
    # own label instead of being attributed to one side.
    dominates_in = factor(
      ifelse(
        log_word_prop > 0, "star_wars",
        ifelse(log_word_prop < 0, "star_trek", "neither")
      ),
      levels = c("star_trek", "star_wars", "neither")
    )
  )

# Re: log_word_prop: "To compare the frequencies I’ll use a variable called
# word_prop: logarithm of the proportion between Star Wars and Star Trek word
# frequency. This means that the more positive the value, the more drastic the
# difference in frequency it is in favour of Star Wars. On the other hand, the
# more negative the value, the more commonly it was used in Star Trek (in
# comparison to Star Wars)."
final_tokens
```


```{r}
# Plotting: a random sample of up to 30 words whose log ratio is 0.
# The seed makes the sample reproducible.
set.seed(13)

# NOTE(review): y is log_word_prop, which is 0 for every row kept by the
# filter, so all bars in this chart have zero height.
final_tokens %>%
  # log_word_prop is rounded to 3 decimals, so equal counts give exactly 0.
  filter(log_word_prop == 0) %>%
  arrange(desc(sw_n)) %>%
  # sample_n() errors when asked for more rows than exist, so cap the size at
  # the number of rows available.
  sample_n(min(30, nrow(.))) %>%
  ggplot(aes(x = reorder(word, log_word_prop), y = log_word_prop, fill = dominates_in)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  theme_minimal() +
  coord_flip() +
  xlab("") +
  ylab("log(word_prop)") +
  scale_fill_brewer(palette = "Set1") +
  ggtitle("Sample of words that occur with the same frequency in SW and ST")

# remove the irrelevant words, arrange by strongest correlations, might take some time
```


```{r}
# Horizontal bar chart of the words whose absolute log ratio exceeds 2.4,
# ordered by that ratio and coloured by the script in which they dominate.
ggplot(
  filter(final_tokens, abs(log_word_prop) > 2.4),
  aes(x = reorder(word, log_word_prop), y = log_word_prop, fill = dominates_in)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  ggtitle("Words that show strikingly different frequencies in Star Wars and Star Trek") +
  xlab("") +
  ylab("log(word_prop)")
```
	---
	title: "StarWars"
	output: html_document
	---

	```{r }
	#https://kkulma.github.io/2017-12-16-star-wars-vs-star-trek-word-battle/
	```



	```{r}
	#packages

	# Install only the packages that are not available yet, so that knitting the
	# document does not re-download every dependency on each run.
	required_packages <- c(
	  "rvest", "dplyr", "tm", "tidytext", "ggthemes", "ggplot2", "DT"
	)
	is_installed <- vapply(
	  required_packages, requireNamespace, logical(1),
	  quietly = TRUE
	)
	if (!all(is_installed)) {
	  # install.packages() cannot prompt for a mirror while knitting, so supply
	  # one when the session has no CRAN mirror configured.
	  repos <- getOption("repos")
	  if (is.null(repos) || identical(unname(repos["CRAN"]), "@CRAN@")) {
	    repos["CRAN"] <- "https://cloud.r-project.org"
	  }
	  install.packages(required_packages[!is_installed], repos = repos)
	}

	library(rvest)
	library(dplyr)
	library(tm)
	library(tidytext)
	library(ggthemes)
	library(ggplot2)
	library(DT)
	```


	```{r}
	# Locate both screenplays online and extract their text from the HTML.

	swIV_url <- "http://www.imsdb.com/scripts/Star-Wars-A-New-Hope.html"
	startrek_url <- "http://www.dailyscript.com/scripts/startrek01.html"

	# Star Wars: take the text of every <td> cell and keep element 88.
	star_wars <- html_text(html_nodes(read_html(swIV_url), "td"))[[88]]

	# Star Trek: take the text of every <pre> node.
	star_trek <- html_text(html_nodes(read_html(startrek_url), "pre"))
	```



	```{r}
	#clean

	# Normalise raw script text before tokenization.
	#
	# x: character vector of raw text.
	# Returns a character vector of the same length: lower-cased, with line
	# breaks, tabs and punctuation turned into spaces, digits removed and every
	# run of whitespace collapsed to a single space.
	clean_text <- function(x) {
	  # Line feeds, carriage returns and tabs left over from the HTML conversion.
	  # A single character class handles all three control characters.
	  x <- gsub(pattern = "[\n\r\t]", replacement = " ", x)

	  # Punctuation becomes a space so that joined words (e.g. "a-b") stay apart.
	  x <- gsub(pattern = "[[:punct:]]", replacement = " ", x)

	  # Lower-case, drop digits, then collapse whitespace runs. Base regexes are
	  # used so the function does not depend on any attached package.
	  x <- tolower(x)
	  x <- gsub(pattern = "[[:digit:]]+", replacement = "", x)
	  gsub(pattern = "[[:space:]]+", replacement = " ", x)
	}
	# Run both raw scripts through the same cleaning function.
	clean_star_wars <- clean_text(star_wars)
	clean_star_trek <- clean_text(star_trek)
	```

	```{r}
	# Tokenization SW: one row per distinct word with its Star Wars count (sw_n).

	# Build the one-column tibble directly with the column already named.
	sw_tokens <- tibble(sw_text = clean_star_wars) %>%
	  # Drop exact duplicate text blocks before counting.
	  unique() %>%
	  # Split the text column into tokens: one word per row.
	  unnest_tokens(word, sw_text) %>%
	  # anti_join() is a filtering join: it keeps only the rows whose word has no
	  # match in stop_words, instead of adding columns like inner/full joins do
	  # (see ?anti_join).
	  anti_join(stop_words, by = "word") %>%
	  count(word, sort = TRUE) %>%
	  rename(sw_n = n)

	sw_tokens
	```

	```{r}
	# Tokenization ST: one row per distinct word with its Star Trek count (st_n).

	# Build the one-column tibble directly with the column already named.
	st_tokens <- tibble(st_text = clean_star_trek) %>%
	  # Drop exact duplicate text blocks before counting.
	  unique() %>%
	  # Split the text column into tokens: one word per row.
	  unnest_tokens(word, st_text) %>%
	  # Keep only the words that have no match in stop_words.
	  anti_join(stop_words, by = "word") %>%
	  count(word, sort = TRUE) %>%
	  rename(st_n = n)

	st_tokens
	```

	```{r}
	# Combine the two word counts. The inner join keeps only the words present in
	# both tables, so words missing from either script are removed.

	final_tokens <- sw_tokens %>%
	  inner_join(st_tokens, by = "word") %>%
	  mutate(
	    log_word_prop = round(log(sw_n / st_n), 3),
	    # Ties (log ratio of exactly 0) favour neither script, so they get their
	    # own label instead of being attributed to one side.
	    dominates_in = factor(
	      ifelse(
	        log_word_prop > 0, "star_wars",
	        ifelse(log_word_prop < 0, "star_trek", "neither")
	      ),
	      levels = c("star_trek", "star_wars", "neither")
	    )
	  )

	# Re: log_word_prop: "To compare the frequencies I’ll use a variable called
	# word_prop: logarithm of the proportion between Star Wars and Star Trek word
	# frequency. This means that the more positive the value, the more drastic the
	# difference in frequency it is in favour of Star Wars. On the other hand, the
	# more negative the value, the more commonly it was used in Star Trek (in
	# comparison to Star Wars)."
	final_tokens
	```


	```{r}
	# Plotting: a random sample of up to 30 words whose log ratio is 0.
	# The seed makes the sample reproducible.
	set.seed(13)

	# NOTE(review): y is log_word_prop, which is 0 for every row kept by the
	# filter, so all bars in this chart have zero height.
	final_tokens %>%
	  # log_word_prop is rounded to 3 decimals, so equal counts give exactly 0.
	  filter(log_word_prop == 0) %>%
	  arrange(desc(sw_n)) %>%
	  # sample_n() errors when asked for more rows than exist, so cap the size at
	  # the number of rows available.
	  sample_n(min(30, nrow(.))) %>%
	  ggplot(aes(x = reorder(word, log_word_prop), y = log_word_prop, fill = dominates_in)) +
	  geom_bar(stat = "identity", show.legend = FALSE) +
	  theme_minimal() +
	  coord_flip() +
	  xlab("") +
	  ylab("log(word_prop)") +
	  scale_fill_brewer(palette = "Set1") +
	  ggtitle("Sample of words that occur with the same frequency in SW and ST")

	# remove the irrelevant words, arrange by strongest correlations, might take some time
	```


	```{r}
	# Horizontal bar chart of the words whose absolute log ratio exceeds 2.4,
	# ordered by that ratio and coloured by the script in which they dominate.
	ggplot(
	  filter(final_tokens, abs(log_word_prop) > 2.4),
	  aes(x = reorder(word, log_word_prop), y = log_word_prop, fill = dominates_in)
	) +
	  geom_bar(stat = "identity") +
	  coord_flip() +
	  scale_fill_brewer(palette = "Set1") +
	  theme_minimal() +
	  ggtitle("Words that show strikingly different frequencies in Star Wars and Star Trek") +
	  xlab("") +
	  ylab("log(word_prop)")
	```