# this script validates the word prediction model against a test dataset
# Library Load ------------------------------------------------------------
listpackages = c('tm', 'tidyverse','ggplot2','scales','DT', 'tidyr', 'igraph','magrittr','gridExtra','readr','stringi','stringr','textclean','reshape2', 'tidytext','data.table','lexicon','shiny')
loaded = suppressMessages(suppressWarnings(
  sapply(listpackages, function (x) library(x, character.only = TRUE))
))
rm(list = c('listpackages','loaded'))
# Load support files -----------------------------------------------------------
# get the badwords
badwords_file = list.files(recursive = TRUE, pattern = glob2rx('*badwords.txt'))
if (length(badwords_file) == 0) {
  download.file('https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en', "badwords.txt")
  badwords_file = "badwords.txt" # point at the freshly downloaded copy
}
if (!("badwords" %in% ls())) {
  # the word list has no header row, so read it with header = FALSE
  badwords = as_tibble(read.csv(badwords_file[1], header = FALSE, stringsAsFactors = FALSE))
  colnames(badwords) <- "word"
}
# get the dictionary
dictionary_file = list.files(recursive = TRUE, pattern = glob2rx('*dictionary.txt'))
if (length(dictionary_file) == 0) {
  download.file('https://raw.githubusercontent.com/dwyl/english-words/master/words3.txt', destfile = "dictionary.txt")
  dictionary_file = "dictionary.txt" # point at the freshly downloaded copy
}
suppressMessages(suppressWarnings(
  if (!("dictionary" %in% ls())) {
    dictionary = read_csv(dictionary_file[1], col_names = "word")
    dictionary = dictionary %>% mutate(dictionary = "dictionary")
  }
))
# add "BADWORD" as a word in the dictionary
dictionary = dictionary %>% add_row(word = c("badword","BADWORD"), dictionary = c("dictionary","dictionary"))
# get a stoplist
if(!("otherstoplist" %in% ls())){
otherstoplist = as_tibble(tm::stopwords("en")) %>% rename(word=value)
otherstoplist = otherstoplist %>% mutate(n_ss= dense_rank(str_detect(otherstoplist$word,"'"))) %>% arrange(desc(n_ss))
}
# get a contraction list and extend it with apostrophe-free variants
contractions = lexicon::key_contractions
added_contractions = bind_cols(contraction = str_replace_all(contractions$contraction, "'", ""),
                               expanded = contractions$expanded)
contractions = bind_rows(contractions, added_contractions)
# drop "its": the possessive pronoun must not be expanded to "it is"
contractions = contractions %>% filter(contraction != "its")
contrac_repl = contractions$expanded
names(contrac_repl) <- paste0("\\b", contractions$contraction, "\\b") # to force the edge of words
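# quick sanity check (illustrative, assuming the lexicon table maps "can't" to "cannot"):
# str_replace_all("i can't go", contrac_repl) #> "i cannot go"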
# Function to expand contractions and convert numbers to "num" -----
expand_contraction = function (text) {
  # normalize backticks and curly quotes to a plain apostrophe
  text %<>% str_replace_all(pattern = "[`\u2018\u2019\u00B4]", replacement = "'")
  text %<>% tolower()
  # replace any token containing a digit with "num"; the regex is \b\w*\d,*\.*\w*,*\.*\b
  text %<>% str_replace_all(pattern = "\\b\\w*\\d,*\\.*\\w*,*\\.*\\b", replacement = "num")
  # collapse adjacent "num" tokens separated by punctuation into a single "num"
  text %<>% str_replace_all(pattern = "\\b(num)\\W*\\S*(num)\\b", replacement = "num")
  for (i in seq_along(contrac_repl)) {
    text %<>% str_replace_all(pattern = tolower(names(contrac_repl))[i], replacement = tolower(contrac_repl[i]))
  }
  # after the contractions, strip any remaining possessive 's
  text %<>% str_replace_all("'s", "")
  return(text)
}
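# example usage (illustrative output, assuming the lexicon contraction table is loaded):
# expand_contraction("I can't wait for the 3 of us")
# #> "i cannot wait for the num of us"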
# READ THE TEST DATASET - mark badwords ------------------------------------------------------
wordbyword_dev = fread("wordbyword_test.csv")
# replace badwords with the marker "badword" (added to the dictionary above)
wordbyword_dev = wordbyword_dev %>% mutate(word = replace(word, word %in% badwords$word, "badword"))
# manually add new words to the dictionary, based on the non-dictionary unigram counts computed below
newwords = c("badword","num","lol","blog","obama","facebook","omg","website","nfl","nba","ceo","google", "u.s","a.m","u.k", "p.m")
newtibble = tibble(word = newwords, dictionary = "newwords")
dictionary = bind_rows(dictionary, newtibble)
'%ni%' <- Negate('%in%')
# calculate the unigrams for words that are not dictionary words
notreallywords = wordbyword_dev %>% filter (word %ni% dictionary$word)
unigramsnonwords = notreallywords %>% count(word) %>% arrange(desc(n))
unigramsnonwords
# replace non-dictionary words with the marker "UNKWORD"
# https://stackoverflow.com/questions/38351820/negation-of-in-in-r
wordbyword_dev = wordbyword_dev %>% mutate(original_word = word, word = replace(word, word %ni% dictionary$word, "UNKWORD"))
# Add following-word columns to the tidy table for future n-gram calculation --------------
# next word, for bigrams; "" marks end of message (default = -1L avoids NA on the last rows)
wordbyword_dev = wordbyword_dev %>% mutate(next_word = if_else(lead(linenumber, default = -1L) == linenumber, lead(word), ""))
# second next word, for trigrams
wordbyword_dev = wordbyword_dev %>% mutate(sec_next_word = if_else(lead(linenumber, n = 2, default = -1L) == linenumber, lead(word, n = 2), ""))
# third next word, for fourgrams
wordbyword_dev = wordbyword_dev %>% mutate(third_next_word = if_else(lead(linenumber, n = 3, default = -1L) == linenumber, lead(word, n = 3), ""))
# fourth next word, for fivegrams
wordbyword_dev = wordbyword_dev %>% mutate(fourth_next_word = if_else(lead(linenumber, n = 4, default = -1L) == linenumber, lead(word, n = 4), ""))
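# minimal sketch of what the lead() columns produce (hypothetical tokens):
# for linenumber = c(1,1,1,2) and word = c("to","be","or","not"),
# next_word = c("be","or","","") since line 2 starts a new message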
write_csv(wordbyword_dev,"wordbyword_test_ready_only_dic.csv")
# Create unigrams, bigrams...-----
# file.exists() has no recursive argument, so search with list.files() instead
if (length(list.files(pattern = "wordbyword_test_ready_only_dic.csv", recursive = TRUE)) > 0) {
  wordbyword_dev = fread(list.files(pattern = "wordbyword_test_ready_only_dic.csv", recursive = TRUE)[1])
}
# this function excludes rows where any column equals test_val (e.g. 'end of message')
# reference here - https://stackoverflow.com/questions/37363583/dplyr-filter-if-any-variable-is-equal-to-a-value
exclude <- function(a, test_val, na.rm = TRUE) {
  out <- a %>% filter(!rowSums(a == test_val, na.rm = na.rm))
  return(out)
}
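# example usage (illustrative; the pipeline below filters the "" marker explicitly instead):
# bigrams_no_eom <- exclude(bigrams, "")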
unigrams = wordbyword_dev %>% count(word) %>% arrange(desc(n)) %>% mutate(uniprop = n/sum(n))
write_csv(unigrams,"unigrams_test.csv")
rm(unigrams)
bigrams = wordbyword_dev %>% count(word,next_word) %>% arrange(desc(n)) %>% group_by(word) %>% mutate(biprop = n/sum(n))
write_csv(bigrams,"bigrams_test.csv")
rm(bigrams)
trigrams = wordbyword_dev %>% count(word,next_word,sec_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word) %>% mutate(triprop = n/sum(n))
write_csv(trigrams,"trigrams_test.csv")
rm(trigrams)
fourgrams = wordbyword_dev %>% count(word,next_word,sec_next_word,third_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word,sec_next_word) %>% mutate(fourprop = n/sum(n))
write_csv(fourgrams,"fourgrams_test.csv")
rm(fourgrams)
fivegrams = wordbyword_dev %>% count(word,next_word,sec_next_word,third_next_word,fourth_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word,sec_next_word,third_next_word) %>% mutate(fiveprop = n/sum(n))
write_csv(fivegrams,"fivegrams_test.csv")
rm(fivegrams)
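# each *prop column above is a conditional relative frequency within its context group,
# e.g. biprop = n(word, next_word) / n(word), the maximum-likelihood estimate of P(next_word | word)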
# Retrieve the ngrams
fivegrams_test <- fread("fivegrams_test.csv")
fourgrams_test <- fread("fourgrams_test.csv")
trigrams_test <- fread("trigrams_test.csv")
bigrams_test <- fread("bigrams_test.csv")
unigrams_test <- fread("unigrams_test.csv")
# remove the "" (end of phrase)
bigrams_test <- bigrams_test %>% filter(next_word!="")
trigrams_test <- trigrams_test %>% filter(next_word != "", sec_next_word != "")
fourgrams_test <- fourgrams_test %>% filter(next_word != "", sec_next_word != "", third_next_word != "")
fivegrams_test <- fivegrams_test %>% filter(next_word != "", sec_next_word != "", third_next_word != "", fourth_next_word != "")
# totals before pruning, to measure how much n-gram mass is retained
total_five <- sum(fivegrams_test$n)
total_four <- sum(fourgrams_test$n)
total_trig <- sum(trigrams_test$n)
total_twog <- sum(bigrams_test$n)
total_unig <- sum(unigrams_test$n)
# prune
fivegrams_test = fivegrams_test %>% filter(n>3)
fourgrams_test = fourgrams_test %>% filter(n>2)
trigrams_test = trigrams_test %>% filter(n>2)
bigrams_test = bigrams_test %>% filter(n>1)
# fraction of n-gram mass retained after pruning (1 - loss)
lossfive_test = sum(fivegrams_test$n)/total_five
lossfour_test = sum(fourgrams_test$n)/total_four
losstrig_test = sum(trigrams_test$n)/total_trig
losstwog_test = sum(bigrams_test$n)/total_twog
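# inspect the retained fractions (values depend on the data):
# c(bi = losstwog_test, tri = losstrig_test, four = lossfour_test, five = lossfive_test)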
# stupid backoff function (Brants et al. 2007): back off to shorter contexts, discounting each step by 0.4 ------
phrase = "every inch of you is perfect from the bottom to the" # sample phrase for interactive testing
wordpredict <- function (phrase) {
  # normalize the phrase the same way the training data was processed
  phrase <- expand_contraction(phrase)
  splittext <- stri_split_boundaries(phrase, type = "word", tokens_only = TRUE, skip_word_none = TRUE, simplify = TRUE)
  # mark badwords first, then unknown words, mirroring the test-set preparation above
  splittext <- ifelse(splittext %in% badwords$word, "badword", splittext)
  splittext <- ifelse(splittext %in% dictionary$word, splittext, "UNKWORD")
  lengthtext = length(splittext)
  backoff_index = min(lengthtext, 4)
  # use at most the last four words as context
if (lengthtext >1) {word_search_last = splittext[lengthtext]}
if (lengthtext >2) {word_search_last_minus_one = splittext[lengthtext-1]}
if (lengthtext >3) {word_search_last_minus_two = splittext[lengthtext-2]}
if (lengthtext >4) {word_search_last_minus_three = splittext[lengthtext-3]}
  if (lengthtext > 4) {fivechance = fivegrams %>%
    filter(third_next_word == word_search_last, sec_next_word == word_search_last_minus_one, next_word == word_search_last_minus_two, word == word_search_last_minus_three) %>%
    top_n(5, wt = fiveprop) %>% mutate(chance = fiveprop*0.4^(backoff_index-4), origin = "fivegrams") %>%
    ungroup() %>% select(selection = fourth_next_word, chance, origin)}
  if (lengthtext > 3) {fourchance = fourgrams %>%
    filter(sec_next_word == word_search_last, next_word == word_search_last_minus_one, word == word_search_last_minus_two) %>%
    top_n(5, wt = fourprop) %>% mutate(chance = fourprop*0.4^(backoff_index-3), origin = "fourgrams") %>%
    ungroup() %>% select(selection = third_next_word, chance, origin)}
  if (lengthtext > 2) {threechance = trigrams %>%
    filter(next_word == word_search_last, word == word_search_last_minus_one) %>%
    top_n(5, wt = triprop) %>% mutate(chance = triprop*0.4^(backoff_index-2), origin = "trigrams") %>%
    ungroup() %>% select(selection = sec_next_word, chance, origin)}
  if (lengthtext > 1) {bichance = bigrams %>%
    filter(word == word_search_last) %>%
    top_n(5, wt = biprop) %>% mutate(chance = biprop*0.4^(backoff_index-1), origin = "bigrams") %>%
    ungroup() %>% select(selection = next_word, chance, origin)}
  if (lengthtext == 1) {unichance = unigrams %>%
    top_n(5, uniprop) %>% mutate(chance = uniprop*0.4^(backoff_index-1), origin = "unigram") %>%
    ungroup() %>% select(selection = word, chance, origin)}
  chance_final <- rbind(if (exists("fivechance")) {fivechance},
                        if (exists("fourchance")) {fourchance},
                        if (exists("threechance")) {threechance},
                        if (exists("bichance")) {bichance},
                        if (exists("unichance")) {unichance})
  # fall back to the top unigrams if no n-gram table produced a candidate
  # (the previous exists("chance_final") check was always TRUE, so the fallback never ran)
  if (!is.null(chance_final) && nrow(chance_final) > 0) {
    chance_final2 <- chance_final %>% arrange(selection, desc(chance)) %>% distinct(selection, .keep_all = TRUE)
  } else {
    chance_final2 <- unigrams %>% top_n(5, wt = uniprop) %>%
      mutate(chance = uniprop, origin = "unigram") %>% ungroup() %>% select(selection = word, chance, origin)
  }
  return(list(splittext, chance_final2, lengthtext, backoff_index))
}
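# wordpredict returns a list: [[1]] normalized tokens, [[2]] candidate table
# (selection, chance, origin), [[3]] token count, [[4]] backoff index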
# retrieve the pruned dev grams used by wordpredict ----
fivegrams <- fread("lean_five.csv")
fourgrams <- fread("lean_four.csv")
trigrams <- fread("lean_tri.csv")
bigrams <- fread("lean_bi.csv")
unigrams <- fread("lean_uni.csv")
# smoke-test the function on a sample phrase ----
wordpredict("to live and let")[[2]]$selection
# write a function to return T or F and store in the fivegrams
checkmatch <- function(aword, bword, cword, dword, eword){
  # build the four-word context (paste, not paste0, so the words are space-separated)
  results = paste(aword, bword, cword, dword) %>% wordpredict() %>% extract2(2) %>% select(selection)
  checkmatch <- is.element(eword[1], results$selection)
  return(checkmatch)
}
# wordpredict handles one phrase at a time, so apply checkmatch row by row
fivegrams_test = fivegrams_test %>%
  rowwise() %>%
  mutate(result = checkmatch(word, next_word, sec_next_word, third_next_word, fourth_next_word)) %>%
  ungroup()
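# overall hit rate on the pruned test fivegrams, weighted by frequency
# (a minimal sketch of the validation metric, using the columns built above):
# sum(fivegrams_test$result * fivegrams_test$n) / sum(fivegrams_test$n)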