# this script validates the word prediction model against a test dataset
# Library Load ------------------------------------------------------------
listpackages = c('tm', 'tidyverse','ggplot2','scales','DT', 'tidyr', 'igraph','magrittr','gridExtra','readr','stringi','stringr','textclean','reshape2', 'tidytext','data.table','lexicon','shiny')
loaded = suppressMessages(suppressWarnings(
  sapply(listpackages, function (x) library(x, character.only = TRUE))
))
rm(list = c('listpackages','loaded'))
# Load support files -----------------------------------------------------------
# get the badwords
badwords_file = list.files(recursive = TRUE, pattern = glob2rx('*badwords.txt'))
if (length(badwords_file) == 0) {
  download.file('https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en', "badwords.txt")
  badwords_file = "badwords.txt" # point at the freshly downloaded copy
}
if (!("badwords" %in% ls())) {
  # the word list has no header row, so read it with header = FALSE
  badwords = as_tibble(read.csv(badwords_file[1], header = FALSE, stringsAsFactors = FALSE))
  colnames(badwords) <- "word"
}
# get the dictionary
dictionary_file = list.files(recursive = TRUE, pattern = glob2rx('*dictionary.txt'))
if (length(dictionary_file) == 0) {
  download.file('https://raw.githubusercontent.com/dwyl/english-words/master/words3.txt', destfile = "dictionary.txt")
  dictionary_file = "dictionary.txt" # point at the freshly downloaded copy
}
suppressMessages(suppressWarnings(
  if (!("dictionary" %in% ls())) {
    dictionary = read_csv(dictionary_file[1], col_names = "word")
    dictionary = dictionary %>% mutate(dictionary = "dictionary")
  }
))
# add "BADWORD" as a word in the dictionary
dictionary = dictionary %>% add_row(word = c("badword","BADWORD"), dictionary = c("dictionary","dictionary"))
# get a stoplist
if(!("otherstoplist" %in% ls())){
otherstoplist = as_tibble(tm::stopwords("en")) %>% rename(word=value)
otherstoplist = otherstoplist %>% mutate(n_ss= dense_rank(str_detect(otherstoplist$word,"'"))) %>% arrange(desc(n_ss))
}
# get a contraction list and extend it with apostrophe-free variants
contractions = lexicon::key_contractions
added_contractions = bind_cols(contraction = str_replace_all(contractions$contraction, "'", ""),
                               expanded = contractions$expanded)
contractions = bind_rows(contractions, added_contractions)
# drop "its": the possessive pronoun must not be expanded to "it is"
contractions = contractions %>% filter(contraction != "its")
contrac_repl = contractions$expanded
names(contrac_repl) <- paste0("\\b", contractions$contraction, "\\b") # to force the edge of words
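# quick sanity check (illustrative, assuming the lexicon table maps "can't" to "cannot"):
# str_replace_all("i can't go", contrac_repl) #> "i cannot go"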
# Function to expand contractions and convert numbers to "num" -----
expand_contraction = function (text) {
  # normalize backticks and curly quotes to a plain apostrophe
  text %<>% str_replace_all(pattern = "[`\u2018\u2019\u00B4]", replacement = "'")
  text %<>% tolower()
  # replace any token containing a digit with "num"; the regex is \b\w*\d,*\.*\w*,*\.*\b
  text %<>% str_replace_all(pattern = "\\b\\w*\\d,*\\.*\\w*,*\\.*\\b", replacement = "num")
  # collapse adjacent "num" tokens separated by punctuation into a single "num"
  text %<>% str_replace_all(pattern = "\\b(num)\\W*\\S*(num)\\b", replacement = "num")
  for (i in seq_along(contrac_repl)) {
    text %<>% str_replace_all(pattern = tolower(names(contrac_repl))[i], replacement = tolower(contrac_repl[i]))
  }
  # after the contractions, strip any remaining possessive 's
  text %<>% str_replace_all("'s", "")
  return(text)
}
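# example usage (illustrative output, assuming the lexicon contraction table is loaded):
# expand_contraction("I can't wait for the 3 of us")
# #> "i cannot wait for the num of us"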
# READ THE TEST DATASET - mark badwords ------------------------------------------------------
wordbyword_dev = fread("wordbyword_test.csv")
# replace badwords with the marker "badword" (added to the dictionary above)
wordbyword_dev = wordbyword_dev %>% mutate(word = replace(word, word %in% badwords$word, "badword"))
# manually add new words to the dictionary, based on the non-dictionary unigram counts computed below
newwords = c("badword","num","lol","blog","obama","facebook","omg","website","nfl","nba","ceo","google", "u.s","a.m","u.k", "p.m")
newtibble = tibble(word = newwords, dictionary = "newwords")
dictionary = bind_rows(dictionary, newtibble)
'%ni%' <- Negate('%in%')
# calculate the unigrams for words that are not dictionary words
notreallywords = wordbyword_dev %>% filter (word %ni% dictionary$word)
unigramsnonwords = notreallywords %>% count(word) %>% arrange(desc(n))
unigramsnonwords
# replace non-dictionary words with the marker "UNKWORD"
# https://stackoverflow.com/questions/38351820/negation-of-in-in-r
wordbyword_dev = wordbyword_dev %>% mutate(original_word = word, word = replace(word, word %ni% dictionary$word, "UNKWORD"))
# Add following-word columns to the tidy table for future n-gram calculation --------------
# next word, for bigrams; "" marks end of message (default = -1L avoids NA on the last rows)
wordbyword_dev = wordbyword_dev %>% mutate(next_word = if_else(lead(linenumber, default = -1L) == linenumber, lead(word), ""))
# second next word, for trigrams
wordbyword_dev = wordbyword_dev %>% mutate(sec_next_word = if_else(lead(linenumber, n = 2, default = -1L) == linenumber, lead(word, n = 2), ""))
# third next word, for fourgrams
wordbyword_dev = wordbyword_dev %>% mutate(third_next_word = if_else(lead(linenumber, n = 3, default = -1L) == linenumber, lead(word, n = 3), ""))
# fourth next word, for fivegrams
wordbyword_dev = wordbyword_dev %>% mutate(fourth_next_word = if_else(lead(linenumber, n = 4, default = -1L) == linenumber, lead(word, n = 4), ""))
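# minimal sketch of what the lead() columns produce (hypothetical tokens):
# for linenumber = c(1,1,1,2) and word = c("to","be","or","not"),
# next_word = c("be","or","","") since line 2 starts a new message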
write_csv(wordbyword_dev,"wordbyword_test_ready_only_dic.csv")
# Create unigrams, bigrams...-----
# file.exists() has no recursive argument, so search with list.files() instead
if (length(list.files(pattern = "wordbyword_test_ready_only_dic.csv", recursive = TRUE)) > 0) {
  wordbyword_dev = fread(list.files(pattern = "wordbyword_test_ready_only_dic.csv", recursive = TRUE)[1])
}
# this function excludes rows where any column equals test_val (e.g. 'end of message')
# reference here - https://stackoverflow.com/questions/37363583/dplyr-filter-if-any-variable-is-equal-to-a-value
exclude <- function(a, test_val, na.rm = TRUE) {
  out <- a %>% filter(!rowSums(a == test_val, na.rm = na.rm))
  return(out)
}
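# example usage (illustrative; the pipeline below filters the "" marker explicitly instead):
# bigrams_no_eom <- exclude(bigrams, "")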
unigrams = wordbyword_dev %>% count(word) %>% arrange(desc(n)) %>% mutate(uniprop = n/sum(n))
write_csv(unigrams,"unigrams_test.csv")
rm(unigrams)
bigrams = wordbyword_dev %>% count(word,next_word) %>% arrange(desc(n)) %>% group_by(word) %>% mutate(biprop = n/sum(n))
write_csv(bigrams,"bigrams_test.csv")
rm(bigrams)
trigrams = wordbyword_dev %>% count(word,next_word,sec_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word) %>% mutate(triprop = n/sum(n))
write_csv(trigrams,"trigrams_test.csv")
rm(trigrams)
fourgrams = wordbyword_dev %>% count(word,next_word,sec_next_word,third_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word,sec_next_word) %>% mutate(fourprop = n/sum(n))
write_csv(fourgrams,"fourgrams_test.csv")
rm(fourgrams)
fivegrams = wordbyword_dev %>% count(word,next_word,sec_next_word,third_next_word,fourth_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word,sec_next_word,third_next_word) %>% mutate(fiveprop = n/sum(n))
write_csv(fivegrams,"fivegrams_test.csv")
rm(fivegrams)
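# each *prop column above is a conditional relative frequency within its context group,
# e.g. biprop = n(word, next_word) / n(word), the maximum-likelihood estimate of P(next_word | word)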
# Retrieve the ngrams
fivegrams_test <- fread("fivegrams_test.csv")
fourgrams_test <- fread("fourgrams_test.csv")
trigrams_test <- fread("trigrams_test.csv")
bigrams_test <- fread("bigrams_test.csv")
unigrams_test <- fread("unigrams_test.csv")
# remove the "" (end of phrase)
bigrams_test <- bigrams_test %>% filter(next_word!="")
trigrams_test <- trigrams_test %>% filter(next_word != "", sec_next_word != "")
fourgrams_test <- fourgrams_test %>% filter(next_word != "", sec_next_word != "", third_next_word != "")
fivegrams_test <- fivegrams_test %>% filter(next_word != "", sec_next_word != "", third_next_word != "", fourth_next_word != "")
# totals before pruning, to measure how much n-gram mass is retained
total_five <- sum(fivegrams_test$n)
total_four <- sum(fourgrams_test$n)
total_trig <- sum(trigrams_test$n)
total_twog <- sum(bigrams_test$n)
total_unig <- sum(unigrams_test$n)
# prune
fivegrams_test = fivegrams_test %>% filter(n>3)
fourgrams_test = fourgrams_test %>% filter(n>2)
trigrams_test = trigrams_test %>% filter(n>2)
bigrams_test = bigrams_test %>% filter(n>1)
# fraction of n-gram mass retained after pruning (1 - loss)
lossfive_test = sum(fivegrams_test$n)/total_five
lossfour_test = sum(fourgrams_test$n)/total_four
losstrig_test = sum(trigrams_test$n)/total_trig
losstwog_test = sum(bigrams_test$n)/total_twog
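# inspect the retained fractions (values depend on the data):
# c(bi = losstwog_test, tri = losstrig_test, four = lossfour_test, five = lossfive_test)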
# stupid backoff function (Brants et al. 2007): back off to shorter contexts, discounting each step by 0.4 ------
phrase = "every inch of you is perfect from the bottom to the" # sample phrase for interactive testing
wordpredict <- function (phrase) {
  # normalize the phrase the same way the training data was processed
  phrase <- expand_contraction(phrase)
  splittext <- stri_split_boundaries(phrase, type = "word", tokens_only = TRUE, skip_word_none = TRUE, simplify = TRUE)
  # mark badwords first, then unknown words, mirroring the test-set preparation above
  splittext <- ifelse(splittext %in% badwords$word, "badword", splittext)
  splittext <- ifelse(splittext %in% dictionary$word, splittext, "UNKWORD")
  lengthtext = length(splittext)
  backoff_index = min(lengthtext, 4)
  # use at most the last four words as context
if (lengthtext >1) {word_search_last = splittext[lengthtext]}
if (lengthtext >2) {word_search_last_minus_one = splittext[lengthtext-1]}
if (lengthtext >3) {word_search_last_minus_two = splittext[lengthtext-2]}
if (lengthtext >4) {word_search_last_minus_three = splittext[lengthtext-3]}
  if (lengthtext > 4) {fivechance = fivegrams %>%
    filter(third_next_word == word_search_last, sec_next_word == word_search_last_minus_one, next_word == word_search_last_minus_two, word == word_search_last_minus_three) %>%
    top_n(5, wt = fiveprop) %>% mutate(chance = fiveprop*0.4^(backoff_index-4), origin = "fivegrams") %>%
    ungroup() %>% select(selection = fourth_next_word, chance, origin)}
  if (lengthtext > 3) {fourchance = fourgrams %>%
    filter(sec_next_word == word_search_last, next_word == word_search_last_minus_one, word == word_search_last_minus_two) %>%
    top_n(5, wt = fourprop) %>% mutate(chance = fourprop*0.4^(backoff_index-3), origin = "fourgrams") %>%
    ungroup() %>% select(selection = third_next_word, chance, origin)}
  if (lengthtext > 2) {threechance = trigrams %>%
    filter(next_word == word_search_last, word == word_search_last_minus_one) %>%
    top_n(5, wt = triprop) %>% mutate(chance = triprop*0.4^(backoff_index-2), origin = "trigrams") %>%
    ungroup() %>% select(selection = sec_next_word, chance, origin)}
  if (lengthtext > 1) {bichance = bigrams %>%
    filter(word == word_search_last) %>%
    top_n(5, wt = biprop) %>% mutate(chance = biprop*0.4^(backoff_index-1), origin = "bigrams") %>%
    ungroup() %>% select(selection = next_word, chance, origin)}
  if (lengthtext == 1) {unichance = unigrams %>%
    top_n(5, uniprop) %>% mutate(chance = uniprop*0.4^(backoff_index-1), origin = "unigram") %>%
    ungroup() %>% select(selection = word, chance, origin)}
  chance_final <- rbind(if (exists("fivechance")) {fivechance},
                        if (exists("fourchance")) {fourchance},
                        if (exists("threechance")) {threechance},
                        if (exists("bichance")) {bichance},
                        if (exists("unichance")) {unichance})
  # fall back to the top unigrams if no n-gram table produced a candidate
  # (the previous exists("chance_final") check was always TRUE, so the fallback never ran)
  if (!is.null(chance_final) && nrow(chance_final) > 0) {
    chance_final2 <- chance_final %>% arrange(selection, desc(chance)) %>% distinct(selection, .keep_all = TRUE)
  } else {
    chance_final2 <- unigrams %>% top_n(5, wt = uniprop) %>%
      mutate(chance = uniprop, origin = "unigram") %>% ungroup() %>% select(selection = word, chance, origin)
  }
  return(list(splittext, chance_final2, lengthtext, backoff_index))
}
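# wordpredict returns a list: [[1]] normalized tokens, [[2]] candidate table
# (selection, chance, origin), [[3]] token count, [[4]] backoff index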
# retrieve the pruned dev grams used by wordpredict ----
fivegrams <- fread("lean_five.csv")
fourgrams <- fread("lean_four.csv")
trigrams <- fread("lean_tri.csv")
bigrams <- fread("lean_bi.csv")
unigrams <- fread("lean_uni.csv")
# smoke-test the function on a sample phrase ----
wordpredict("to live and let")[[2]]$selection
# write a function to return T or F and store in the fivegrams
checkmatch <- function(aword, bword, cword, dword, eword){
  # build the four-word context (paste, not paste0, so the words are space-separated)
  results = paste(aword, bword, cword, dword) %>% wordpredict() %>% extract2(2) %>% select(selection)
  checkmatch <- is.element(eword[1], results$selection)
  return(checkmatch)
}
# wordpredict handles one phrase at a time, so apply checkmatch row by row
fivegrams_test = fivegrams_test %>%
  rowwise() %>%
  mutate(result = checkmatch(word, next_word, sec_next_word, third_next_word, fourth_next_word)) %>%
  ungroup()
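# overall hit rate on the pruned test fivegrams, weighted by frequency
# (a minimal sketch of the validation metric, using the columns built above):
# sum(fivegrams_test$result * fivegrams_test$n) / sum(fivegrams_test$n)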