Skip to content

Instantly share code, notes, and snippets.

@marceloszilagyi
Created September 9, 2017 01:02
Show Gist options
  • Save marceloszilagyi/19b7238f745de08f4676536e5951c7fb to your computer and use it in GitHub Desktop.
Save marceloszilagyi/19b7238f745de08f4676536e5951c7fb to your computer and use it in GitHub Desktop.
# Load the saved workspace (.RData) and libraries
load("20170724.RData")
# This script is for prediction only
library("tidyverse")
library("stringr")
library("textclean")
library("lexicon")
library("magrittr")
library(shiny)
library(stringi)
# Server ------ ============================================
server <- function(input, output, session) {
# Function to expand contractions and convert numbers to "num" -----
# Normalises raw text before it is tokenised.
#
# Steps, in order:
#   1. map backticks and typographic single quotes to a plain "'";
#   2. lower-case the text;
#   3. replace every token containing a digit with "num", then collapse a
#      "num <separator> num" run into a single "num";
#   4. apply every replacement in `contrac_repl` (names are used as regex
#      patterns, values as replacements, both lower-cased);
#   5. drop any "'s" that still ends a word.
#
# text: character vector to clean.
# Returns a character vector of the same length as `text`.
#
# NOTE(review): `contrac_repl` is not defined in this file; presumably it is a
# named character vector restored by load() at the top of the script - confirm.
expand_contraction <- function(text) {
  text <- text %>%
    # \u2018 / \u2019 are the curly single quotes, so typographic apostrophes
    # are normalised the same way as backticks
    str_replace_all(pattern = "[`\u2018\u2019']", replacement = "'") %>%
    tolower() %>%
    # the real regex is \b\w*\d,*\.*\w*,*\.*\b
    str_replace_all(
      pattern = "\\b\\w*\\d,*\\.*\\w*,*\\.*\\b",
      replacement = "num"
    ) %>%
    str_replace_all(
      pattern = "\\b(num)\\W*\\S*(num)\\b",
      replacement = "num"
    )

  # lower-case the lookup once instead of once per iteration
  contraction_patterns <- tolower(names(contrac_repl))
  contraction_values <- unname(tolower(contrac_repl))
  for (i in seq_along(contraction_patterns)) {
    text <- str_replace_all(
      text,
      pattern = contraction_patterns[i],
      replacement = contraction_values[i]
    )
  }

  # after the contractions, strip any 's left at the end of a word; the \\b
  # keeps an opening quote followed by an s-word (e.g. 'so) intact
  str_replace_all(text, "'s\\b", "")
}
# word predict function ------
# Suggests next words for `phrase` by looking the last typed words up in the
# n-gram tables, from five-grams down to bigrams.
#
# phrase: the text typed so far (a single string).
#
# Returns an unnamed list:
#   [[1]] the cleaned tokens (words missing from `dictionary$word` become
#         "UNKWORD", words listed in `badwords$word` become "badword");
#   [[2]] a table with columns `selection`, `chance`, `origin`, one row per
#         suggested word, sorted by descending `chance`;
#   [[3]] the number of tokens;
#   [[4]] the number of context words used for the lookup (at most 4).
#
# A model that uses k context words is weighted by 0.4^(context words
# available - k), so the highest-order model that can be consulted has weight
# 1. When there is no context, or no n-gram matches, the top unigrams are
# returned instead.
#
# NOTE(review): `dictionary`, `badwords`, `unigrams`, `bigrams`, `trigrams`,
# `fourgrams` and `fivegrams` are not defined in this file; presumably they are
# restored by load() at the top of the script - confirm.
wordpredict <- function(phrase) {
  # pass by the dictionary and contraction
  phrase <- expand_contraction(phrase)
  tokens <- as.vector(
    stri_split_boundaries(
      phrase,
      type = "word",
      tokens_only = TRUE,
      skip_word_none = TRUE,
      simplify = TRUE
    )
  )
  # empty input must not turn into a spurious "UNKWORD" context word
  tokens <- tokens[!is.na(tokens) & nzchar(tokens)]
  tokens[!tokens %in% dictionary$word] <- "UNKWORD"
  tokens[tokens %in% badwords$word] <- "badword"

  lengthtext <- length(tokens)
  # pick at maximum four words of context
  backoff_index <- min(lengthtext, 4L)
  backoff_discount <- 0.4

  # context words, most recent first; entries beyond backoff_index are NA and
  # are only read inside the matching guard below
  context <- rev(tail(tokens, backoff_index))
  ctx_1 <- context[1]
  ctx_2 <- context[2]
  ctx_3 <- context[3]
  ctx_4 <- context[4]

  # a local list (instead of exists() checks) so that same-named objects in an
  # enclosing environment can never leak into the result
  candidates <- list()

  if (backoff_index >= 4L) {
    candidates[["fivegrams"]] <- fivegrams %>%
      filter(
        third_next_word == ctx_1,
        sec_next_word == ctx_2,
        next_word == ctx_3,
        word == ctx_4
      ) %>%
      top_n(5, wt = fiveprop) %>%
      mutate(
        chance = fiveprop * backoff_discount^(backoff_index - 4),
        origin = "fivegrams"
      ) %>%
      ungroup() %>%
      select(selection = fourth_next_word, chance, origin)
  }
  if (backoff_index >= 3L) {
    candidates[["fourgrams"]] <- fourgrams %>%
      filter(
        sec_next_word == ctx_1,
        next_word == ctx_2,
        word == ctx_3
      ) %>%
      top_n(5, wt = fourprop) %>%
      mutate(
        chance = fourprop * backoff_discount^(backoff_index - 3),
        origin = "fourgrams"
      ) %>%
      ungroup() %>%
      select(selection = third_next_word, chance, origin)
  }
  if (backoff_index >= 2L) {
    candidates[["trigrams"]] <- trigrams %>%
      filter(next_word == ctx_1, word == ctx_2) %>%
      top_n(5, wt = triprop) %>%
      mutate(
        chance = triprop * backoff_discount^(backoff_index - 2),
        origin = "trigrams"
      ) %>%
      ungroup() %>%
      select(selection = sec_next_word, chance, origin)
  }
  if (backoff_index >= 1L) {
    candidates[["bigrams"]] <- bigrams %>%
      filter(word == ctx_1) %>%
      top_n(5, wt = biprop) %>%
      mutate(
        chance = biprop * backoff_discount^(backoff_index - 1),
        origin = "bigrams"
      ) %>%
      ungroup() %>%
      select(selection = next_word, chance, origin)
  }

  chance_final <- bind_rows(candidates)

  # nothing typed yet, or no n-gram matched: fall back to the most frequent
  # words, in the same three-column shape as the other candidates
  if (nrow(chance_final) == 0L) {
    chance_final <- unigrams %>%
      top_n(5, wt = uniprop) %>%
      mutate(
        chance = uniprop * backoff_discount^backoff_index,
        origin = "unigram"
      ) %>%
      ungroup() %>%
      select(selection = word, chance, origin)
  }

  # keep, for every suggested word, only its highest-chance row
  chance_final2 <- chance_final %>%
    arrange(desc(chance)) %>%
    distinct(selection, .keep_all = TRUE)

  list(tokens, chance_final2, lengthtext, backoff_index)
}
# Reactive wiring: the prediction is computed from the current text input.
# Remember that `results` is a reactive expression, so it must be called
# like this: results()
results <- reactive({
  wordpredict(input$text_input)
})

# echo of the raw user input
output$raw <- renderText({
  input$text_input
})
# first element of the prediction: the converted words
output$converted <- renderText({
  results()[[1]]
})
# second element of the prediction: the suggestion table
output$predict <- renderTable({
  results()[[2]]
})
# full dump of the prediction list
output$all <- renderPrint({
  results()
})
} #the server
# User interface ---- ================================
# One page: a text box, then the raw input, the converted words and the table
# of suggested words, each under a bold caption.
ui <- basicPage(
  h1("Simple predictor"),
  textInput(inputId = "text_input", label = "Type here"),
  strong("user input"),
  textOutput(outputId = "raw"),
  strong("words converted"),
  textOutput(outputId = "converted"),
  # verbatimTextOutput("all"),
  strong("suggested words"),
  tableOutput(outputId = "predict")
) # the user interface

# this launches your app
shinyApp(ui, server)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment