# marceloszilagyi/Predictor.R

## Predictor.R
# load the saved .RData workspace and the libraries
load("20170724.RData")

# this is function to predict only

library("tidyverse")
library("stringr")
library("textclean")
library("lexicon")
library("magrittr")
library(shiny)
library(stringi)


# Server ------ ============================================

server <- function(input, output, session) {

  # Text normalisation -----
  # Prepares raw user text for tokenising:
  #   1. folds back-ticks and curly single quotes into a plain apostrophe,
  #   2. lower-cases the text,
  #   3. replaces number-like tokens with the placeholder "num",
  #   4. applies every pattern/replacement pair of `contrac_repl`
  #      (names are used as regex patterns, values as replacements;
  #      presumably provided by the workspace loaded at the top of the file),
  #   5. drops any remaining 's.
  # Returns the cleaned character vector.
  expand_contraction <- function(text) {
    # The original class listed the ASCII apostrophe three times, which looks
    # like curly quotes that were lost in transit; they are written as escapes
    # here so the source file stays pure ASCII.
    text <- str_replace_all(text, pattern = "[`\u2018\u2019']", replacement = "'")
    text <- tolower(text)
    # the real regex is \b\w*\d,*\.*\w*,*\.*\b
    text <- str_replace_all(
      text,
      pattern = "\\b\\w*\\d,*\\.*\\w*,*\\.*\\b",
      replacement = "num"
    )
    # collapse "num<separator>num" into a single placeholder
    text <- str_replace_all(
      text,
      pattern = "\\b(num)\\W*\\S*(num)\\b",
      replacement = "num"
    )
    # Lower-casing the patterns is loop-invariant, so do it once.
    contraction_patterns <- tolower(names(contrac_repl))
    for (i in seq_along(contrac_repl)) {
      text <- str_replace_all(
        text,
        pattern = contraction_patterns[i],
        replacement = tolower(contrac_repl[i])
      )
    }
    # after the contractions, remove any 's left after a word
    str_replace_all(text, "'s", "")
  }

  # Word prediction -----
  # Suggests next words for `phrase` with a back-off over the n-gram tables
  # (`fivegrams`, `fourgrams`, `trigrams`, `bigrams`, `unigrams`).
  #
  # Returns an unnamed list:
  #   [[1]] the cleaned tokens (unknown words -> "UNKWORD", words listed in
  #         `badwords` -> "badword"),
  #   [[2]] a table with columns selection / chance / origin, one row per
  #         suggested word, best chance first,
  #   [[3]] the number of tokens,
  #   [[4]] the back-off index (number of context words used, at most 4).
  wordpredict <- function(phrase) {
    # Guard against NULL / NA input so the tokeniser always gets one string.
    if (length(phrase) != 1L || is.na(phrase)) {
      phrase <- ""
    }
    # pass by the dictionary and contraction
    phrase <- expand_contraction(phrase)
    splittext <- stri_split_boundaries(
      phrase,
      type = "word",
      tokens_only = TRUE,
      skip_word_none = TRUE,
      simplify = TRUE
    )
    # Flatten the 1 x k matrix and drop blank tokens, so an empty input box
    # counts as zero words of context.
    splittext <- splittext[!is.na(splittext) & nzchar(splittext)]
    # Same order as before: unknown words first, then the bad-word mask.
    splittext[!(splittext %in% dictionary$word)] <- "UNKWORD"
    splittext[splittext %in% badwords$word] <- "badword"

    lengthtext <- length(splittext)
    # pick at maximum four words of context
    backoff_index <- min(lengthtext, 4L)

    # An n-gram of order k + 1 needs only k words of context, so every level
    # is enabled with `>= k`. The former `> k` tests ignored the typed word
    # when a single word was entered and kept each table out of reach until
    # one extra word was typed, although backoff_index already gave that
    # level the full weight 0.4^0.
    #
    # The candidate tables start as NULL locals instead of being probed with
    # exists(), which also searches enclosing environments and could pick up
    # same-named objects from the loaded workspace.
    fivechance <- NULL
    fourchance <- NULL
    threechance <- NULL
    bichance <- NULL

    if (lengthtext >= 1L) {
      word_search_last <- splittext[lengthtext]
      bichance <- bigrams %>%
        filter(word == word_search_last) %>%
        top_n(5, wt = biprop) %>%
        mutate(chance = biprop * 0.4^(backoff_index - 1), origin = "bigrams") %>%
        ungroup() %>%
        select(selection = next_word, chance, origin)
    }

    if (lengthtext >= 2L) {
      word_search_last_minus_one <- splittext[lengthtext - 1L]
      threechance <- trigrams %>%
        filter(
          next_word == word_search_last,
          word == word_search_last_minus_one
        ) %>%
        top_n(5, wt = triprop) %>%
        mutate(chance = triprop * 0.4^(backoff_index - 2), origin = "trigrams") %>%
        ungroup() %>%
        select(selection = sec_next_word, chance, origin)
    }

    if (lengthtext >= 3L) {
      word_search_last_minus_two <- splittext[lengthtext - 2L]
      fourchance <- fourgrams %>%
        filter(
          sec_next_word == word_search_last,
          next_word == word_search_last_minus_one,
          word == word_search_last_minus_two
        ) %>%
        top_n(5, wt = fourprop) %>%
        mutate(chance = fourprop * 0.4^(backoff_index - 3), origin = "fourgrams") %>%
        ungroup() %>%
        select(selection = third_next_word, chance, origin)
    }

    if (lengthtext >= 4L) {
      word_search_last_minus_three <- splittext[lengthtext - 3L]
      fivechance <- fivegrams %>%
        filter(
          third_next_word == word_search_last,
          sec_next_word == word_search_last_minus_one,
          next_word == word_search_last_minus_two,
          word == word_search_last_minus_three
        ) %>%
        top_n(5, wt = fiveprop) %>%
        mutate(chance = fiveprop * 0.4^(backoff_index - 4), origin = "fivegrams") %>%
        ungroup() %>%
        select(selection = fourth_next_word, chance, origin)
    }

    # bind_rows() skips the levels that are still NULL.
    chance_final <- bind_rows(fivechance, fourchance, threechance, bichance)

    # Last resort: the most frequent single words. Used when there is no
    # context at all or when no n-gram matched. (The old fallback sat behind
    # a test that was always TRUE, so it could never run.) The unigram level
    # uses zero context words, hence the exponent backoff_index - 0.
    if (nrow(chance_final) == 0L) {
      chance_final <- unigrams %>%
        top_n(5, wt = uniprop) %>%
        mutate(chance = uniprop * 0.4^backoff_index, origin = "unigram") %>%
        ungroup() %>%
        select(selection = word, chance, origin)
    }

    # Keep the best-scoring row per suggested word. The former `top_n(1)`
    # had no `wt` and therefore ranked by the last column (`origin`, a
    # string) instead of by `chance`.
    chance_final2 <- chance_final %>%
      arrange(desc(chance)) %>%
      distinct(selection, .keep_all = TRUE)

    list(splittext, chance_final2, lengthtext, backoff_index)
  }

  # Reactive wiring -----
  results <- reactive(wordpredict(input$text_input))
  # remember that you need to call it like this: 'results()'
  output$raw <- renderText(input$text_input)
  output$all <- renderPrint(results())
  output$predict <- renderTable(results()[[2]])
  output$converted <- renderText(results()[[1]])


  } #the server

# User interface ---- ================================

# Single-page layout: a text box, an echo of the raw input, the cleaned
# tokens and the table of suggested words.
ui <- basicPage(
  h1("Simple predictor"),
  textInput(inputId = "text_input", label = "Type here"),
  strong("user input"),
  textOutput(outputId = "raw"),
  strong("words converted"),
  textOutput(outputId = "converted"),
  # verbatimTextOutput("all"),
  strong("suggested words"),
  tableOutput(outputId = "predict")
) # the user interface


# Build the app object from the ui and server defined above.
shinyApp(ui = ui, server = server) # this launches your app
	# load the saved .RData workspace and the libraries
	load("20170724.RData")

	# this is function to predict only

	library("tidyverse")
	library("stringr")
	library("textclean")
	library("lexicon")
	library("magrittr")
	library(shiny)
	library(stringi)


	# Server ------ ============================================

	server <- function(input, output, session) {

	  # Text normalisation -----
	  # Prepares raw user text for tokenising:
	  #   1. folds back-ticks and curly single quotes into a plain apostrophe,
	  #   2. lower-cases the text,
	  #   3. replaces number-like tokens with the placeholder "num",
	  #   4. applies every pattern/replacement pair of `contrac_repl`
	  #      (names are used as regex patterns, values as replacements;
	  #      presumably provided by the workspace loaded at the top of the file),
	  #   5. drops any remaining 's.
	  # Returns the cleaned character vector.
	  expand_contraction <- function(text) {
	    # The original class listed the ASCII apostrophe three times, which looks
	    # like curly quotes that were lost in transit; they are written as escapes
	    # here so the source file stays pure ASCII.
	    text <- str_replace_all(text, pattern = "[`\u2018\u2019']", replacement = "'")
	    text <- tolower(text)
	    # The `*` quantifiers had been stripped from the two number patterns in
	    # this copy, so they no longer matched ordinary numbers; restored here.
	    # the real regex is \b\w*\d,*\.*\w*,*\.*\b
	    text <- str_replace_all(
	      text,
	      pattern = "\\b\\w*\\d,*\\.*\\w*,*\\.*\\b",
	      replacement = "num"
	    )
	    # collapse "num<separator>num" into a single placeholder
	    text <- str_replace_all(
	      text,
	      pattern = "\\b(num)\\W*\\S*(num)\\b",
	      replacement = "num"
	    )
	    # Lower-casing the patterns is loop-invariant, so do it once.
	    contraction_patterns <- tolower(names(contrac_repl))
	    for (i in seq_along(contrac_repl)) {
	      text <- str_replace_all(
	        text,
	        pattern = contraction_patterns[i],
	        replacement = tolower(contrac_repl[i])
	      )
	    }
	    # after the contractions, remove any 's left after a word
	    str_replace_all(text, "'s", "")
	  }

	  # Word prediction -----
	  # Suggests next words for `phrase` with a back-off over the n-gram tables
	  # (`fivegrams`, `fourgrams`, `trigrams`, `bigrams`, `unigrams`).
	  #
	  # Returns an unnamed list:
	  #   [[1]] the cleaned tokens (unknown words -> "UNKWORD", words listed in
	  #         `badwords` -> "badword"),
	  #   [[2]] a table with columns selection / chance / origin, one row per
	  #         suggested word, best chance first,
	  #   [[3]] the number of tokens,
	  #   [[4]] the back-off index (number of context words used, at most 4).
	  wordpredict <- function(phrase) {
	    # Guard against NULL / NA input so the tokeniser always gets one string.
	    if (length(phrase) != 1L || is.na(phrase)) {
	      phrase <- ""
	    }
	    # pass by the dictionary and contraction
	    phrase <- expand_contraction(phrase)
	    splittext <- stri_split_boundaries(
	      phrase,
	      type = "word",
	      tokens_only = TRUE,
	      skip_word_none = TRUE,
	      simplify = TRUE
	    )
	    # Flatten the 1 x k matrix and drop blank tokens, so an empty input box
	    # counts as zero words of context.
	    splittext <- splittext[!is.na(splittext) & nzchar(splittext)]
	    # Same order as before: unknown words first, then the bad-word mask.
	    splittext[!(splittext %in% dictionary$word)] <- "UNKWORD"
	    splittext[splittext %in% badwords$word] <- "badword"

	    lengthtext <- length(splittext)
	    # pick at maximum four words of context
	    backoff_index <- min(lengthtext, 4L)

	    # An n-gram of order k + 1 needs only k words of context, so every level
	    # is enabled with `>= k`. The former `> k` tests ignored the typed word
	    # when a single word was entered and kept each table out of reach until
	    # one extra word was typed, although backoff_index already gave that
	    # level the full weight 0.4^0.
	    #
	    # The candidate tables start as NULL locals instead of being probed with
	    # exists(), which also searches enclosing environments and could pick up
	    # same-named objects from the loaded workspace.
	    fivechance <- NULL
	    fourchance <- NULL
	    threechance <- NULL
	    bichance <- NULL

	    if (lengthtext >= 1L) {
	      word_search_last <- splittext[lengthtext]
	      bichance <- bigrams %>%
	        filter(word == word_search_last) %>%
	        top_n(5, wt = biprop) %>%
	        mutate(chance = biprop * 0.4^(backoff_index - 1), origin = "bigrams") %>%
	        ungroup() %>%
	        select(selection = next_word, chance, origin)
	    }

	    if (lengthtext >= 2L) {
	      word_search_last_minus_one <- splittext[lengthtext - 1L]
	      threechance <- trigrams %>%
	        filter(
	          next_word == word_search_last,
	          word == word_search_last_minus_one
	        ) %>%
	        top_n(5, wt = triprop) %>%
	        mutate(chance = triprop * 0.4^(backoff_index - 2), origin = "trigrams") %>%
	        ungroup() %>%
	        select(selection = sec_next_word, chance, origin)
	    }

	    if (lengthtext >= 3L) {
	      word_search_last_minus_two <- splittext[lengthtext - 2L]
	      fourchance <- fourgrams %>%
	        filter(
	          sec_next_word == word_search_last,
	          next_word == word_search_last_minus_one,
	          word == word_search_last_minus_two
	        ) %>%
	        top_n(5, wt = fourprop) %>%
	        mutate(chance = fourprop * 0.4^(backoff_index - 3), origin = "fourgrams") %>%
	        ungroup() %>%
	        select(selection = third_next_word, chance, origin)
	    }

	    if (lengthtext >= 4L) {
	      word_search_last_minus_three <- splittext[lengthtext - 3L]
	      fivechance <- fivegrams %>%
	        filter(
	          third_next_word == word_search_last,
	          sec_next_word == word_search_last_minus_one,
	          next_word == word_search_last_minus_two,
	          word == word_search_last_minus_three
	        ) %>%
	        top_n(5, wt = fiveprop) %>%
	        mutate(chance = fiveprop * 0.4^(backoff_index - 4), origin = "fivegrams") %>%
	        ungroup() %>%
	        select(selection = fourth_next_word, chance, origin)
	    }

	    # bind_rows() skips the levels that are still NULL.
	    chance_final <- bind_rows(fivechance, fourchance, threechance, bichance)

	    # Last resort: the most frequent single words. Used when there is no
	    # context at all or when no n-gram matched. (The old fallback sat behind
	    # a test that was always TRUE, so it could never run.) The unigram level
	    # uses zero context words, hence the exponent backoff_index - 0.
	    if (nrow(chance_final) == 0L) {
	      chance_final <- unigrams %>%
	        top_n(5, wt = uniprop) %>%
	        mutate(chance = uniprop * 0.4^backoff_index, origin = "unigram") %>%
	        ungroup() %>%
	        select(selection = word, chance, origin)
	    }

	    # Keep the best-scoring row per suggested word. The former `top_n(1)`
	    # had no `wt` and therefore ranked by the last column (`origin`, a
	    # string) instead of by `chance`.
	    chance_final2 <- chance_final %>%
	      arrange(desc(chance)) %>%
	      distinct(selection, .keep_all = TRUE)

	    list(splittext, chance_final2, lengthtext, backoff_index)
	  }

	  # Reactive wiring -----
	  results <- reactive(wordpredict(input$text_input))
	  # remember that you need to call it like this: 'results()'
	  output$raw <- renderText(input$text_input)
	  output$all <- renderPrint(results())
	  output$predict <- renderTable(results()[[2]])
	  output$converted <- renderText(results()[[1]])


	} #the server

	# User interface ---- ================================

	# Single-page layout: a text box, an echo of the raw input, the cleaned
	# tokens and the table of suggested words.
	ui <- basicPage(
	  h1("Simple predictor"),
	  textInput(inputId = "text_input", label = "Type here"),
	  strong("user input"),
	  textOutput(outputId = "raw"),
	  strong("words converted"),
	  textOutput(outputId = "converted"),
	  # verbatimTextOutput("all"),
	  strong("suggested words"),
	  tableOutput(outputId = "predict")
	) # the user interface


	# Build the app object from the ui and server defined above.
	shinyApp(ui = ui, server = server) # this launches your app