Skip to content

Instantly share code, notes, and snippets.

@MichaelChirico
Last active January 6, 2017 22:00
Show Gist options
  • Save MichaelChirico/dfca231640616b5e617a5b39265b4d09 to your computer and use it in GitHub Desktop.
Save MichaelChirico/dfca231640616b5e617a5b39265b4d09 to your computer and use it in GitHub Desktop.
library(data.table)
# Load the system word list in lower case. Lower-casing turns entries that
# differed only by capitalisation into identical strings, so the repeats are
# dropped here; otherwise every lookup would list such words more than once.
dict.orig = unique(tolower(readLines("/usr/share/dict/american-english")))
# One column per character position. Words shorter than the longest one are
# padded with "" so every row has the same number of columns, which keeps the
# per-column conversion and the later re-assembly simple.
dictDT = setDT(tstrsplit(dict.orig, split = "", fill = ""))
# Keypad conversion table: one row per letter a-z, giving the digit of the
# key that letter is printed on. Keys 2-9 hold three letters each, except
# 7 and 9, which hold four. The digit is stored as character so that it can
# directly replace a letter in a character column.
lookup = data.table(
  num = as.character(rep(2:9, times = c(3L, 3L, 3L, 3L, 3L, 4L, 3L, 4L))),
  let = letters
)
# Only a-z are covered. The following values are also found in the
# dictionary and would need to be handled separately (accented letters
# should just be appended to the matches for the unaccented version):
#   c("", "'", "á", "â", "å", "ä",
#     "ç", "é", "è", "ê", "í", "ñ",
#     "ó", "ô", "ö", "û", "ü")
# Convert dictDT in place, one character position (column) at a time:
# every value that appears in lookup$let is overwritten with the matching
# lookup$num; values with no entry in lookup are left exactly as they are.
for (col in names(dictDT)) {
  # local() keeps the temporaries out of the calling environment; set()
  # still updates dictDT itself because it works by reference.
  local({
    pos = match(dictDT[[col]], lookup[["let"]])
    hit = which(!is.na(pos))
    if (length(hit) > 0L) {
      set(dictDT, i = hit, j = col, value = lookup[["num"]][pos[hit]])
    }
  })
}
# Glue the per-position columns back together: one code string per word,
# index-aligned with dict.orig.
dict.num = do.call(what = "paste0", args = as.list(dictDT))
# Reorder both vectors by code, using the same permutation so they stay
# aligned. NOTE(review): this places words that share a code next to each
# other, but a plain vectorised comparison against dict.num scans the whole
# vector regardless of its order.
idx = order(dict.num)
dict.orig = dict.orig[idx]
dict.num = dict.num[idx]
# Words that can be typed with a given key sequence.
#
# Uses the file-level vectors dict.num (digit code of every word) and
# dict.orig (the words themselves), which are index-aligned.
#
# input:   digit sequence to look up, e.g. "43556". If several codes are
#          supplied, words matching any of them are returned.
# returns: character vector of the matching words, in dict.orig order;
#          character(0) when nothing matches, including for an NA input
#          (assuming dict.num itself holds no NA).
possibilities = function(input) {
  # %in% never yields NA and never recycles. With `==`, an NA input selected
  # an all-NA vector as long as the dictionary, and an input of length > 1
  # was silently recycled against dict.num.
  dict.orig[dict.num %in% input]
}
# possibilities() is applied to one code at a time; lapply runs it over
# several codes and returns a list with one vector of candidate words per
# code, in the order given.
lapply(
  X = c(
    "43556", "469", "47", "48", "46464", "3637",
    "8447", "22882559", "9675", "67", "9428"
  ),
  FUN = possibilities
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment