Created
June 24, 2025 19:04
-
-
Save sTeamTraen/a8c2a07fe51ca7e363cde70471652463 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Find words (in a dictionary file) that are made up of other words run together.
#   e.g.: "deliberating" = "deli" + "berating"
#   e.g.: "categorising" = "cat" + "ego" + "rising"
# By Nick Brown (nicholasjlbrown@gmail.com), June 2025.

# Parameters ----
# Fewest / most component words that a compound may be split into.
min.split.words <- 2
max.split.words <- 3
# Shortest / longest component word (in characters) that counts as a part.
shortest.single.word <- 3
longest.single.word <- 8
# Only dictionary words at least this long are tested as compounds.
shortest.compound.word <- 12

# Dictionary ----
# My data file. Plain text, one word per line.
filename <- "corncob_lowercase.txt"
# readLines() takes each line verbatim. read.table() would also apply quote,
# comment-character ("#") and NA-string processing to the words, none of
# which is wanted for a plain word list.
words <- trimws(readLines(filename, warn = FALSE))
words <- words[nzchar(words)]  # drop blank lines
if (shortest.single.word <= 1) {
  words <- c(words, "a", "i")  # these one-letter words aren't in my dictionary
}

# words.bylen[[n]] holds every dictionary word that is exactly n characters
# long, for n from 1 to longest.single.word. Lengths are computed once and
# reused for the filter below.
word.lengths <- nchar(words)
words.bylen <- lapply(
  seq_len(longest.single.word),
  function(len) words[word.lengths == len]
)

# We are only interested in compound words with a certain minimum length.
words <- words[word.lengths >= shortest.compound.word]
# Try to split `word`, from character position `start` to its end, into a
# run of dictionary words.
#
# Arguments:
#   word   The candidate compound word (a single string).
#   level  1-based count of the component currently being matched. Callers
#          normally leave this at 1; the recursion increments it.
#   start  Character position in `word` at which this component must begin.
#
# Returns a list of strings (the components, in order) for the first split
# found, or NULL if there is none. Shorter leading components are tried
# first, so the first split found is the one with the shortest first part.
#
# Reads the globals min.split.words, max.split.words, shortest.single.word,
# longest.single.word and words.bylen.
subwords <- function (word, level=1, start=1) {
  if (level > max.split.words) {
    return(NULL)
  }
  # A missing or zero-length input cannot be split.
  if (length(word) != 1L || is.na(word)) {
    return(NULL)
  }

  word.len <- nchar(word)
  # Longest component that still fits in what is left of the word.
  longest.here <- min(longest.single.word, word.len - start + 1)
  # Guard explicitly: `a:b` with b < a would count downwards rather than
  # yield an empty range.
  if (longest.here < shortest.single.word) {
    return(NULL)
  }

  for (len in seq(shortest.single.word, longest.here)) {
    end <- start + len - 1
    part <- substr(word, start, end)
    if (part %in% words.bylen[[len]]) {
      if (end == word.len) {
        # The word is used up; accept only if enough components were found.
        if (level >= min.split.words) {
          return(list(part))
        }
      }
      else {
        rest <- subwords(word, level + 1, end + 1)
        if (!is.null(rest)) {
          return(c(list(part), rest))
        }
      }
    }
  }
  NULL
}
# Report every candidate word that can be split, one per line, in the form
#   Found <word> (<part1>.<part2>...)
# Iterating over the words themselves (rather than 1:length(words)) means an
# empty candidate list simply prints nothing.
for (word in words) {
  parts <- subwords(word)
  if (!is.null(parts)) {
    cat("Found ", word, " (", paste(unlist(parts), collapse = "."), ")\n", sep = "")
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment