Skip to content

Instantly share code, notes, and snippets.

@sTeamTraen
Created June 24, 2025 19:04
Show Gist options
  • Select an option

  • Save sTeamTraen/a8c2a07fe51ca7e363cde70471652463 to your computer and use it in GitHub Desktop.

Select an option

Save sTeamTraen/a8c2a07fe51ca7e363cde70471652463 to your computer and use it in GitHub Desktop.
# Find words (in a dictionary file) that are made up of other words run together.
# e.g.: "deliberating" = "deli" + "berating"
# e.g.: "categorising" = "cat" + "ego" + "rising"
# By Nick Brown (nicholasjlbrown@gmail.com), June 2025.
# Tunable parameters ----
# A split must consist of between min.split.words and max.split.words pieces.
min.split.words <- 2
max.split.words <- 3
# Every piece must be a dictionary word whose length lies in this range.
shortest.single.word <- 3
longest.single.word <- 8
# Only dictionary words at least this long are tested as compound candidates.
shortest.compound.word <- 12

# My data file. Plain text, one word per line.
filename <- "corncob_lowercase.txt"
# readLines() takes every line verbatim. read.table() with its defaults would
# treat apostrophes as quote characters, "#" as the start of a comment and the
# literal string "NA" as a missing value, all of which can corrupt a word list.
words <- trimws(readLines(filename, warn = FALSE))
words <- words[nzchar(words)] # drop blank lines
if (shortest.single.word <= 1) {
  words <- c(words, "a", "i") # these one-letter words aren't in my dictionary
}

# Index the dictionary by length: words.bylen[[n]] holds every n-letter word,
# for n from 1 to longest.single.word.
word.lengths <- nchar(words)
words.bylen <- lapply(
  seq_len(longest.single.word),
  function(len) words[word.lengths == len]
)

# We are only interested in compound words with a certain minimum length.
words <- words[word.lengths >= shortest.compound.word]
# Try to split `word`, from character position `start` onwards, into a run of
# dictionary words.
#
# Arguments:
#   word  - a single character string to split.
#   level - 1-based index of the piece currently being looked for; callers
#           normally leave this at its default.
#   start - 1-based character position where the current piece begins.
#
# Returns a list of character strings (the pieces, in order) for the first
# split found, trying shorter pieces before longer ones, or NULL if no split
# exists. A split is valid when every piece is in words.bylen, pieces are
# between shortest.single.word and longest.single.word characters long, and
# the total number of pieces lies in min.split.words..max.split.words.
#
# Reads the globals min.split.words, max.split.words, shortest.single.word,
# longest.single.word and words.bylen.
subwords <- function(word, level = 1, start = 1) {
  # Already used up the allowed number of pieces: abandon this branch.
  if (level > max.split.words) {
    return(NULL)
  }
  # A missing or non-scalar word has no split.
  if (length(word) != 1L || is.na(word)) {
    return(NULL)
  }

  word.len <- nchar(word)
  # Longest piece that can start here without running past the end of the word.
  max.len <- min(longest.single.word, word.len - start + 1)
  # Guard explicitly: a:b counts DOWN when b < a, so the loop below must not
  # run if too few characters remain for even the shortest allowed piece.
  if (max.len < shortest.single.word) {
    return(NULL)
  }

  for (len in shortest.single.word:max.len) {
    end <- start + len - 1
    part <- substr(word, start, end)
    if (!(part %in% words.bylen[[len]])) {
      next
    }
    if (end == word.len) {
      # This piece finishes the word; accept only if there are enough pieces.
      if (level >= min.split.words) {
        return(list(part))
      }
    } else {
      # Recurse on the remainder; a NULL result means this piece leads to a
      # dead end, so fall through and try a longer piece.
      rest <- subwords(word, level + 1, end + 1)
      if (!is.null(rest)) {
        return(c(list(part), rest))
      }
    }
  }
  NULL
}
# Report every candidate word that can be split into dictionary words, in the
# form "Found <word> (<piece>.<piece>...)", one per line.
# Iterating over the words directly (rather than 1:length(words)) means an
# empty candidate list produces no output instead of passing NA to subwords().
for (word in words) {
  parts <- subwords(word)
  if (!is.null(parts)) {
    cat(
      "Found ", word, " (", paste(unlist(parts), collapse = "."), ")\n",
      sep = ""
    )
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment