lindeloev/wordle_solver.R

## wordle_solver.R
# Use these functions to make smart guesses for Wordle (https://www.powerlanguage.co.uk/wordle/)
# find_words() returns the words that satisfy the Wordle feedback. Start with `possible_words`, i.e., all 5-letter English words.
# next_word() returns the words that are most likely to result in green letters.
#
# A pretty good strategy is to call next_word() with strategy = "learn" for the first two guesses and strategy = "guess" thereafter, each time picking the first commonly known word from the output.

#############
# FUNCTIONS #
#############

#' Rank candidate words by how likely their letters are to come up green
#'
#' Each word is scored position by position. A letter earns one point for every
#' word in `words` that has the same letter in the same position. With
#' `strategy = "learn"`, a letter additionally earns `0.5 / wordlength` points
#' for every occurrence of that letter in any OTHER position of any word.
#'
#' @param words A character vector of candidate words, all of the same length.
#' @param strategy Which feedback to optimize for:
#'  * `"guess"`: only optimize for green letters.
#'  * `"learn"`: optimize for green AND yellow letters, and only return words
#'    without repeated letters.
#' @param n How many top hits to show (from best to worst)
#' @return A character vector of at most `n` words (best first). Empty if
#'   `words` is empty.
next_word = function(words, strategy = "guess", n = 3) {
  # Fail loudly on an unknown strategy instead of silently returning NULL
  strategy = match.arg(strategy, c("guess", "learn"))

  # Nothing to rank, e.g., when find_words() filtered every candidate away
  if (length(words) == 0) {
    return(character(0))
  }

  # Validate before any per-position work relies on a common word length
  wordlength = unique(nchar(words))
  stopifnot("All words must have the same length" = length(wordlength) == 1)

  # Frequency of every letter over all words and positions (loop-invariant)
  letter_totals = table(unlist(strsplit(words, "")))

  # Weight of one off-position (yellow) occurrence relative to one
  # same-position (green) occurrence.
  yellow_weight = 0.5 / wordlength

  # Score each word as the sum of words with letters in these positions
  scores = numeric(length(words))
  for (i in seq_len(wordlength)) {
    letter_i = substr(words, i, i)
    green_counts = as.vector(table(letter_i)[letter_i])
    scores = scores + green_counts

    # Also weight in yellow characters for "learn" strategy, i.e., correct
    # characters in the wrong position.
    if (strategy == "learn") {
      # Occurrences elsewhere = all occurrences minus those in position i.
      # This is 0 (not NA) for a letter that only ever occurs in position i.
      other_counts = as.vector(letter_totals[letter_i]) - green_counts
      scores = scores + other_counts * yellow_weight
    }
  }

  # Best first; ties keep their input order
  ranked = words[order(-scores)]

  if (strategy == "learn") {
    # Repeated letters waste a tile when the goal is to learn about letters
    only_unique_characters = vapply(
      strsplit(ranked, ""),
      function(word_letters) anyDuplicated(word_letters) == 0L,
      logical(1)
    )
    ranked = ranked[only_unique_characters]
  }

  head(ranked, n)
}


#' Find words that fulfill Wordle criteria
#'
#' All three kinds of feedback are matched case-insensitively against
#' lower-case `words`.
#'
#' @param words Vector of possible words at this step, e.g., `c("goats", "horse")`.
#' @param green Green characters in their correct position, e.g., `"s???e"`.
#'   Write ? where there are no green characters.
#' @param grey Gray characters, e.g., `"car"`. A letter that also occurs in
#'   `green` or `yellows` is ignored here: the word is known to contain it, so
#'   the grey tile only came from guessing that letter too many times.
#' @param yellows Yellow characters in the position where they were guessed
#'   (i.e., where they must NOT be), e.g., `c("???es", "??i??")`. Write ? for
#'   all other positions.
#' @return A character vector with the words that satisfy all criteria
find_words = function(words, green = "?????", grey = "", yellows = c()) {
  green = tolower(green)
  grey = tolower(grey)
  yellows = tolower(yellows)

  # GREEN: Keep words matching green letters in their position
  regex_green = paste0("^", gsub("?", "[a-z]", green, fixed = TRUE), "$")
  words_remaining = words[grepl(regex_green, words)]

  # GREY: Remove words with grey letters, except letters known to be present
  known_letters = unlist(strsplit(c(green, yellows), ""))
  known_letters = known_letters[known_letters != "?"]
  grey_letters = setdiff(unlist(strsplit(grey, "")), known_letters)
  grey_letters = grey_letters[nzchar(grey_letters)]
  for (letter in grey_letters) {
    has_grey = grepl(letter, words_remaining, fixed = TRUE)
    words_remaining = words_remaining[!has_grey]
  }

  # YELLOW
  for (yellow in yellows) {
    letters_i = strsplit(yellow, "")[[1]]

    # Iterate over positions (not letters) so a letter that is yellow in two
    # positions of the same guess is excluded from each position separately.
    for (letter_position in which(letters_i != "?")) {
      letter = letters_i[letter_position]

      # Yellow letter must not be in the entered position
      illegal_words = substr(words_remaining, letter_position, letter_position) == letter

      # Yellow letter must be present
      legal_words = grepl(letter, words_remaining, fixed = TRUE)

      words_remaining = words_remaining[legal_words & !illegal_words]
    }
  }

  words_remaining
}


############
# APPLY IT #
############
# Vector of all English words. The list is downloaded on every run, so this
# part of the script needs network access.
all_words = read.csv("https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt", header = FALSE, col.names = "word")$word
possible_words = all_words[nchar(all_words) == 5]

# Each section below replays one day's game: filter the candidates with the
# feedback received so far, then ask for the next suggestion. Argument names
# are spelled out in full (`yellows = `) instead of relying on partial
# argument matching.

# 2022-01-01: Third guess
words = find_words(possible_words)
next_word(words, "learn")  # I guess CARES

words = find_words(words, green = "????s", grey = "ca", yellows = c("??re?"))
next_word(words, "learn")  # I guess TIERS

words = find_words(words, green = "????s", grey = "cati", yellows = c("??re?", "??er?"))
next_word(words, "guess", n = 100)  # I guess REBUS


# 2022-01-02: Third guess
words = find_words(possible_words)
next_word(words, "learn")  # I guess CARES

words = find_words(words, grey = "care", yellows = c("????s"))
next_word(words, "learn")  # I guess SOILY

words = find_words(words, green = "?o???", grey = "careily", yellows = c("????s", "s????"))
next_word(words, "guess", n = 100)  # I guess BOOST


# 2022-01-03: Fourth guess
words = find_words(possible_words)
next_word(words, "learn")  # I guess CARES

words = find_words(words, green = "????s", grey = "cae", yellows = c("??r??"))
next_word(words, "learn")  # I guess GROTS

words = find_words(words, green = "?r??s", grey = "caego", yellows = c("??r??", "???t?"))
next_word(words, "guess", n = 100)  # I guess TRIMS

words = find_words(words, green = "tr??s", grey = "caegoim", yellows = c("??r??", "???t?"))
next_word(words, "guess", n = 100)  # I guess TRUSS


# 2022-01-04: Third guess
words = find_words(possible_words)
next_word(words, "learn")  # I guess CARES

words = find_words(words, grey = "car", yellows = c("???es"))
next_word(words, "learn")  # I guess STILE

words = find_words(words, green = "s???e", grey = "cartl", yellows = c("???es", "??i??"))
next_word(words, "guess", n = 100)  # I guess SIEGE


# 2022-01-05: Third guess
words = find_words(possible_words)
next_word(words, "learn")  # I guess CARES

words = find_words(words, green = "???e?", grey = "cas", yellows = c("??r??"))
next_word(words, "learn")  # DOTER

words = find_words(words, green = "???er", grey = "casdo", yellows = c("??r??", "??t??"))
next_word(words, "guess", n = 100)  # TIGER


# 2022-01-06: third guess
words = find_words(possible_words)
next_word(words, "learn")  # CARES

words = find_words(words, green = "?a???", grey = "cres")
next_word(words, "learn")  # MANLY

words = find_words(words, green = "?an??", grey = "cresmy", yellows = c("???l?"))
next_word(words, "guess", n = 100)  # BANAL
	# Use these functions to make smart guesses for Wordle (https://www.powerlanguage.co.uk/wordle/)
	# find_words() returns the words that satisfy the Wordle feedback. Start with `possible_words`, i.e., all 5-letter English words.
	# next_word() returns the words that are most likely to result in green letters.
	#
	# A pretty good strategy is to call next_word() with strategy = "learn" for the first two guesses and strategy = "guess" thereafter, each time picking the first commonly known word from the output.

	#############
	# FUNCTIONS #
	#############

	#' Rank candidate words by how likely their letters are to come up green
	#'
	#' Each word is scored position by position. A letter earns one point for every
	#' word in `words` that has the same letter in the same position. With
	#' `strategy = "learn"`, a letter additionally earns `0.5 / wordlength` points
	#' for every occurrence of that letter in any OTHER position of any word.
	#'
	#' @param words A character vector of candidate words, all of the same length.
	#' @param strategy Which feedback to optimize for:
	#'  * `"guess"`: only optimize for green letters.
	#'  * `"learn"`: optimize for green AND yellow letters, and only return words
	#'    without repeated letters.
	#' @param n How many top hits to show (from best to worst)
	#' @return A character vector of at most `n` words (best first). Empty if
	#'   `words` is empty.
	next_word = function(words, strategy = "guess", n = 3) {
	  # Fail loudly on an unknown strategy instead of silently returning NULL
	  strategy = match.arg(strategy, c("guess", "learn"))

	  # Nothing to rank, e.g., when find_words() filtered every candidate away
	  if (length(words) == 0) {
	    return(character(0))
	  }

	  # Validate before any per-position work relies on a common word length
	  wordlength = unique(nchar(words))
	  stopifnot("All words must have the same length" = length(wordlength) == 1)

	  # Frequency of every letter over all words and positions (loop-invariant)
	  letter_totals = table(unlist(strsplit(words, "")))

	  # Weight of one off-position (yellow) occurrence relative to one
	  # same-position (green) occurrence.
	  yellow_weight = 0.5 / wordlength

	  # Score each word as the sum of words with letters in these positions
	  scores = numeric(length(words))
	  for (i in seq_len(wordlength)) {
	    letter_i = substr(words, i, i)
	    green_counts = as.vector(table(letter_i)[letter_i])
	    scores = scores + green_counts

	    # Also weight in yellow characters for "learn" strategy, i.e., correct
	    # characters in the wrong position.
	    if (strategy == "learn") {
	      # Occurrences elsewhere = all occurrences minus those in position i.
	      # This is 0 (not NA) for a letter that only ever occurs in position i.
	      other_counts = as.vector(letter_totals[letter_i]) - green_counts
	      scores = scores + other_counts * yellow_weight
	    }
	  }

	  # Best first; ties keep their input order
	  ranked = words[order(-scores)]

	  if (strategy == "learn") {
	    # Repeated letters waste a tile when the goal is to learn about letters
	    only_unique_characters = vapply(
	      strsplit(ranked, ""),
	      function(word_letters) anyDuplicated(word_letters) == 0L,
	      logical(1)
	    )
	    ranked = ranked[only_unique_characters]
	  }

	  head(ranked, n)
	}


	#' Find words that fulfill Wordle criteria
	#'
	#' All three kinds of feedback are matched case-insensitively against
	#' lower-case `words`.
	#'
	#' @param words Vector of possible words at this step, e.g., `c("goats", "horse")`.
	#' @param green Green characters in their correct position, e.g., `"s???e"`.
	#'   Write ? where there are no green characters.
	#' @param grey Gray characters, e.g., `"car"`. A letter that also occurs in
	#'   `green` or `yellows` is ignored here: the word is known to contain it, so
	#'   the grey tile only came from guessing that letter too many times.
	#' @param yellows Yellow characters in the position where they were guessed
	#'   (i.e., where they must NOT be), e.g., `c("???es", "??i??")`. Write ? for
	#'   all other positions.
	#' @return A character vector with the words that satisfy all criteria
	find_words = function(words, green = "?????", grey = "", yellows = c()) {
	  green = tolower(green)
	  grey = tolower(grey)
	  yellows = tolower(yellows)

	  # GREEN: Keep words matching green letters in their position
	  regex_green = paste0("^", gsub("?", "[a-z]", green, fixed = TRUE), "$")
	  words_remaining = words[grepl(regex_green, words)]

	  # GREY: Remove words with grey letters, except letters known to be present
	  known_letters = unlist(strsplit(c(green, yellows), ""))
	  known_letters = known_letters[known_letters != "?"]
	  grey_letters = setdiff(unlist(strsplit(grey, "")), known_letters)
	  grey_letters = grey_letters[nzchar(grey_letters)]
	  for (letter in grey_letters) {
	    has_grey = grepl(letter, words_remaining, fixed = TRUE)
	    words_remaining = words_remaining[!has_grey]
	  }

	  # YELLOW
	  for (yellow in yellows) {
	    letters_i = strsplit(yellow, "")[[1]]

	    # Iterate over positions (not letters) so a letter that is yellow in two
	    # positions of the same guess is excluded from each position separately.
	    for (letter_position in which(letters_i != "?")) {
	      letter = letters_i[letter_position]

	      # Yellow letter must not be in the entered position
	      illegal_words = substr(words_remaining, letter_position, letter_position) == letter

	      # Yellow letter must be present
	      legal_words = grepl(letter, words_remaining, fixed = TRUE)

	      words_remaining = words_remaining[legal_words & !illegal_words]
	    }
	  }

	  words_remaining
	}



	############
	# APPLY IT #
	############
	# Vector of all English words. The list is downloaded on every run, so this
	# part of the script needs network access.
	all_words = read.csv("https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt", header = FALSE, col.names = "word")$word
	possible_words = all_words[nchar(all_words) == 5]

	# Each section below replays one day's game: filter the candidates with the
	# feedback received so far, then ask for the next suggestion. Argument names
	# are spelled out in full (`yellows = `) instead of relying on partial
	# argument matching.

	# 2022-01-01: Third guess
	words = find_words(possible_words)
	next_word(words, "learn") # I guess CARES

	words = find_words(words, green = "????s", grey = "ca", yellows = c("??re?"))
	next_word(words, "learn") # I guess TIERS

	words = find_words(words, green = "????s", grey = "cati", yellows = c("??re?", "??er?"))
	next_word(words, "guess", n = 100) # I guess REBUS


	# 2022-01-02: Third guess
	words = find_words(possible_words)
	next_word(words, "learn") # I guess CARES

	words = find_words(words, grey = "care", yellows = c("????s"))
	next_word(words, "learn") # I guess SOILY

	words = find_words(words, green = "?o???", grey = "careily", yellows = c("????s", "s????"))
	next_word(words, "guess", n = 100) # I guess BOOST


	# 2022-01-03: Fourth guess
	words = find_words(possible_words)
	next_word(words, "learn") # I guess CARES

	words = find_words(words, green = "????s", grey = "cae", yellows = c("??r??"))
	next_word(words, "learn") # I guess GROTS

	words = find_words(words, green = "?r??s", grey = "caego", yellows = c("??r??", "???t?"))
	next_word(words, "guess", n = 100) # I guess TRIMS

	words = find_words(words, green = "tr??s", grey = "caegoim", yellows = c("??r??", "???t?"))
	next_word(words, "guess", n = 100) # I guess TRUSS


	# 2022-01-04: Third guess
	words = find_words(possible_words)
	next_word(words, "learn") # I guess CARES

	words = find_words(words, grey = "car", yellows = c("???es"))
	next_word(words, "learn") # I guess STILE

	words = find_words(words, green = "s???e", grey = "cartl", yellows = c("???es", "??i??"))
	next_word(words, "guess", n = 100) # I guess SIEGE


	# 2022-01-05: Third guess
	words = find_words(possible_words)
	next_word(words, "learn") # I guess CARES

	words = find_words(words, green = "???e?", grey = "cas", yellows = c("??r??"))
	next_word(words, "learn") # DOTER

	words = find_words(words, green = "???er", grey = "casdo", yellows = c("??r??", "??t??"))
	next_word(words, "guess", n = 100) # TIGER


	# 2022-01-06: third guess
	words = find_words(possible_words)
	next_word(words, "learn") # CARES

	words = find_words(words, green = "?a???", grey = "cres")
	next_word(words, "learn") # MANLY

	words = find_words(words, green = "?an??", grey = "cresmy", yellows = c("???l?"))
	next_word(words, "guess", n = 100) # BANAL