Skip to content

Instantly share code, notes, and snippets.

@dacr
Last active April 2, 2023 10:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dacr/6292d44b720782353d0d6c2ab0bdf99c to your computer and use it in GitHub Desktop.
Save dacr/6292d44b720782353d0d6c2ab0bdf99c to your computer and use it in GitHub Desktop.
french hunspell from grammalecte / published by https://github.com/dacr/code-examples-manager #5c8a6863-c517-4c5f-bf7e-540b6d479c91/f7360eece359d20f5d10a74c498b31e7294a8413
// summary : french hunspell from grammalecte
// keywords : scala, zio, sttp, nio, words, spell, parse, unzip, zip, files, hunspell, @testable
// publish : gist
// authors : David Crosson
// license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
// id : 5c8a6863-c517-4c5f-bf7e-540b6d479c91
// created-on : 2022-01-23T10:00:00Z
// managed-by : https://github.com/dacr/code-examples-manager
// run-with : scala-cli $file
// ---------------------
//> using scala "3.2.2"
//> using dep "dev.zio::zio:2.0.0"
//> using dep "dev.zio::zio-json:0.3.0-RC10"
//> using dep "com.softwaremill.sttp.client3::zio:3.7.0"
//> using dep "dev.zio::zio-nio:2.0.0"
// ---------------------
import zio.*
import zio.json.*
import zio.nio.*
import zio.nio.file.*
import sttp.client3.*, sttp.client3.basicRequest.*
import sttp.client3.httpclient.zio.HttpClientZioBackend
import zio.nio.charset.Charset
import java.io.ByteArrayInputStream
import java.nio.file.StandardOpenOption.*
import java.util.zip.*
object WordleDic extends ZIOAppDefault {
// Path of the local copy of the dictionaries zip archive; `run` downloads it only when this file is missing.
val destDicoZipFile = Path("dico.zip")
// Base name (without extension) of the zip entries to extract: `run` reads "<key>.aff" and "<key>.dic".
val dicEntryKey = "fr-classique"
/** Downloads the grammalecte hunspell dictionaries archive and writes it to `dest`.
  *
  * The HTTP backend is acquired only for the duration of the download and is always
  * closed afterwards, even when the request or the file write fails.
  *
  * @param dest file receiving the archive bytes (created if missing, truncated otherwise)
  * @return an effect that fails with a `Throwable` when the request, the response or the write fails
  */
def downloadLogic(dest: Path) = {
  val src     = uri"http://grammalecte.net/download/fr/hunspell-french-dictionaries-v7.0.zip"
  val request = basicRequest.get(src).response(asByteArray)
  // acquire / release guarantees that the underlying HTTP client is shut down
  ZIO.acquireReleaseWith(HttpClientZioBackend())(_.close().ignore) { backend =>
    for {
      response <- backend.send(request)
      // sttp reports an unsuccessful response body as a Left(String): turn it into a real exception
      body     <- ZIO
                    .fromEither(response.body)
                    .mapError(message => new java.io.IOException(s"download of $src failed (${response.code}): $message"))
      _        <- Files.writeBytes(dest, Chunk.fromArray(body), TRUNCATE_EXISTING, CREATE)
      _        <- Console.printLine(s"wrote to $dest")
    } yield ()
  }
}
/** Walks the entries of an already opened zip stream and returns the content of the first
  * entry whose name is exactly `entryName`.
  *
  * The stream is consumed up to (and including) the matching entry; it is not closed here.
  *
  * @param zipInputStream zip stream positioned before the first entry to examine
  * @param entryName      exact entry name to look for
  * @return the entry bytes, or `None` when no entry has that name
  */
def extractZipEntryContentAsBytes(zipInputStream: ZipInputStream, entryName: String) = {
  // getNextEntry answers null once the archive is exhausted
  val remainingEntries = Iterator.continually(zipInputStream.getNextEntry).takeWhile(_ != null)
  remainingEntries.collectFirst {
    // the stream is now positioned on the matching entry, so reading it yields that entry's bytes
    case candidate if candidate.getName == entryName => Chunk.fromArray(zipInputStream.readAllBytes())
  }
}
/** Reads the zip archive `src` and extracts the content of the entry named `entryName`.
  *
  * The zip stream is always closed once the lookup is finished, whatever its outcome.
  *
  * @param src       path of the zip archive
  * @param entryName exact name of the entry to extract
  * @return the entry bytes; fails with an `IOException` when the archive cannot be read and with
  *         a `FileNotFoundException` when the archive has no such entry
  */
def zipExtractLogic(src: Path, entryName: String) = {
  // Opens a zip stream over the in-memory archive, searches the entry, then closes the stream
  def searchEntry(zipContent: Chunk[Byte]) =
    ZIO.acquireReleaseWith(
      ZIO.succeed(ZipInputStream(ByteArrayInputStream(zipContent.toArray)))
    )(zipInputStream => ZIO.attempt(zipInputStream.close()).ignore)(zipInputStream =>
      ZIO.attemptBlockingIO(extractZipEntryContentAsBytes(zipInputStream, entryName))
    )

  for {
    zipContent <- Files.readAllBytes(src)
    _          <- Console.printLine(s"searching for $entryName")
    found      <- searchEntry(zipContent)
    // a missing entry is reported with an explicit error instead of an anonymous None
    bytes      <- ZIO
                    .fromOption(found)
                    .orElseFail(new java.io.FileNotFoundException(s"zip entry $entryName not found in $src"))
  } yield bytes
}
/** One line of a hunspell `.dic` file.
  *
  * @param word       the dictionary word (text before the optional `/`)
  * @param flags      the raw flags found after the `/`, when present
  * @param attributes the `key:value` fields that follow the word on the line
  */
final case class HunspellEntry(word: String, flags: Option[String], attributes: Map[String, String]) {
  private val partOfSpeech = attributes.get("po")

  val isDiv       = partOfSpeech.contains("div") // separator entry (po:div)
  // common noun: starts with a lower case letter; headOption keeps an empty word from throwing
  val isCommun    = word.headOption.exists(_.isLower)
  val isCompound  = word.contains("-")
  val isPropre    = partOfSpeech.contains("npr")
  val isFirstName = partOfSpeech.contains("prn")
}

object HunspellEntry {

  /** Parses one `.dic` line shaped as `word[/flags] [key:value ...]`.
    *
    * @param line raw line, surrounding white spaces are ignored
    * @return the parsed entry, or `None` when the line holds no word (blank line, or nothing before `/`)
    */
  def fromLine(line: String): Option[HunspellEntry] = {
    val parts = line.trim().split("""\s+""").toList
    val attributes =
      parts
        .drop(1)
        .map(_.split(":", 2))
        .collect { case Array(key, value) => key -> value } // fields without ':' are ignored
        .toMap
    parts.headOption.getOrElse("").split("/", 2) match {
      case Array(word) if word.nonEmpty        => Some(HunspellEntry(word, None, attributes))
      case Array(word, flags) if word.nonEmpty => Some(HunspellEntry(word, Some(flags), attributes))
      case _                                   => None // no usable word on this line
    }
  }
}
// In-memory hunspell dictionary: the list of entries parsed from the lines of a `.dic` file.
case class Hunspell(entries: List[HunspellEntry])
/** Decodes the `.dic` content as UTF-8 and parses it into a [[Hunspell]] dictionary.
  *
  * The first line must hold the announced number of entries; every following non blank line
  * is parsed with `HunspellEntry.fromLine`.
  *
  * @param dicBytes   raw bytes of the `.dic` file
  * @param affixBytes raw bytes of the `.aff` file (currently unused)
  * @return the parsed dictionary; fails when the bytes are not valid UTF-8 or when the
  *         first line is missing or is not an integer
  */
def parseHunspell(dicBytes: Chunk[Byte], affixBytes: Chunk[Byte]) = {
  val charset = Charset.Standard.utf8
  for {
    content  <- charset.decodeString(dicBytes)
    // linesIterator copes with both "\n" and "\r\n" line endings
    lines     = content.linesIterator.toList
    // toIntOption turns a malformed header into a regular failure instead of a thrown exception
    count    <- ZIO
                  .fromOption(lines.headOption.flatMap(_.trim.toIntOption))
                  .orElseFail(new IllegalArgumentException("hunspell dictionary must start with its entries count"))
    _        <- Console.printLine(s"Expecting to find $count hunspell entries")
    specs     = lines.drop(1).filter(_.trim.nonEmpty)
    entries   = specs.flatMap(HunspellEntry.fromLine)
    _        <- Console.printLine(s"Found ${entries.size} hunspell entries")
    // The announced count is not checked against entries.size as the count input data looks invalid
    hunspell  = Hunspell(entries)
  } yield hunspell
}
/** Prints statistics about the common, non compound words of the dictionary: how many there
  * are, how many per word length, and which ones are the longest.
  *
  * @param hunspell the dictionary to describe
  * @return an effect printing the report on the console
  */
def dumpStats(hunspell: Hunspell) = {
  val selectedWords = hunspell.entries.filter(entry => entry.isCommun && !entry.isCompound)
  val wordsBySize   = selectedWords.groupBy(_.word.size)
  val countBySize   = wordsBySize.view.mapValues(_.size).toMap
  // maxOption: no exception when no word has been selected (empty dictionary for instance)
  val longestWords  = wordsBySize.keys.maxOption.flatMap(wordsBySize.get).getOrElse(Nil).map(_.word)
  for {
    _ <- Console.printLine(s"For common & not compound words")
    _ <- Console.printLine(s" Found ${selectedWords.size} words")
    _ <- Console.printLine(s" Number of common & not compound words By size")
    _ <- Console.printLine(countBySize.toList.sorted.mkString(" ", "\n ", "\n"))
    _ <- Console.printLine(s" Longest french words")
    _ <- Console.printLine(longestWords.sorted.mkString(" ", "\n ", "\n"))
    _ <- Console.printLine(s"Found ${hunspell.entries.size} words in the dictionary")
  } yield ()
}
/** Searches the common, non compound words matching a wordle-like pattern.
  *
  * Words are compared once normalized: accents removed, upper case.
  *
  * @param hunspell        the dictionary to search in
  * @param pattern         expected word, `_` standing for any single letter; used as a regular
  *                        expression once `_` has been replaced by `.`
  * @param excludedLetters letters that must not appear in a candidate
  * @param includedLetters letters that must all appear in a candidate
  * @return the distinct normalized candidates, in dictionary order
  */
def naiveSearch(hunspell: Hunspell, pattern: String, excludedLetters: String = "", includedLetters: String = ""): List[String] = {
  // accented letter -> plain letter, built once instead of running six regex replacements per word
  val unaccented: Map[Char, Char] =
    (for {
      (accented, plain) <- List("áàäâ" -> 'a', "éèëê" -> 'e', "íìïî" -> 'i', "óòöô" -> 'o', "úùüû" -> 'u', "ç" -> 'c')
      letter            <- accented
    } yield letter -> plain).toMap

  // Locale.ROOT keeps the case conversions independent from the JVM default locale (e.g. Turkish 'i')
  def normalize(word: String): String =
    word
      .toLowerCase(java.util.Locale.ROOT)
      .map(letter => unaccented.getOrElse(letter, letter))
      .toUpperCase(java.util.Locale.ROOT)

  val wordRE = pattern.replaceAll("_", ".").r
  hunspell.entries.view
    .filterNot(_.isCompound)
    .filter(_.isCommun)
    .map(_.word)
    .filter(_.size == pattern.size)
    .map(normalize)
    .filter(wordRE.matches)
    .filterNot(_.exists(excludedLetters.contains))
    .filter(word => includedLetters.forall(word.contains))
    .toList
    .distinct // several entries share the same normalized form (homographs, accent variants)
}
/** Application logic: downloads the dictionaries archive when it is not available locally,
  * parses the selected dictionary, prints its statistics, then prints the candidate words.
  *
  * Command line arguments, all optional:
  *   1. the pattern to solve, `_` standing for an unknown letter (default `S_______`)
  *   2. the letters a candidate must contain
  *   3. the letters a candidate must not contain
  */
override def run = for {
  // the file system is queried through an effect rather than while building the effect
  alreadyDownloaded <- Files.exists(destDicoZipFile)
  _                 <- if (alreadyDownloaded) ZIO.unit else downloadLogic(destDicoZipFile)
  affBytes          <- zipExtractLogic(destDicoZipFile, s"$dicEntryKey.aff")
  dicBytes          <- zipExtractLogic(destDicoZipFile, s"$dicEntryKey.dic")
  hunspell          <- parseHunspell(dicBytes, affBytes)
  _                 <- dumpStats(hunspell)
  args              <- getArgs
  // an empty pattern argument falls back to the default one instead of crashing on `head`
  toSolve            = args.headOption.filter(_.nonEmpty).map(_.toUpperCase(java.util.Locale.ROOT)).getOrElse("S_______")
  includedLetters    = args.drop(1).headOption.map(_.toUpperCase(java.util.Locale.ROOT)).getOrElse("")
  excludedLetters    = args.drop(2).headOption.map(_.toUpperCase(java.util.Locale.ROOT)).getOrElse("")
  // same pattern with only its first letter kept: gives the size of the whole search space
  solveOri           = toSolve.take(1) + "_" * (toSolve.length - 1)
  possibleSolutions  = naiveSearch(hunspell, solveOri)
  candidates         = naiveSearch(hunspell, toSolve, excludedLetters, includedLetters)
  _                 <- Console.printLine(s"Possible solution count ${possibleSolutions.size} for $solveOri")
  _                 <- Console.printLine(s"Candidate solutions for $toSolve while including '$includedLetters' and excluding '$excludedLetters'")
  _                 <- Console.printLine(candidates.mkString(" "))
} yield ()
}
// Script entry point: forward the script arguments so that the pattern / included / excluded
// letters read through `getArgs` are really taken into account (no argument => default values).
WordleDic.main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment