dacr/grammalecte-hunspell.sc

## grammalecte-hunspell.sc
// summary : french hunspell from grammalecte
// keywords : scala, zio, sttp, nio, words, spell, parse, unzip, zip, files, hunspell, @testable
// publish : gist
// authors : David Crosson
// license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
// id : 5c8a6863-c517-4c5f-bf7e-540b6d479c91
// created-on : 2022-01-23T10:00:00Z
// managed-by : https://github.com/dacr/code-examples-manager
// run-with : scala-cli $file

// ---------------------
//> using scala "3.4.2"
//> using dep "dev.zio::zio:2.0.0"
//> using dep "dev.zio::zio-json:0.3.0-RC10"
//> using dep "com.softwaremill.sttp.client3::zio:3.7.0"
//> using dep "dev.zio::zio-nio:2.0.0"
// ---------------------

import zio.*
import zio.json.*
import zio.nio.*
import zio.nio.file.*
import sttp.client3.*, sttp.client3.basicRequest.*
import sttp.client3.httpclient.zio.HttpClientZioBackend
import zio.nio.charset.Charset

import java.io.ByteArrayInputStream
import java.nio.file.StandardOpenOption.*
import java.util.zip.*

/** Small ZIO application that loads the Grammalecte french hunspell dictionary and searches it
  * for words matching a wordle-like pattern.
  *
  * Optional command line arguments, as read by `run`:
  *   1. the pattern to solve, `_` standing for any letter (defaults to `S_______`)
  *   1. the letters a candidate must contain
  *   1. the letters a candidate must not contain
  */
object WordleDic extends ZIOAppDefault {

  /** Local file where the downloaded dictionaries archive is stored. */
  val destDicoZipFile = Path("dico.zip")

  /** Base name of the archive entries to load (`<key>.aff` and `<key>.dic`). */
  val dicEntryKey     = "fr-classique"

  /** Downloads the dictionaries archive and writes it to `dest`, replacing any previous content.
    *
    * The HTTP backend is released whatever the outcome, and an unsuccessful HTTP response is
    * reported as an `IOException` carrying the response body text.
    *
    * @param dest file which receives the downloaded bytes
    */
  def downloadLogic(dest: Path) = {
    val src     = uri"http://grammalecte.net/download/fr/hunspell-french-dictionaries-v7.0.zip"
    val request = basicRequest.get(src)
    // acquire / release / use : the backend is always closed, even on failure or interruption
    ZIO.acquireReleaseWith(HttpClientZioBackend())(_.close().ignore) { backend =>
      for {
        response <- backend.send(request.response(asByteArray))
        body     <- ZIO.fromEither(response.body).mapError(msg => java.io.IOException(s"Download of $src failed : $msg"))
        _        <- Files.writeBytes(dest, Chunk.fromArray(body), TRUNCATE_EXISTING, CREATE)
        _        <- Console.printLine(s"wrote to $dest")
      } yield ()
    }
  }

  /** Walks through the entries of `zipInputStream` until one is named `entryName` and reads it.
    *
    * The stream is consumed up to (and including) the matching entry; it is not closed here.
    *
    * @param zipInputStream zip stream positioned before the entries to inspect
    * @param entryName      exact name of the entry to extract
    * @return the entry content, or `None` when no entry has that name
    */
  def extractZipEntryContentAsBytes(zipInputStream: ZipInputStream, entryName: String) = {
    Iterator
      .continually(zipInputStream.getNextEntry) // getNextEntry returns null once the archive is exhausted
      .takeWhile(_ != null)
      .find(_.getName == entryName)
      .map(_ => Chunk.fromArray(zipInputStream.readAllBytes()))
  }

  /** Reads the `src` zip file and extracts the content of the entry named `entryName`.
    *
    * Fails with a `FileNotFoundException` when the archive has no such entry.
    *
    * @param src       path of the zip file
    * @param entryName exact name of the entry to extract
    */
  def zipExtractLogic(src: Path, entryName: String) = {
    for {
      zipContent <- Files.readAllBytes(src)
      _          <- Console.printLine(s"searching for $entryName")
      found      <- ZIO.attemptBlockingIO {
                      val zipInputStream = ZipInputStream(ByteArrayInputStream(zipContent.toArray))
                      // always close the stream, whatever the extraction outcome
                      try extractZipEntryContentAsBytes(zipInputStream, entryName)
                      finally zipInputStream.close()
                    }
      bytes      <- ZIO.fromOption(found).orElseFail(java.io.FileNotFoundException(s"$entryName not found in $src"))
    } yield bytes
  }

  /** One line of a hunspell `.dic` file.
    *
    * @param word       the word itself (text before the first `/`)
    * @param flags      the text following the first `/`, when present
    * @param attributes the `key:value` fields found after the word
    */
  case class HunspellEntry(word: String, flags: Option[String], attributes: Map[String, String]) {
    private val po  = attributes.get("po")
    val isDiv       = po.contains("div")                // Separator
    val isCommun    = word.headOption.exists(_.isLower) // Common noun : starts with a lowercase letter (false for an empty word)
    val isCompound  = word.contains("-")
    val isPropre    = po.contains("npr")
    val isFirstName = po.contains("prn")
  }

  object HunspellEntry {

    /** Parses one dictionary line made of `word[/flags]` followed by whitespace separated
      * `key:value` fields; fields without `:` are ignored.
      *
      * @return the parsed entry, or `None` for a blank line or a line without any word
      */
    def fromLine(line: String): Option[HunspellEntry] = {
      line.trim.split("""\s+""").toList match {
        case head :: fields if head.nonEmpty =>
          val attributes =
            fields
              .map(_.split(":", 2))
              .collect { case Array(key, value) => key -> value }
              .toMap
          head.split("/", 2) match {
            case Array("", _)       => None // flags without any word
            case Array(word, flags) => Some(HunspellEntry(word, Some(flags), attributes))
            case _                  => Some(HunspellEntry(head, None, attributes))
          }
        case _                               => None
      }
    }
  }

  /** The parsed dictionary content. */
  case class Hunspell(entries: List[HunspellEntry])

  /** Decodes `dicBytes` as UTF-8 text and parses it: the first line is expected to be the
    * announced number of entries, each following line is parsed with `HunspellEntry.fromLine`.
    *
    * Fails with an `IllegalArgumentException` when the first line is missing or is not a number.
    *
    * @param dicBytes   raw content of the `.dic` file
    * @param affixBytes raw content of the `.aff` file (currently unused)
    */
  def parseHunspell(dicBytes: Chunk[Byte], affixBytes: Chunk[Byte]) = {
    val charset = Charset.Standard.utf8
    for {
      content <- charset.decodeString(dicBytes)
      lines    = content.linesIterator.toList // copes with both \n and \r\n line endings
      count   <- ZIO
                   .fromOption(lines.headOption.flatMap(_.trim.toIntOption))
                   .orElseFail(IllegalArgumentException("Dictionary first line must be the number of entries"))
      _       <- Console.printLine(s"Expecting to find $count hunspell entries")
      entries  = lines.drop(1).flatMap(HunspellEntry.fromLine)
      _       <- Console.printLine(s"Found ${entries.size} hunspell entries")
      // hunspell <- ZIO.cond(entries.size == count, Hunspell(entries), Error("Didn't find the right number of words in dictionary"))
      hunspell = Hunspell(entries) // No check as count input data looks invalid :(
    } yield hunspell
  }

  /** Prints statistics about the common, non compound words: how many there are, how many for
    * each word size, and the longest ones; then the total number of dictionary entries.
    */
  def dumpStats(hunspell: Hunspell) = {
    val selectedWords = hunspell.entries.filter(entry => entry.isCommun && !entry.isCompound)
    val wordsBySize   = selectedWords.groupMap(_.word.size)(_.word)
    val countBySize   = wordsBySize.view.mapValues(_.size).toList.sorted
    // maxOption : no failure when not a single word has been selected
    val longestWords  = wordsBySize.keys.maxOption.flatMap(wordsBySize.get).getOrElse(Nil)
    for {
      _ <- Console.printLine(s"For common & not compound words")
      _ <- Console.printLine(s"  Found ${selectedWords.size} words")
      _ <- Console.printLine(s"  Number of common & not compound words By size")
      _ <- Console.printLine(countBySize.mkString("    ", "\n    ", "\n"))
      _ <- Console.printLine(s"  Longest french words")
      _ <- Console.printLine(longestWords.sorted.mkString("    ", "\n    ", "\n"))
      _ <- Console.printLine(s"Found ${hunspell.entries.size} words in the dictionary")
    } yield ()
  }

  /** Searches the common, non compound words having the same size as `pattern` and matching it.
    *
    * Words are compared once normalized: listed accented letters replaced by their base letter,
    * then upper-cased. Each candidate is returned only once.
    *
    * @param hunspell        the dictionary to search in
    * @param pattern         upper-case pattern used as a regular expression, `_` standing for any character
    * @param excludedLetters letters a candidate must not contain
    * @param includedLetters letters a candidate must contain
    * @return the normalized matching words, in dictionary order
    */
  def naiveSearch(hunspell: Hunspell, pattern: String, excludedLetters: String = "", includedLetters: String = ""): List[String] = {
    // Locale.ROOT keeps the case mapping independent from the JVM default locale
    def normalize(word: String): String =
      word
        .toLowerCase(java.util.Locale.ROOT)
        .replaceAll("[áàäâ]", "a")
        .replaceAll("[éèëê]", "e")
        .replaceAll("[íìïî]", "i")
        .replaceAll("[óòöô]", "o")
        .replaceAll("[úùüû]", "u")
        .replaceAll("[ç]", "c")
        .toUpperCase(java.util.Locale.ROOT)

    val wordRE = pattern.replaceAll("_", ".").r
    hunspell.entries.view
      .filter(entry => entry.isCommun && !entry.isCompound)
      .map(_.word)
      .filter(_.size == pattern.size)
      .map(normalize)
      .filter(wordRE.matches)
      .filterNot(_.exists(excludedLetters.contains))
      .filter(word => includedLetters.forall(word.contains))
      .toList
      .distinct // several entries may share the same normalized form
  }

  /** Downloads the dictionary archive when it is not yet available locally, parses it, prints
    * some statistics and then the candidate words for the pattern given on the command line.
    */
  override def run = for {
    _               <- ZIO.unlessZIO(Files.exists(destDicoZipFile))(downloadLogic(destDicoZipFile))
    affBytes        <- zipExtractLogic(destDicoZipFile, s"$dicEntryKey.aff")
    dicBytes        <- zipExtractLogic(destDicoZipFile, s"$dicEntryKey.dic")
    hunspell        <- parseHunspell(dicBytes, affBytes)
    _               <- dumpStats(hunspell)
    givenArgs       <- getArgs
    toSolve          = givenArgs.headOption.map(_.toUpperCase(java.util.Locale.ROOT)).getOrElse("S_______")
    includedLetters  = givenArgs.drop(1).headOption.map(_.toUpperCase(java.util.Locale.ROOT)).getOrElse("")
    excludedLetters  = givenArgs.drop(2).headOption.map(_.toUpperCase(java.util.Locale.ROOT)).getOrElse("")
    // same first letter, everything else unknown; safe with an empty pattern
    solveOri         = toSolve.take(1) + "_" * (toSolve.length - 1)
    _               <- Console.printLine(s"Possible solution count ${naiveSearch(hunspell, solveOri).size} for $solveOri")
    _               <- Console.printLine(s"Candidate solutions for $toSolve while including '$includedLetters' and excluding '$excludedLetters'")
    _               <- Console.printLine(naiveSearch(hunspell, toSolve, excludedLetters, includedLetters).mkString(" "))
  } yield ()

}

// Forward the script arguments so that the pattern / included / excluded letters read by `run` can be given on the command line
WordleDic.main(args)
	// summary : french hunspell from grammalecte
	// keywords : scala, zio, sttp, nio, words, spell, parse, unzip, zip, files, hunspell, @testable
	// publish : gist
	// authors : David Crosson
	// license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
	// id : 5c8a6863-c517-4c5f-bf7e-540b6d479c91
	// created-on : 2022-01-23T10:00:00Z
	// managed-by : https://github.com/dacr/code-examples-manager
	// run-with : scala-cli $file

	// ---------------------
	//> using scala "3.4.2"
	//> using dep "dev.zio::zio:2.0.0"
	//> using dep "dev.zio::zio-json:0.3.0-RC10"
	//> using dep "com.softwaremill.sttp.client3::zio:3.7.0"
	//> using dep "dev.zio::zio-nio:2.0.0"
	// ---------------------

	import zio.*
	import zio.json.*
	import zio.nio.*
	import zio.nio.file.*
	import sttp.client3.*, sttp.client3.basicRequest.*
	import sttp.client3.httpclient.zio.HttpClientZioBackend
	import zio.nio.charset.Charset

	import java.io.ByteArrayInputStream
	import java.nio.file.StandardOpenOption.*
	import java.util.zip.*

	/** Small ZIO application that loads the Grammalecte french hunspell dictionary and searches it
	  * for words matching a wordle-like pattern.
	  *
	  * Optional command line arguments, as read by `run`:
	  *   1. the pattern to solve, `_` standing for any letter (defaults to `S_______`)
	  *   1. the letters a candidate must contain
	  *   1. the letters a candidate must not contain
	  */
	object WordleDic extends ZIOAppDefault {

	  /** Local file where the downloaded dictionaries archive is stored. */
	  val destDicoZipFile = Path("dico.zip")

	  /** Base name of the archive entries to load (`<key>.aff` and `<key>.dic`). */
	  val dicEntryKey = "fr-classique"

	  /** Downloads the dictionaries archive and writes it to `dest`, replacing any previous content.
	    *
	    * The HTTP backend is released whatever the outcome, and an unsuccessful HTTP response is
	    * reported as an `IOException` carrying the response body text.
	    *
	    * @param dest file which receives the downloaded bytes
	    */
	  def downloadLogic(dest: Path) = {
	    val src     = uri"http://grammalecte.net/download/fr/hunspell-french-dictionaries-v7.0.zip"
	    val request = basicRequest.get(src)
	    // acquire / release / use : the backend is always closed, even on failure or interruption
	    ZIO.acquireReleaseWith(HttpClientZioBackend())(_.close().ignore) { backend =>
	      for {
	        response <- backend.send(request.response(asByteArray))
	        body     <- ZIO.fromEither(response.body).mapError(msg => java.io.IOException(s"Download of $src failed : $msg"))
	        _        <- Files.writeBytes(dest, Chunk.fromArray(body), TRUNCATE_EXISTING, CREATE)
	        _        <- Console.printLine(s"wrote to $dest")
	      } yield ()
	    }
	  }

	  /** Walks through the entries of `zipInputStream` until one is named `entryName` and reads it.
	    *
	    * The stream is consumed up to (and including) the matching entry; it is not closed here.
	    *
	    * @param zipInputStream zip stream positioned before the entries to inspect
	    * @param entryName      exact name of the entry to extract
	    * @return the entry content, or `None` when no entry has that name
	    */
	  def extractZipEntryContentAsBytes(zipInputStream: ZipInputStream, entryName: String) = {
	    Iterator
	      .continually(zipInputStream.getNextEntry) // getNextEntry returns null once the archive is exhausted
	      .takeWhile(_ != null)
	      .find(_.getName == entryName)
	      .map(_ => Chunk.fromArray(zipInputStream.readAllBytes()))
	  }

	  /** Reads the `src` zip file and extracts the content of the entry named `entryName`.
	    *
	    * Fails with a `FileNotFoundException` when the archive has no such entry.
	    *
	    * @param src       path of the zip file
	    * @param entryName exact name of the entry to extract
	    */
	  def zipExtractLogic(src: Path, entryName: String) = {
	    for {
	      zipContent <- Files.readAllBytes(src)
	      _          <- Console.printLine(s"searching for $entryName")
	      found      <- ZIO.attemptBlockingIO {
	                      val zipInputStream = ZipInputStream(ByteArrayInputStream(zipContent.toArray))
	                      // always close the stream, whatever the extraction outcome
	                      try extractZipEntryContentAsBytes(zipInputStream, entryName)
	                      finally zipInputStream.close()
	                    }
	      bytes      <- ZIO.fromOption(found).orElseFail(java.io.FileNotFoundException(s"$entryName not found in $src"))
	    } yield bytes
	  }

	  /** One line of a hunspell `.dic` file.
	    *
	    * @param word       the word itself (text before the first `/`)
	    * @param flags      the text following the first `/`, when present
	    * @param attributes the `key:value` fields found after the word
	    */
	  case class HunspellEntry(word: String, flags: Option[String], attributes: Map[String, String]) {
	    private val po  = attributes.get("po")
	    val isDiv       = po.contains("div")                // Separator
	    val isCommun    = word.headOption.exists(_.isLower) // Common noun : starts with a lowercase letter (false for an empty word)
	    val isCompound  = word.contains("-")
	    val isPropre    = po.contains("npr")
	    val isFirstName = po.contains("prn")
	  }

	  object HunspellEntry {

	    /** Parses one dictionary line made of `word[/flags]` followed by whitespace separated
	      * `key:value` fields; fields without `:` are ignored.
	      *
	      * @return the parsed entry, or `None` for a blank line or a line without any word
	      */
	    def fromLine(line: String): Option[HunspellEntry] = {
	      line.trim.split("""\s+""").toList match {
	        case head :: fields if head.nonEmpty =>
	          val attributes =
	            fields
	              .map(_.split(":", 2))
	              .collect { case Array(key, value) => key -> value }
	              .toMap
	          head.split("/", 2) match {
	            case Array("", _)       => None // flags without any word
	            case Array(word, flags) => Some(HunspellEntry(word, Some(flags), attributes))
	            case _                  => Some(HunspellEntry(head, None, attributes))
	          }
	        case _                               => None
	      }
	    }
	  }

	  /** The parsed dictionary content. */
	  case class Hunspell(entries: List[HunspellEntry])

	  /** Decodes `dicBytes` as UTF-8 text and parses it: the first line is expected to be the
	    * announced number of entries, each following line is parsed with `HunspellEntry.fromLine`.
	    *
	    * Fails with an `IllegalArgumentException` when the first line is missing or is not a number.
	    *
	    * @param dicBytes   raw content of the `.dic` file
	    * @param affixBytes raw content of the `.aff` file (currently unused)
	    */
	  def parseHunspell(dicBytes: Chunk[Byte], affixBytes: Chunk[Byte]) = {
	    val charset = Charset.Standard.utf8
	    for {
	      content <- charset.decodeString(dicBytes)
	      lines    = content.linesIterator.toList // copes with both \n and \r\n line endings
	      count   <- ZIO
	                   .fromOption(lines.headOption.flatMap(_.trim.toIntOption))
	                   .orElseFail(IllegalArgumentException("Dictionary first line must be the number of entries"))
	      _       <- Console.printLine(s"Expecting to find $count hunspell entries")
	      entries  = lines.drop(1).flatMap(HunspellEntry.fromLine)
	      _       <- Console.printLine(s"Found ${entries.size} hunspell entries")
	      // hunspell <- ZIO.cond(entries.size == count, Hunspell(entries), Error("Didn't find the right number of words in dictionary"))
	      hunspell = Hunspell(entries) // No check as count input data looks invalid :(
	    } yield hunspell
	  }

	  /** Prints statistics about the common, non compound words: how many there are, how many for
	    * each word size, and the longest ones; then the total number of dictionary entries.
	    */
	  def dumpStats(hunspell: Hunspell) = {
	    val selectedWords = hunspell.entries.filter(entry => entry.isCommun && !entry.isCompound)
	    val wordsBySize   = selectedWords.groupMap(_.word.size)(_.word)
	    val countBySize   = wordsBySize.view.mapValues(_.size).toList.sorted
	    // maxOption : no failure when not a single word has been selected
	    val longestWords  = wordsBySize.keys.maxOption.flatMap(wordsBySize.get).getOrElse(Nil)
	    for {
	      _ <- Console.printLine(s"For common & not compound words")
	      _ <- Console.printLine(s" Found ${selectedWords.size} words")
	      _ <- Console.printLine(s" Number of common & not compound words By size")
	      _ <- Console.printLine(countBySize.mkString(" ", "\n ", "\n"))
	      _ <- Console.printLine(s" Longest french words")
	      _ <- Console.printLine(longestWords.sorted.mkString(" ", "\n ", "\n"))
	      _ <- Console.printLine(s"Found ${hunspell.entries.size} words in the dictionary")
	    } yield ()
	  }

	  /** Searches the common, non compound words having the same size as `pattern` and matching it.
	    *
	    * Words are compared once normalized: listed accented letters replaced by their base letter,
	    * then upper-cased. Each candidate is returned only once.
	    *
	    * @param hunspell        the dictionary to search in
	    * @param pattern         upper-case pattern used as a regular expression, `_` standing for any character
	    * @param excludedLetters letters a candidate must not contain
	    * @param includedLetters letters a candidate must contain
	    * @return the normalized matching words, in dictionary order
	    */
	  def naiveSearch(hunspell: Hunspell, pattern: String, excludedLetters: String = "", includedLetters: String = ""): List[String] = {
	    // Locale.ROOT keeps the case mapping independent from the JVM default locale
	    def normalize(word: String): String =
	      word
	        .toLowerCase(java.util.Locale.ROOT)
	        .replaceAll("[áàäâ]", "a")
	        .replaceAll("[éèëê]", "e")
	        .replaceAll("[íìïî]", "i")
	        .replaceAll("[óòöô]", "o")
	        .replaceAll("[úùüû]", "u")
	        .replaceAll("[ç]", "c")
	        .toUpperCase(java.util.Locale.ROOT)

	    val wordRE = pattern.replaceAll("_", ".").r
	    hunspell.entries.view
	      .filter(entry => entry.isCommun && !entry.isCompound)
	      .map(_.word)
	      .filter(_.size == pattern.size)
	      .map(normalize)
	      .filter(wordRE.matches)
	      .filterNot(_.exists(excludedLetters.contains))
	      .filter(word => includedLetters.forall(word.contains))
	      .toList
	      .distinct // several entries may share the same normalized form
	  }

	  /** Downloads the dictionary archive when it is not yet available locally, parses it, prints
	    * some statistics and then the candidate words for the pattern given on the command line.
	    */
	  override def run = for {
	    _               <- ZIO.unlessZIO(Files.exists(destDicoZipFile))(downloadLogic(destDicoZipFile))
	    affBytes        <- zipExtractLogic(destDicoZipFile, s"$dicEntryKey.aff")
	    dicBytes        <- zipExtractLogic(destDicoZipFile, s"$dicEntryKey.dic")
	    hunspell        <- parseHunspell(dicBytes, affBytes)
	    _               <- dumpStats(hunspell)
	    givenArgs       <- getArgs
	    toSolve          = givenArgs.headOption.map(_.toUpperCase(java.util.Locale.ROOT)).getOrElse("S_______")
	    includedLetters  = givenArgs.drop(1).headOption.map(_.toUpperCase(java.util.Locale.ROOT)).getOrElse("")
	    excludedLetters  = givenArgs.drop(2).headOption.map(_.toUpperCase(java.util.Locale.ROOT)).getOrElse("")
	    // same first letter, everything else unknown; safe with an empty pattern
	    solveOri         = toSolve.take(1) + "_" * (toSolve.length - 1)
	    _               <- Console.printLine(s"Possible solution count ${naiveSearch(hunspell, solveOri).size} for $solveOri")
	    _               <- Console.printLine(s"Candidate solutions for $toSolve while including '$includedLetters' and excluding '$excludedLetters'")
	    _               <- Console.printLine(naiveSearch(hunspell, toSolve, excludedLetters, includedLetters).mkString(" "))
	  } yield ()

	}

	// Forward the script arguments so that the pattern / included / excluded letters read by `run` can be given on the command line
	WordleDic.main(args)