Skip to content

Instantly share code, notes, and snippets.

@anezih
Last active March 27, 2023 01:10
Show Gist options
  • Save anezih/5e0fc6d68c9166fe2ea3ffc05bc68476 to your computer and use it in GitHub Desktop.
Save anezih/5e0fc6d68c9166fe2ea3ffc05bc68476 to your computer and use it in GitHub Desktop.
[OBSOLETE: Use https://github.com/anezih/HunspellWordForms] Hunspell's wordforms and unmunch defined in a single class library. Put WeCantSpell.Hunspell.dll and the DLL compiled from WordForms.cs in the same directory as Unmunch.ps1.
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings for the WordForms class library: target framework and assembly version. -->
<PropertyGroup>
<TargetFramework>net7.0</TargetFramework>
<Version>1.4.0</Version>
</PropertyGroup>
<!-- NuGet dependency on WeCantSpell.Hunspell; the reference is only active when the target framework is net7.0. -->
<ItemGroup Condition=" '$(TargetFramework)' == 'net7.0' ">
<PackageReference Include="WeCantSpell.Hunspell" Version="4.0.0" />
</ItemGroup>
<!-- Release builds turn off debug symbol generation. -->
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<DebugSymbols>False</DebugSymbols>
<DebugType>None</DebugType>
</PropertyGroup>
</Project>
<#
.SYNOPSIS
Writes the word forms of a Hunspell dictionary to a JSON file using the WordForms class library.

.PARAMETER DictionaryPath
Dictionary path handed to the WordForms constructor.

.PARAMETER NoPFX
Forwarded as the NoPFX argument of SerializeToJson.

.PARAMETER NoCross
Forwarded as the NoCross argument of SerializeToJson.

.PARAMETER Indented
Forwarded as the Indented argument of SerializeToJson.

.PARAMETER OutPath
Output file name without the ".json" extension (SerializeToJson appends it).
Falls back to "unmunched" when not supplied.
#>
[CmdletBinding()]
param (
    [Parameter(Mandatory=$true)]
    [string]
    $DictionaryPath,

    [bool]
    $NoPFX = $false,

    [bool]
    $NoCross = $false,

    [bool]
    $Indented = $true,

    [string]
    $OutPath
)

Add-Type -AssemblyName $PSScriptRoot\HunspellWordForms.dll
# Put WeCantSpell.Hunspell.dll in the same directory with the dll above.

$wordForms = [WordForms]::new($DictionaryPath)

# Use the caller-supplied name when there is one, otherwise the default base name.
$outputName = if ($OutPath) { $OutPath } else { "unmunched" }

# The fourth argument is NoSFX; this script always leaves suffix forms enabled.
$wordForms.SerializeToJson($outputName, $Indented, $NoPFX, $false, $NoCross)
// Adapted from: https://gist.github.com/aarondandy/aaa622afeeb0cb86b0d4efe697c23be5
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.Encodings.Web;
using System.Text.Json;
using System.Text.Unicode;
using WeCantSpell.Hunspell;
/// <summary>
/// Produces inflected forms of the root words of a Hunspell dictionary by applying
/// the prefix and suffix entries whose flags are attached to each root word.
/// </summary>
public class WordForms
{
    /// <summary>
    /// The forms generated for a single root word, grouped by how they were built.
    /// </summary>
    /// <remarks>
    /// The members are public fields; <see cref="SerializeToJson"/> sets
    /// <c>IncludeFields</c> so that they end up in the JSON output.
    /// </remarks>
    public class WordFormsObj
    {
        /// <summary>Forms built by applying a prefix entry only.</summary>
        public HashSet<string> PFX;

        /// <summary>Forms built by applying a suffix entry only.</summary>
        public HashSet<string> SFX;

        /// <summary>Forms built by applying a prefix entry and then a suffix entry.</summary>
        public HashSet<string> Cross;

        /// <summary>Wraps the three result sets without copying them.</summary>
        /// <param name="p">Prefix-only forms.</param>
        /// <param name="s">Suffix-only forms.</param>
        /// <param name="c">Prefix-plus-suffix forms.</param>
        public WordFormsObj(HashSet<string> p, HashSet<string> s, HashSet<string> c)
        {
            this.PFX = p;
            this.SFX = s;
            this.Cross = c;
        }

        /// <summary>Tells whether no form of any kind was generated.</summary>
        /// <returns><c>true</c> when all three sets are empty.</returns>
        public bool IsEmpty()
        {
            return this.PFX.Count == 0 && this.SFX.Count == 0 && this.Cross.Count == 0;
        }
    }

    private readonly WordList dict;

    /// <summary>Loads the dictionary that all later lookups run against.</summary>
    /// <param name="path">Path passed to <c>WordList.CreateFromFiles</c>.</param>
    public WordForms(string path)
    {
        this.dict = WordList.CreateFromFiles(path);
    }

    /// <summary>Tells whether an affix group has the CrossProduct option set.</summary>
    private static bool AllowCross(AffixEntryOptions value)
    {
        return (value & AffixEntryOptions.CrossProduct) == AffixEntryOptions.CrossProduct;
    }

    /// <summary>
    /// Applies a prefix entry to <paramref name="word"/>: removes the entry's strip
    /// text from the start of the word and puts the entry's append text in its place.
    /// </summary>
    /// <param name="prefix">Prefix entry to apply.</param>
    /// <param name="word">Word to modify.</param>
    /// <param name="result">The modified word on success, otherwise <c>null</c>.</param>
    /// <returns><c>true</c> when the entry's conditions match and the word starts with the strip text.</returns>
    private static bool TryAppend(PrefixEntry prefix, string word, out string result)
    {
        // Ordinal comparison: the Substring below cuts exactly Strip.Length chars, so the
        // match has to be char-for-char and must not depend on the current culture.
        if (prefix.Conditions.IsStartingMatch(word.AsSpan()) && word.StartsWith(prefix.Strip, StringComparison.Ordinal))
        {
            result = prefix.Append + word.Substring(prefix.Strip.Length);
            return true;
        }

        result = null;
        return false;
    }

    /// <summary>
    /// Applies a suffix entry to <paramref name="word"/>: removes the entry's strip
    /// text from the end of the word and adds the entry's append text there.
    /// </summary>
    /// <param name="suffix">Suffix entry to apply.</param>
    /// <param name="word">Word to modify.</param>
    /// <param name="result">The modified word on success, otherwise <c>null</c>.</param>
    /// <returns><c>true</c> when the entry's conditions match and the word ends with the strip text.</returns>
    private static bool TryAppend(SuffixEntry suffix, string word, out string result)
    {
        // Ordinal for the same reason as the prefix overload: the cut is by raw length.
        if (suffix.Conditions.IsEndingMatch(word.AsSpan()) && word.EndsWith(suffix.Strip, StringComparison.Ordinal))
        {
            result = word.Substring(0, word.Length - suffix.Strip.Length) + suffix.Append;
            return true;
        }

        result = null;
        return false;
    }

    /// <summary>
    /// Generates the forms of <paramref name="word"/> from the affix groups whose flag
    /// is carried by at least one dictionary entry of that word.
    /// </summary>
    /// <param name="word">Root word to look up in the dictionary.</param>
    /// <param name="NoPFX">When <c>true</c>, prefix-only forms are not collected.</param>
    /// <param name="NoSFX">When <c>true</c>, suffix-only forms are not collected.</param>
    /// <param name="NoCross">When <c>true</c>, prefix-plus-suffix forms are not collected.</param>
    /// <returns>The generated forms; all sets are empty when nothing applies.</returns>
    public WordFormsObj GetWordForms(string word, bool NoPFX = false, bool NoSFX = false, bool NoCross = false)
    {
        var prefixGroups = new List<AffixGroup<PrefixEntry>>();
        var suffixGroups = new List<AffixGroup<SuffixEntry>>();

        try
        {
            var details = this.dict[word];

            foreach (var group in this.dict.Affix.Prefixes)
            {
                foreach (var detail in details)
                {
                    if (detail.ContainsFlag(group.AFlag))
                    {
                        prefixGroups.Add(group);
                        // One matching entry is enough; adding the group again for every
                        // homonym entry would only repeat the same work later.
                        break;
                    }
                }
            }

            foreach (var group in this.dict.Affix.Suffixes)
            {
                foreach (var detail in details)
                {
                    if (detail.ContainsFlag(group.AFlag))
                    {
                        suffixGroups.Add(group);
                        break;
                    }
                }
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort: a failed lookup means "no (further) applicable affix
            // groups" and the method goes on with whatever was collected so far.
            // NOTE(review): the exception types the lookup can raise are not visible here;
            // narrow this catch once they are confirmed.
        }

        var prefixForms = new HashSet<string>();
        var suffixForms = new HashSet<string>();
        var crossForms = new HashSet<string>();

        if (!NoPFX)
        {
            foreach (var prefixEntry in prefixGroups.SelectMany(g => g.Entries))
            {
                if (TryAppend(prefixEntry, word, out string form))
                {
                    prefixForms.Add(form);
                }
            }
        }

        if (!NoSFX)
        {
            foreach (var suffixEntry in suffixGroups.SelectMany(g => g.Entries))
            {
                if (TryAppend(suffixEntry, word, out string form))
                {
                    suffixForms.Add(form);
                }
            }
        }

        if (!NoCross)
        {
            // The eligible suffix entries do not depend on the prefix, so list them once.
            var crossSuffixEntries = suffixGroups
                .Where(g => AllowCross(g.Options))
                .SelectMany(g => g.Entries)
                .ToList();

            foreach (var prefixEntry in prefixGroups.Where(g => AllowCross(g.Options)).SelectMany(g => g.Entries))
            {
                if (!TryAppend(prefixEntry, word, out string prefixed))
                {
                    continue;
                }

                // Suffix entries are tried against the already-prefixed word.
                foreach (var suffixEntry in crossSuffixEntries)
                {
                    if (TryAppend(suffixEntry, prefixed, out string crossed))
                    {
                        crossForms.Add(crossed);
                    }
                }
            }
        }

        return new WordFormsObj(prefixForms, suffixForms, crossForms);
    }

    /// <summary>
    /// Generates the forms of every root word in the dictionary, in sorted word order.
    /// </summary>
    /// <param name="NoPFX">Passed through to <see cref="GetWordForms"/>.</param>
    /// <param name="NoSFX">Passed through to <see cref="GetWordForms"/>.</param>
    /// <param name="NoCross">Passed through to <see cref="GetWordForms"/>.</param>
    /// <returns>
    /// One single-entry table (root word → <see cref="WordFormsObj"/>) per root word
    /// that produced at least one form; words without forms are left out.
    /// </returns>
    public List<Hashtable> Unmunch(bool NoPFX = false, bool NoSFX = false, bool NoCross = false)
    {
        List<string> allWords = this.dict.RootWords.ToList();
        allWords.Sort();

        var allForms = new List<Hashtable>();
        foreach (var w in allWords)
        {
            WordFormsObj forms = GetWordForms(w, NoPFX: NoPFX, NoSFX: NoSFX, NoCross: NoCross);
            if (!forms.IsEmpty())
            {
                allForms.Add(new Hashtable { { w, forms } });
            }
        }

        return allForms;
    }

    /// <summary>
    /// Runs <see cref="Unmunch"/> and writes the result as JSON to
    /// "<paramref name="OutFileName"/>.json". The file is completely written and closed
    /// when this method returns, and I/O or serialization errors propagate to the caller.
    /// </summary>
    /// <param name="OutFileName">Output file name without the ".json" extension.</param>
    /// <param name="Indented">When <c>true</c>, the JSON is written indented.</param>
    /// <param name="NoPFX">Passed through to <see cref="Unmunch"/>.</param>
    /// <param name="NoSFX">Passed through to <see cref="Unmunch"/>.</param>
    /// <param name="NoCross">Passed through to <see cref="Unmunch"/>.</param>
    public void SerializeToJson(string OutFileName, bool Indented = false, bool NoPFX = false, bool NoSFX = false, bool NoCross = false)
    {
        var unmunched = Unmunch(NoPFX: NoPFX, NoSFX: NoSFX, NoCross: NoCross);
        string fileName = $"{OutFileName}.json";
        var options = new JsonSerializerOptions
        {
            // Relaxed escaping instead of the default encoder's escaping rules.
            Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
            WriteIndented = Indented,
            // WordFormsObj exposes its data as fields, which are skipped unless this is set.
            IncludeFields = true,
        };

        // Synchronous on purpose: the method returns void, so an async body could hand
        // control back before the file is finished and would hide any exception.
        using FileStream createStream = File.Create(fileName);
        JsonSerializer.Serialize(createStream, unmunched, options);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment