Last active
March 27, 2023 01:10
-
-
Save anezih/5e0fc6d68c9166fe2ea3ffc05bc68476 to your computer and use it in GitHub Desktop.
[OBSOLETE: Use https://github.com/anezih/HunspellWordForms] Hunspell's wordforms and unmunch, defined in a single class library. Put WeCantSpell.Hunspell.dll and the DLL compiled from WordForms.cs (HunspellWordForms.dll) in the same directory as Unmunch.ps1.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- Build definition for the WordForms class library (SDK-style project, single target: net7.0). -->
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Target framework and library version. -->
  <PropertyGroup>
    <TargetFramework>net7.0</TargetFramework>
    <Version>1.4.0</Version>
  </PropertyGroup>

  <!-- Release builds are produced without any debug symbols. -->
  <PropertyGroup Condition="'$(Configuration)'=='Release'">
    <DebugSymbols>False</DebugSymbols>
    <DebugType>None</DebugType>
  </PropertyGroup>

  <!-- Hunspell implementation the library is built on; referenced only for the net7.0 target. -->
  <ItemGroup Condition=" '$(TargetFramework)' == 'net7.0' ">
    <PackageReference Include="WeCantSpell.Hunspell" Version="4.0.0" />
  </ItemGroup>

</Project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<#
.SYNOPSIS
Expands ("unmunches") a Hunspell dictionary and writes the generated word forms to a JSON file.

.PARAMETER DictionaryPath
Path of the dictionary handed to the WordForms constructor. Relative paths are resolved
against the current PowerShell location.

.PARAMETER NoPFX
When $true, prefix-only forms are not generated.

.PARAMETER NoCross
When $true, combined prefix + suffix (cross product) forms are not generated.

.PARAMETER Indented
When $true (the default), the JSON output is indented.

.PARAMETER OutPath
Output file name WITHOUT extension; ".json" is appended by WordForms.SerializeToJson.
Defaults to "unmunched". Relative paths are resolved against the current PowerShell location.
#>
[CmdletBinding()]
param (
    [Parameter(Mandatory=$true)]
    [string]
    $DictionaryPath,
    [bool]
    $NoPFX = $false,
    [bool]
    $NoCross = $false,
    [bool]
    $Indented = $true,
    [string]
    $OutPath
)

# Load the compiled WordForms library that sits next to this script.
# WeCantSpell.Hunspell.dll must be placed in the same directory as HunspellWordForms.dll.
# -Path (rather than -AssemblyName) is the parameter meant for assembly DLL files.
Add-Type -Path (Join-Path -Path $PSScriptRoot -ChildPath 'HunspellWordForms.dll')

# .NET resolves relative paths against the process working directory, which does not follow
# Set-Location. Convert both paths to absolute provider paths based on the PowerShell location
# so files are read from / written to where the user expects.
$dictionaryFile = $PSCmdlet.GetUnresolvedProviderPathFromPSPath($DictionaryPath)

$outName = if ($OutPath) { $OutPath } else { "unmunched" }
$outFile = $PSCmdlet.GetUnresolvedProviderPathFromPSPath($outName)

$dict = [WordForms]::new($dictionaryFile)

# Argument order: OutFileName, Indented, NoPFX, NoSFX, NoCross. Suffix forms are always generated.
$dict.SerializeToJson($outFile, $Indented, $NoPFX, $false, $NoCross)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Adapted from: https://gist.github.com/aarondandy/aaa622afeeb0cb86b0d4efe697c23be5 | |
using System; | |
using System.Collections; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Text.Encodings.Web; | |
using System.Text.Json; | |
using System.Text.Unicode; | |
using WeCantSpell.Hunspell; | |
/// <summary>
/// Produces the affixed forms of words from a Hunspell dictionary by applying the prefix and
/// suffix groups whose flags are set on each root word, and can expand the whole dictionary
/// ("unmunch") into a JSON file.
/// </summary>
public class WordForms
{
    /// <summary>
    /// The sets of forms generated for a single root word. Members are public fields
    /// (serialized via <c>IncludeFields</c> in <see cref="SerializeToJson"/>).
    /// </summary>
    public class WordFormsObj
    {
        /// <summary>Forms obtained by applying a prefix entry only.</summary>
        public HashSet<string> PFX;

        /// <summary>Forms obtained by applying a suffix entry only.</summary>
        public HashSet<string> SFX;

        /// <summary>Forms obtained by applying a prefix entry and then a suffix entry.</summary>
        public HashSet<string> Cross;

        /// <summary>Wraps the three given sets; the sets are stored, not copied.</summary>
        /// <param name="p">Prefix-only forms.</param>
        /// <param name="s">Suffix-only forms.</param>
        /// <param name="c">Prefix + suffix forms.</param>
        public WordFormsObj(HashSet<string> p, HashSet<string> s, HashSet<string> c)
        {
            this.PFX = p;
            this.SFX = s;
            this.Cross = c;
        }

        /// <summary>Returns <c>true</c> when all three sets contain no forms.</summary>
        public bool IsEmpty()
        {
            return this.PFX.Count == 0 && this.SFX.Count == 0 && this.Cross.Count == 0;
        }
    }

    private readonly WordList dict;

    // Option an affix group must carry to take part in prefix + suffix combinations.
    private static readonly AffixEntryOptions flag = AffixEntryOptions.CrossProduct;

    /// <summary>Loads the dictionary via <c>WordList.CreateFromFiles</c>.</summary>
    /// <param name="path">Path passed unchanged to <c>WordList.CreateFromFiles</c>.</param>
    public WordForms(string path)
    {
        this.dict = WordList.CreateFromFiles(path);
    }

    /// <summary>Returns <c>true</c> when <paramref name="value"/> includes the CrossProduct option.</summary>
    private static bool AllowCross(AffixEntryOptions value)
    {
        return (value & flag) == flag;
    }

    /// <summary>
    /// Tries to apply a prefix entry to <paramref name="word"/>: the entry's conditions must match
    /// and the word must start with the entry's strip string, which is then replaced by the
    /// entry's append string.
    /// </summary>
    /// <param name="prefix">Prefix entry to apply.</param>
    /// <param name="word">Word the prefix is applied to.</param>
    /// <param name="result">The prefixed word on success; <c>null</c> otherwise.</param>
    /// <returns><c>true</c> when the entry was applicable.</returns>
    private static bool TryAppend(PrefixEntry prefix, string word, out string result)
    {
        // Ordinal comparison: the Substring below removes exactly Strip.Length chars, which is
        // only correct for an exact char-by-char match (culture-sensitive matching can differ).
        if (prefix.Conditions.IsStartingMatch(word.AsSpan()) && word.StartsWith(prefix.Strip, StringComparison.Ordinal))
        {
            result = prefix.Append + word.Substring(prefix.Strip.Length);
            return true;
        }

        result = null;
        return false;
    }

    /// <summary>
    /// Tries to apply a suffix entry to <paramref name="word"/>: the entry's conditions must match
    /// and the word must end with the entry's strip string, which is then replaced by the
    /// entry's append string.
    /// </summary>
    /// <param name="suffix">Suffix entry to apply.</param>
    /// <param name="word">Word the suffix is applied to.</param>
    /// <param name="result">The suffixed word on success; <c>null</c> otherwise.</param>
    /// <returns><c>true</c> when the entry was applicable.</returns>
    private static bool TryAppend(SuffixEntry suffix, string word, out string result)
    {
        // Ordinal comparison for the same reason as in the prefix overload.
        if (suffix.Conditions.IsEndingMatch(word.AsSpan()) && word.EndsWith(suffix.Strip, StringComparison.Ordinal))
        {
            result = word.Substring(0, word.Length - suffix.Strip.Length) + suffix.Append;
            return true;
        }

        result = null;
        return false;
    }

    /// <summary>
    /// Generates the affixed forms of <paramref name="word"/> using the affix groups whose flag
    /// is set on any of the word's dictionary entries.
    /// </summary>
    /// <param name="word">Root word to look up in the dictionary.</param>
    /// <param name="NoPFX">Skip prefix-only forms.</param>
    /// <param name="NoSFX">Skip suffix-only forms.</param>
    /// <param name="NoCross">Skip prefix + suffix forms.</param>
    /// <returns>
    /// The generated forms; all sets are empty when the lookup fails or no affix group applies.
    /// </returns>
    public WordFormsObj GetWordForms(string word, bool NoPFX = false, bool NoSFX = false, bool NoCross = false)
    {
        var prefixGroups = new List<AffixGroup<PrefixEntry>>();
        var suffixGroups = new List<AffixGroup<SuffixEntry>>();
        var prefixedForms = new HashSet<string>();
        var suffixedForms = new HashSet<string>();
        var crossForms = new HashSet<string>();

        try
        {
            var entries = this.dict[word];

            foreach (var group in this.dict.Affix.Prefixes)
            {
                foreach (var entry in entries)
                {
                    if (entry.ContainsFlag(group.AFlag))
                    {
                        prefixGroups.Add(group);
                        // Several entries of the same word may carry the flag; one copy of the
                        // group is enough because results are collected in sets.
                        break;
                    }
                }
            }

            foreach (var group in this.dict.Affix.Suffixes)
            {
                foreach (var entry in entries)
                {
                    if (entry.ContainsFlag(group.AFlag))
                    {
                        suffixGroups.Add(group);
                        break;
                    }
                }
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort: a word whose lookup fails simply yields whatever affix
            // groups were collected so far (normally none) instead of aborting the caller.
        }

        if (!NoPFX)
        {
            foreach (var prefixEntry in prefixGroups.SelectMany(g => g.Entries))
            {
                if (TryAppend(prefixEntry, word, out string prefixed))
                {
                    prefixedForms.Add(prefixed);
                }
            }
        }

        if (!NoSFX)
        {
            foreach (var suffixEntry in suffixGroups.SelectMany(g => g.Entries))
            {
                if (TryAppend(suffixEntry, word, out string suffixed))
                {
                    suffixedForms.Add(suffixed);
                }
            }
        }

        if (!NoCross)
        {
            // Only groups carrying the CrossProduct option may be combined. The suffix side is
            // materialized once instead of being re-queried for every prefix.
            var crossSuffixes = suffixGroups.Where(g => AllowCross(g.Options)).SelectMany(g => g.Entries).ToList();

            foreach (var prefixEntry in prefixGroups.Where(g => AllowCross(g.Options)).SelectMany(g => g.Entries))
            {
                if (!TryAppend(prefixEntry, word, out string prefixed))
                {
                    continue;
                }

                // The suffix is matched against and applied to the already prefixed word.
                foreach (var suffixEntry in crossSuffixes)
                {
                    if (TryAppend(suffixEntry, prefixed, out string crossed))
                    {
                        crossForms.Add(crossed);
                    }
                }
            }
        }

        return new WordFormsObj(prefixedForms, suffixedForms, crossForms);
    }

    /// <summary>
    /// Generates the forms of every root word in the dictionary, in sorted root-word order.
    /// </summary>
    /// <param name="NoPFX">Skip prefix-only forms.</param>
    /// <param name="NoSFX">Skip suffix-only forms.</param>
    /// <param name="NoCross">Skip prefix + suffix forms.</param>
    /// <returns>
    /// One single-entry <see cref="Hashtable"/> (root word -> <see cref="WordFormsObj"/>) per
    /// root word that produced at least one form.
    /// </returns>
    public List<Hashtable> Unmunch(bool NoPFX = false, bool NoSFX = false, bool NoCross = false)
    {
        List<Hashtable> allForms = new List<Hashtable>();
        List<string> allWords = this.dict.RootWords.ToList();
        allWords.Sort();

        foreach (var w in allWords)
        {
            WordFormsObj forms = GetWordForms(w, NoPFX: NoPFX, NoSFX: NoSFX, NoCross: NoCross);
            if (forms.IsEmpty())
            {
                continue;
            }

            Hashtable table = new Hashtable();
            table.Add(w, forms);
            allForms.Add(table);
        }

        return allForms;
    }

    /// <summary>
    /// Unmunches the dictionary and writes the result as JSON to <c>{OutFileName}.json</c>,
    /// creating or overwriting that file. The method returns only after the file has been
    /// completely written and closed.
    /// </summary>
    /// <param name="OutFileName">Output file name without the <c>.json</c> extension.</param>
    /// <param name="Indented">Write indented JSON.</param>
    /// <param name="NoPFX">Skip prefix-only forms.</param>
    /// <param name="NoSFX">Skip suffix-only forms.</param>
    /// <param name="NoCross">Skip prefix + suffix forms.</param>
    public void SerializeToJson(string OutFileName, bool Indented = false, bool NoPFX = false, bool NoSFX = false, bool NoCross = false)
    {
        var unmunched = Unmunch(NoPFX: NoPFX, NoSFX: NoSFX, NoCross: NoCross);
        string fileName = $"{OutFileName}.json";
        var options = new JsonSerializerOptions
        {
            // Relaxed escaping keeps non-ASCII letters readable in the output.
            Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
            WriteIndented = Indented,
            // WordFormsObj exposes its data as public fields.
            IncludeFields = true,
        };

        // Synchronous on purpose: the previous "async void" returned at the first incomplete
        // await, so a caller (e.g. a script that exits right after the call) could end before
        // the file was fully written, and I/O errors could not be observed by the caller.
        using FileStream createStream = File.Create(fileName);
        JsonSerializer.Serialize(createStream, unmunched, options);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment