main/Smdn.Text.Ondulish-4.0.0-preview1
Created
January 1, 2023 13:18
-
-
Save smdn/f062ea075ec83964079e437e282f4746 to your computer and use it in GitHub Desktop.
Smdn.Text.Ondulish 4.0.0-preview1 Release Notes
- 2023-01-01 update assembly version
- 2023-01-01 make internal classes sealed
- 2022-12-29 fix the phase of including swig bindings to the compile items
- 2022-12-29 install dependency packages for build on GitHub actions
- 2022-12-29 fix Makefile directory
- 2022-12-29 generate MeCab IPA dictionary ahead of building project
- 2022-12-29 make sure to include generated SWIG binding sources to the compile items
- 2022-12-28 set default value to the parameter 'convertKatakanaToNarrow'
- 2022-12-28 change order of parameters
- 2022-12-28 add Translate() overload to accept TextReader
- 2022-12-28 make field static
- 2022-12-28 split implementations into individual files
- 2022-12-28 remove code for debugging
- 2022-12-28 refer to the assembly directory instead of the process directory when loading the bundled MeCab dictionary
- 2022-12-28 fix Translator constructors to accept a Tagger that was created externally
- 2022-12-27 improve nupkg content
- 2022-12-27 add fallback implementation
- 2022-12-27 bump Smdn.Fundamental.Csv up to 3.1.0
- 2022-12-26 improve end-of-line treatments
- 2022-12-26 add OndulishDictionaries.Open(Phrase|Word)DictionaryStream() and use them
- 2022-12-25 make Ondulish dictionaries as an individual package
- 2022-12-25 use using statements
- 2022-12-25 add support for loading dictionaries from Stream
- 2022-12-25 set default value to parameter 'convertKatakanaToNarrow'
- 2022-12-25 add default constructor
- 2022-12-25 fix to specify mecab dictionary path from the code instead of using mecabrc
- 2022-12-25 change to output mecab files under the 'mecab' directory
- 2022-12-25 clarify pack target files and their package paths
- 2022-12-24 add RuntimeIdentifiers
- 2022-12-24 add MSBuild targets for calling mecab make targets
- 2022-12-24 reduce allocations
- 2022-12-24 make lambda static
- 2022-12-24 enable nullability annotations
- 2022-12-24 check arguments
- 2022-12-24 check arguments
- 2022-12-23 implement IDisposable properly
- 2022-12-23 use switch expressions
- 2022-12-23 improve #if switches
- 2022-12-23 fix warnings reported by the code analyzer
- 2022-12-23 add notes
- 2022-12-23 use SPDX license headers
- 2022-12-23 use Smdn.MSBuild.ProjectAssets.Library
- 2022-12-22 rewrite build script for MeCab with Makefile
- 2022-12-22 use NuGet packages instead of local project
- 2022-12-22 use the target framework net6.0 instead of net5.0
- 2022-12-22 change namespaces to Smdn.Text.Ondulish
- 2022-12-22 rename assembly name to Smdn.Text.Ondulish
- 2022-12-22 rename assembly and namespace directories to Smdn.Text.Ondulish
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/doc/api-list/Smdn.Text.Ondulish/Smdn.Text.Ondulish-net6.0.apilist.cs b/doc/api-list/Smdn.Text.Ondulish/Smdn.Text.Ondulish-net6.0.apilist.cs | |
new file mode 100644 | |
index 0000000..4d6157c | |
--- /dev/null | |
+++ b/doc/api-list/Smdn.Text.Ondulish/Smdn.Text.Ondulish-net6.0.apilist.cs | |
@@ -0,0 +1,223 @@ | |
+// Smdn.Text.Ondulish.dll (Smdn.Text.Ondulish-4.0.0-preview1) | |
+// Name: Smdn.Text.Ondulish | |
+// AssemblyVersion: 4.0.0.0 | |
+// InformationalVersion: 4.0.0-preview1+e47f0cd7079b79c34ed00e252d68a337f739d938 | |
+// TargetFramework: .NETCoreApp,Version=v6.0 | |
+// Configuration: Release | |
+#nullable enable annotations | |
+ | |
+using System; | |
+using System.Collections.Generic; | |
+using System.IO; | |
+using MeCab; | |
+ | |
+namespace MeCab { | |
+ public class DictionaryInfo : IDisposable { | |
+ protected bool swigCMemOwn; | |
+ | |
+ public DictionaryInfo() {} | |
+ | |
+ public string charset { get; } | |
+ public string filename { get; } | |
+ public uint lsize { get; } | |
+ public DictionaryInfo next { get; } | |
+ public uint rsize { get; } | |
+ public uint size { get; } | |
+ public int type { get; } | |
+ public ushort version { get; } | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~DictionaryInfo() {} | |
+ } | |
+ | |
+ public class Lattice : IDisposable { | |
+ protected bool swigCMemOwn; | |
+ | |
+ public Lattice() {} | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~Lattice() {} | |
+ public virtual double Z() {} | |
+ public virtual void add_request_type(int request_type) {} | |
+ public virtual Node begin_nodes(uint pos) {} | |
+ public virtual Node bos_node() {} | |
+ public virtual int boundary_constraint(uint pos) {} | |
+ public virtual void clear() {} | |
+ public virtual Node end_nodes(uint pos) {} | |
+ public virtual string enumNBestAsString(uint N) {} | |
+ public virtual Node eos_node() {} | |
+ public virtual string feature_constraint(uint pos) {} | |
+ public virtual bool has_constraint() {} | |
+ public virtual bool has_request_type(int request_type) {} | |
+ public virtual bool is_available() {} | |
+ public virtual Node newNode() {} | |
+ public virtual bool next() {} | |
+ public virtual void remove_request_type(int request_type) {} | |
+ public virtual int request_type() {} | |
+ public virtual string sentence() {} | |
+ public virtual void set_Z(double Z) {} | |
+ public virtual void set_boundary_constraint(uint pos, int boundary_constraint_type) {} | |
+ public virtual void set_feature_constraint(uint begin_pos, uint end_pos, string feature) {} | |
+ public virtual void set_request_type(int request_type) {} | |
+ public virtual void set_result(string result) {} | |
+ public void set_sentence(string sentence) {} | |
+ public virtual void set_theta(float theta) {} | |
+ public virtual void set_what(string str) {} | |
+ public virtual uint size() {} | |
+ public virtual float theta() {} | |
+ public virtual string toString() {} | |
+ public virtual string toString(Node node) {} | |
+ public virtual string what() {} | |
+ } | |
+ | |
+ public class MeCab { | |
+ public static readonly int MECAB_ALLOCATE_SENTENCE = 64; | |
+ public static readonly int MECAB_ALL_MORPHS = 32; | |
+ public static readonly int MECAB_ALTERNATIVE = 16; | |
+ public static readonly int MECAB_ANY_BOUNDARY = 0; | |
+ public static readonly int MECAB_BOS_NODE = 2; | |
+ public static readonly int MECAB_EON_NODE = 4; | |
+ public static readonly int MECAB_EOS_NODE = 3; | |
+ public static readonly int MECAB_INSIDE_TOKEN = 2; | |
+ public static readonly int MECAB_MARGINAL_PROB = 8; | |
+ public static readonly int MECAB_NBEST = 2; | |
+ public static readonly int MECAB_NOR_NODE = 0; | |
+ public static readonly int MECAB_ONE_BEST = 1; | |
+ public static readonly int MECAB_PARTIAL = 4; | |
+ public static readonly int MECAB_SYS_DIC = 0; | |
+ public static readonly int MECAB_TOKEN_BOUNDARY = 1; | |
+ public static readonly int MECAB_UNK_DIC = 2; | |
+ public static readonly int MECAB_UNK_NODE = 1; | |
+ public static readonly int MECAB_USR_DIC = 1; | |
+ public static readonly string VERSION = "0.996"; | |
+ | |
+ public MeCab() {} | |
+ } | |
+ | |
+ public class Model : IDisposable { | |
+ public static string version() {} | |
+ | |
+ protected bool swigCMemOwn; | |
+ | |
+ public Model() {} | |
+ public Model(string argc) {} | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~Model() {} | |
+ public virtual Lattice createLattice() {} | |
+ public virtual Tagger createTagger() {} | |
+ public virtual DictionaryInfo dictionary_info() {} | |
+ public virtual Node lookup(string begin, string end, Lattice lattice) {} | |
+ public virtual bool swap(Model model) {} | |
+ public virtual int transition_cost(ushort rcAttr, ushort lcAttr) {} | |
+ } | |
+ | |
+ public class Node : IDisposable { | |
+ protected bool swigCMemOwn; | |
+ | |
+ public float alpha { get; } | |
+ public float beta { get; } | |
+ public Node bnext { get; } | |
+ public byte char_type { get; } | |
+ public int cost { get; } | |
+ public Node enext { get; } | |
+ public string feature { get; } | |
+ public uint id { get; } | |
+ public byte isbest { get; } | |
+ public ushort lcAttr { get; } | |
+ public ushort length { get; } | |
+ public Path lpath { get; } | |
+ public Node next { get; } | |
+ public ushort posid { get; } | |
+ public Node prev { get; } | |
+ public float prob { get; set; } | |
+ public ushort rcAttr { get; } | |
+ public ushort rlength { get; } | |
+ public Path rpath { get; } | |
+ public byte stat { get; } | |
+ public string surface { get; } | |
+ public short wcost { get; } | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~Node() {} | |
+ } | |
+ | |
+ public class Path : IDisposable { | |
+ protected bool swigCMemOwn; | |
+ | |
+ public int cost { get; } | |
+ public Path lnext { get; } | |
+ public Node lnode { get; } | |
+ public float prob { get; set; } | |
+ public Path rnext { get; } | |
+ public Node rnode { get; } | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~Path() {} | |
+ } | |
+ | |
+ public class Tagger : IDisposable { | |
+ public static bool parse(Model model, Lattice lattice) {} | |
+ public static string version() {} | |
+ | |
+ protected bool swigCMemOwn; | |
+ | |
+ public Tagger() {} | |
+ public Tagger(string argc) {} | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~Tagger() {} | |
+ public virtual bool all_morphs() {} | |
+ public virtual DictionaryInfo dictionary_info() {} | |
+ public virtual string formatNode(Node node) {} | |
+ public virtual int lattice_level() {} | |
+ public virtual string next() {} | |
+ public virtual Node nextNode() {} | |
+ public virtual bool parse(Lattice lattice) {} | |
+ public virtual string parse(string str) {} | |
+ public virtual string parseNBest(uint N, string str) {} | |
+ public virtual bool parseNBestInit(string str) {} | |
+ public virtual Node parseToNode(string str) {} | |
+ public string parseToString(string str) {} | |
+ public string parseToString(string str, uint length) {} | |
+ public virtual bool partial() {} | |
+ public virtual int request_type() {} | |
+ public virtual void set_all_morphs(bool all_morphs) {} | |
+ public virtual void set_lattice_level(int level) {} | |
+ public virtual void set_partial(bool @partial) {} | |
+ public virtual void set_request_type(int request_type) {} | |
+ public virtual void set_theta(float theta) {} | |
+ public virtual float theta() {} | |
+ public virtual string what() {} | |
+ } | |
+} | |
+ | |
+namespace Smdn.Text.Ondulish { | |
+ public static class KanaUtils { | |
+ public static string ConvertWideHiraganaToKatakana(string input) {} | |
+ public static string ConvertWideKatakanaToHiragana(string input) {} | |
+ public static string ConvertWideKatakanaToNarrowKatakana(string input) {} | |
+ } | |
+ | |
+ public class Translator : IDisposable { | |
+ public static Tagger CreateTaggerForBundledDictionary() {} | |
+ | |
+ public Translator() {} | |
+ public Translator(Tagger tagger, bool shouldDisposeTagger) {} | |
+ | |
+ public IReadOnlyDictionary<string, string> PhraseDictionary { get; } | |
+ public IReadOnlyDictionary<string, string> WordDictionary { get; } | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ public string Translate(string input, bool convertKatakanaToNarrow = true) {} | |
+ public void Translate(TextReader input, TextWriter output, bool convertKatakanaToNarrow = true) {} | |
+ public void Translate(string input, TextWriter output, bool convertKatakanaToNarrow = true) {} | |
+ } | |
+} | |
diff --git a/doc/api-list/Smdn.Text.Ondulish/Smdn.Text.Ondulish-netstandard2.1.apilist.cs b/doc/api-list/Smdn.Text.Ondulish/Smdn.Text.Ondulish-netstandard2.1.apilist.cs | |
new file mode 100644 | |
index 0000000..17e77b7 | |
--- /dev/null | |
+++ b/doc/api-list/Smdn.Text.Ondulish/Smdn.Text.Ondulish-netstandard2.1.apilist.cs | |
@@ -0,0 +1,223 @@ | |
+// Smdn.Text.Ondulish.dll (Smdn.Text.Ondulish-4.0.0-preview1) | |
+// Name: Smdn.Text.Ondulish | |
+// AssemblyVersion: 4.0.0.0 | |
+// InformationalVersion: 4.0.0-preview1+e47f0cd7079b79c34ed00e252d68a337f739d938 | |
+// TargetFramework: .NETStandard,Version=v2.1 | |
+// Configuration: Release | |
+#nullable enable annotations | |
+ | |
+using System; | |
+using System.Collections.Generic; | |
+using System.IO; | |
+using MeCab; | |
+ | |
+namespace MeCab { | |
+ public class DictionaryInfo : IDisposable { | |
+ protected bool swigCMemOwn; | |
+ | |
+ public DictionaryInfo() {} | |
+ | |
+ public string charset { get; } | |
+ public string filename { get; } | |
+ public uint lsize { get; } | |
+ public DictionaryInfo next { get; } | |
+ public uint rsize { get; } | |
+ public uint size { get; } | |
+ public int type { get; } | |
+ public ushort version { get; } | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~DictionaryInfo() {} | |
+ } | |
+ | |
+ public class Lattice : IDisposable { | |
+ protected bool swigCMemOwn; | |
+ | |
+ public Lattice() {} | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~Lattice() {} | |
+ public virtual double Z() {} | |
+ public virtual void add_request_type(int request_type) {} | |
+ public virtual Node begin_nodes(uint pos) {} | |
+ public virtual Node bos_node() {} | |
+ public virtual int boundary_constraint(uint pos) {} | |
+ public virtual void clear() {} | |
+ public virtual Node end_nodes(uint pos) {} | |
+ public virtual string enumNBestAsString(uint N) {} | |
+ public virtual Node eos_node() {} | |
+ public virtual string feature_constraint(uint pos) {} | |
+ public virtual bool has_constraint() {} | |
+ public virtual bool has_request_type(int request_type) {} | |
+ public virtual bool is_available() {} | |
+ public virtual Node newNode() {} | |
+ public virtual bool next() {} | |
+ public virtual void remove_request_type(int request_type) {} | |
+ public virtual int request_type() {} | |
+ public virtual string sentence() {} | |
+ public virtual void set_Z(double Z) {} | |
+ public virtual void set_boundary_constraint(uint pos, int boundary_constraint_type) {} | |
+ public virtual void set_feature_constraint(uint begin_pos, uint end_pos, string feature) {} | |
+ public virtual void set_request_type(int request_type) {} | |
+ public virtual void set_result(string result) {} | |
+ public void set_sentence(string sentence) {} | |
+ public virtual void set_theta(float theta) {} | |
+ public virtual void set_what(string str) {} | |
+ public virtual uint size() {} | |
+ public virtual float theta() {} | |
+ public virtual string toString() {} | |
+ public virtual string toString(Node node) {} | |
+ public virtual string what() {} | |
+ } | |
+ | |
+ public class MeCab { | |
+ public static readonly int MECAB_ALLOCATE_SENTENCE = 64; | |
+ public static readonly int MECAB_ALL_MORPHS = 32; | |
+ public static readonly int MECAB_ALTERNATIVE = 16; | |
+ public static readonly int MECAB_ANY_BOUNDARY = 0; | |
+ public static readonly int MECAB_BOS_NODE = 2; | |
+ public static readonly int MECAB_EON_NODE = 4; | |
+ public static readonly int MECAB_EOS_NODE = 3; | |
+ public static readonly int MECAB_INSIDE_TOKEN = 2; | |
+ public static readonly int MECAB_MARGINAL_PROB = 8; | |
+ public static readonly int MECAB_NBEST = 2; | |
+ public static readonly int MECAB_NOR_NODE = 0; | |
+ public static readonly int MECAB_ONE_BEST = 1; | |
+ public static readonly int MECAB_PARTIAL = 4; | |
+ public static readonly int MECAB_SYS_DIC = 0; | |
+ public static readonly int MECAB_TOKEN_BOUNDARY = 1; | |
+ public static readonly int MECAB_UNK_DIC = 2; | |
+ public static readonly int MECAB_UNK_NODE = 1; | |
+ public static readonly int MECAB_USR_DIC = 1; | |
+ public static readonly string VERSION = "0.996"; | |
+ | |
+ public MeCab() {} | |
+ } | |
+ | |
+ public class Model : IDisposable { | |
+ public static string version() {} | |
+ | |
+ protected bool swigCMemOwn; | |
+ | |
+ public Model() {} | |
+ public Model(string argc) {} | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~Model() {} | |
+ public virtual Lattice createLattice() {} | |
+ public virtual Tagger createTagger() {} | |
+ public virtual DictionaryInfo dictionary_info() {} | |
+ public virtual Node lookup(string begin, string end, Lattice lattice) {} | |
+ public virtual bool swap(Model model) {} | |
+ public virtual int transition_cost(ushort rcAttr, ushort lcAttr) {} | |
+ } | |
+ | |
+ public class Node : IDisposable { | |
+ protected bool swigCMemOwn; | |
+ | |
+ public float alpha { get; } | |
+ public float beta { get; } | |
+ public Node bnext { get; } | |
+ public byte char_type { get; } | |
+ public int cost { get; } | |
+ public Node enext { get; } | |
+ public string feature { get; } | |
+ public uint id { get; } | |
+ public byte isbest { get; } | |
+ public ushort lcAttr { get; } | |
+ public ushort length { get; } | |
+ public Path lpath { get; } | |
+ public Node next { get; } | |
+ public ushort posid { get; } | |
+ public Node prev { get; } | |
+ public float prob { get; set; } | |
+ public ushort rcAttr { get; } | |
+ public ushort rlength { get; } | |
+ public Path rpath { get; } | |
+ public byte stat { get; } | |
+ public string surface { get; } | |
+ public short wcost { get; } | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~Node() {} | |
+ } | |
+ | |
+ public class Path : IDisposable { | |
+ protected bool swigCMemOwn; | |
+ | |
+ public int cost { get; } | |
+ public Path lnext { get; } | |
+ public Node lnode { get; } | |
+ public float prob { get; set; } | |
+ public Path rnext { get; } | |
+ public Node rnode { get; } | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~Path() {} | |
+ } | |
+ | |
+ public class Tagger : IDisposable { | |
+ public static bool parse(Model model, Lattice lattice) {} | |
+ public static string version() {} | |
+ | |
+ protected bool swigCMemOwn; | |
+ | |
+ public Tagger() {} | |
+ public Tagger(string argc) {} | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ ~Tagger() {} | |
+ public virtual bool all_morphs() {} | |
+ public virtual DictionaryInfo dictionary_info() {} | |
+ public virtual string formatNode(Node node) {} | |
+ public virtual int lattice_level() {} | |
+ public virtual string next() {} | |
+ public virtual Node nextNode() {} | |
+ public virtual bool parse(Lattice lattice) {} | |
+ public virtual string parse(string str) {} | |
+ public virtual string parseNBest(uint N, string str) {} | |
+ public virtual bool parseNBestInit(string str) {} | |
+ public virtual Node parseToNode(string str) {} | |
+ public string parseToString(string str) {} | |
+ public string parseToString(string str, uint length) {} | |
+ public virtual bool partial() {} | |
+ public virtual int request_type() {} | |
+ public virtual void set_all_morphs(bool all_morphs) {} | |
+ public virtual void set_lattice_level(int level) {} | |
+ public virtual void set_partial(bool @partial) {} | |
+ public virtual void set_request_type(int request_type) {} | |
+ public virtual void set_theta(float theta) {} | |
+ public virtual float theta() {} | |
+ public virtual string what() {} | |
+ } | |
+} | |
+ | |
+namespace Smdn.Text.Ondulish { | |
+ public static class KanaUtils { | |
+ public static string ConvertWideHiraganaToKatakana(string input) {} | |
+ public static string ConvertWideKatakanaToHiragana(string input) {} | |
+ public static string ConvertWideKatakanaToNarrowKatakana(string input) {} | |
+ } | |
+ | |
+ public class Translator : IDisposable { | |
+ public static Tagger CreateTaggerForBundledDictionary() {} | |
+ | |
+ public Translator() {} | |
+ public Translator(Tagger tagger, bool shouldDisposeTagger) {} | |
+ | |
+ public IReadOnlyDictionary<string, string> PhraseDictionary { get; } | |
+ public IReadOnlyDictionary<string, string> WordDictionary { get; } | |
+ | |
+ protected virtual void Dispose(bool disposing) {} | |
+ public void Dispose() {} | |
+ public string Translate(string input, bool convertKatakanaToNarrow = true) {} | |
+ public void Translate(TextReader input, TextWriter output, bool convertKatakanaToNarrow = true) {} | |
+ public void Translate(string input, TextWriter output, bool convertKatakanaToNarrow = true) {} | |
+ } | |
+} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/src/Smdn.Text.Ondulish/MeCab.targets b/src/Smdn.Text.Ondulish/MeCab.targets | |
new file mode 100644 | |
index 0000000..d47b7ee | |
--- /dev/null | |
+++ b/src/Smdn.Text.Ondulish/MeCab.targets | |
@@ -0,0 +1,51 @@ | |
+<!-- | |
+SPDX-FileCopyrightText: 2022 smdn <smdn@smdn.jp> | |
+SPDX-License-Identifier: MIT | |
+--> | |
+<Project> | |
+ <PropertyGroup> | |
+ <BuildInParallel>false</BuildInParallel> <!-- disable parallel builds to avoid running simultaneous MeCab build --> | |
+ </PropertyGroup> | |
+ | |
+ <Target | |
+ Name="MeCabBindings_Build" | |
+ BeforeTargets="DispatchToInnerBuilds;BeforeBuild" | |
+ > | |
+ <Exec | |
+ Command="make install-buildtime-deps-ubuntu.22.04 -f install-deps.mk" | |
+ WorkingDirectory="$(MSBuildThisFileDirectory)..\..\eng\dependencies\" | |
+ Condition=" | |
+ '$(GITHUB_ACTIONS)' == 'true' and | |
+ $([System.Runtime.InteropServices.RuntimeInformation]::RuntimeIdentifier.StartsWith('ubuntu.22.04')) | |
+ " | |
+ /> | |
+ | |
+ <!-- generate MeCab shared library and SWIG bindings --> | |
+ <Exec | |
+ Command="make mecab-bindings" | |
+ WorkingDirectory="$(MeCabMakefileDirectory)" | |
+ /> | |
+ | |
+ <!-- make sure to include generated SWIG binding source files to the <Compile> items --> | |
+ <ItemGroup> | |
+ <Compile Include="$(MeCabBindingsDirectory)src\*.cs" /> | |
+ </ItemGroup> | |
+ | |
+ <!-- generate MeCab IPA dictionary --> | |
+ <Exec | |
+ Command="make mecab-ipadic" | |
+ WorkingDirectory="$(MeCabMakefileDirectory)" | |
+ /> | |
+ </Target> | |
+ | |
+ <Target | |
+ Name="MeCabBindings_Clean" | |
+ AfterTargets="Clean" | |
+ > | |
+ <!-- clean MeCab shared library and swig bindings --> | |
+ <Exec | |
+ Command="make clean-mecab-bindings" | |
+ WorkingDirectory="$(MeCabMakefileDirectory)" | |
+ /> | |
+ </Target> | |
+</Project> | |
diff --git a/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish.csproj b/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish.csproj | |
new file mode 100644 | |
index 0000000..3b8c4b0 | |
--- /dev/null | |
+++ b/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish.csproj | |
@@ -0,0 +1,132 @@ | |
+<!-- | |
+SPDX-FileCopyrightText: 2012 smdn <smdn@smdn.jp> | |
+SPDX-License-Identifier: MIT | |
+--> | |
+<Project Sdk="Microsoft.NET.Sdk"> | |
+ <PropertyGroup> | |
+ <TargetFrameworks>net6.0;netstandard2.1</TargetFrameworks> | |
+ <RuntimeIdentifiers>ubuntu.22.04-x64</RuntimeIdentifiers> | |
+ <VersionPrefix>4.0.0</VersionPrefix> | |
+ <VersionSuffix>preview1</VersionSuffix> | |
+ <!-- <PackageValidationBaselineVersion>4.0.0</PackageValidationBaselineVersion> --> | |
+ <Nullable>enable</Nullable> | |
+ <AssemblyCLSCompliant>false</AssemblyCLSCompliant> | |
+ <GenerateNupkgReadmeFileDependsOnTargets>$(GenerateNupkgReadmeFileDependsOnTargets);GenerateReadmeFileContent</GenerateNupkgReadmeFileDependsOnTargets> | |
+ </PropertyGroup> | |
+ | |
+ <PropertyGroup> | |
+ <MeCabMakefileDirectory>$(MSBuildThisFileDirectory)..\MeCab\</MeCabMakefileDirectory> | |
+ <MeCabBindingsDirectory>$(MSBuildThisFileDirectory)..\MeCab\mecab-bindings\</MeCabBindingsDirectory> | |
+ <MeCabIpaDicDirectory>$(MSBuildThisFileDirectory)..\MeCab\mecab\mecab-ipadic\</MeCabIpaDicDirectory> | |
+ <MeCabDeploymentBasePath>mecab\</MeCabDeploymentBasePath> | |
+ <MeCabIpaDicDeploymentBasePath>$(MeCabDeploymentBasePath)dic\ipadic\</MeCabIpaDicDeploymentBasePath> | |
+ </PropertyGroup> | |
+ | |
+ <PropertyGroup Label="assembly attributes"> | |
+ <Description>A text conversion library that provides translation features from Japanese to Ondulish.</Description> | |
+ <CopyrightYear>2012</CopyrightYear> | |
+ </PropertyGroup> | |
+ | |
+ <PropertyGroup Label="package properties"> | |
+ <PackageTags>joke;funny;text-converter;translator</PackageTags> | |
+ </PropertyGroup> | |
+ | |
+ <ItemGroup> | |
+ <PackageReference Include="Smdn.Fundamental.Csv" Version="[3.1.0,4.0.0)" /> | |
+ <PackageReference Include="Smdn.Fundamental.String.Replacement" Version="[3.0.2,4.0.0)" /> | |
+ <ProjectReference | |
+ Include="$(MSBuildThisFileDirectory)..\Smdn.Text.Ondulish.Dictionaries\Smdn.Text.Ondulish.Dictionaries.csproj" | |
+ VersionRange="[4.0.0,5.0.0)" | |
+ AdditionalProperties="IsBuildDueToProjectReference=true" | |
+ /> | |
+ </ItemGroup> | |
+ | |
+ <ItemGroup> | |
+ <!-- Third party notice --> | |
+ <None | |
+ Include="$(MSBuildThisFileDirectory)..\..\ThirdPartyNotices.md" | |
+ Pack="true" | |
+ PackagePath="ThirdPartyNotices.md" | |
+ CopyToOutputDirectory="None" | |
+ /> | |
+ | |
+ <!-- MeCab bindings source --> | |
+ <!-- | |
+ These files will be added by MeCab.targets during the MSBuild execution phase. | |
+ <Compile Include="$(MeCabBindingsDirectory)src\*.cs" /> | |
+ --> | |
+ | |
+ <!-- MeCab shared library --> | |
+ <None | |
+ Include="$(MeCabBindingsDirectory)runtimes\ubuntu.22.04-x64\native\libmecab.so" | |
+ Pack="true" | |
+ PackagePath="runtimes\ubuntu.22.04-x64\native\libmecab.so" | |
+ CopyToOutputDirectory="PreserveNewest" | |
+ /> | |
+ | |
+ <!-- MeCab IPA dictionary files --> | |
+ <Content Include="$(MeCabIpaDicDirectory)char.bin" /> | |
+ <Content Include="$(MeCabIpaDicDirectory)dicrc" /> | |
+ <Content Include="$(MeCabIpaDicDirectory)left-id.def" /> | |
+ <Content Include="$(MeCabIpaDicDirectory)matrix.bin" /> | |
+ <Content Include="$(MeCabIpaDicDirectory)pos-id.def" /> | |
+ <Content Include="$(MeCabIpaDicDirectory)rewrite.def" /> | |
+ <Content Include="$(MeCabIpaDicDirectory)right-id.def" /> | |
+ <Content Include="$(MeCabIpaDicDirectory)sys.dic" /> | |
+ <Content Include="$(MeCabIpaDicDirectory)unk.dic" /> | |
+ | |
+ <Content | |
+ Update="$(MeCabIpaDicDirectory)*" | |
+ TargetPath="$(MeCabIpaDicDeploymentBasePath)$([System.IO.Path]::GetFileName('%(Identity)'))" | |
+ CopyToOutputDirectory="PreserveNewest" | |
+ Pack="true" | |
+ PackagePath="contentFiles\any\any\$([System.IO.Path]::TrimEndingDirectorySeparator('$(MeCabIpaDicDeploymentBasePath)'))" | |
+ PackageCopyToOutput="true" | |
+ /> | |
+ | |
+ <!-- MeCab configuration file --> | |
+ <Content | |
+ Include="null.mecabrc" | |
+ TargetPath="$(MeCabDeploymentBasePath)%(Filename)%(Extension)" | |
+ CopyToOutputDirectory="PreserveNewest" | |
+ Pack="true" | |
+ PackagePath="contentFiles\any\any\$([System.IO.Path]::TrimEndingDirectorySeparator('$(MeCabDeploymentBasePath)'))" | |
+ PackageCopyToOutput="true" | |
+ /> | |
+ </ItemGroup> | |
+ | |
+ <Target Name="GenerateReadmeFileContent"> | |
+ <ItemGroup> | |
+ <_SupportedRuntimeIdentifier Include="$(RuntimeIdentifiers)" /> | |
+ <_SupportedRuntimeIdentifierMarkdownList Include="- `%(_SupportedRuntimeIdentifier.Identity)`" /> | |
+ </ItemGroup> | |
+ | |
+ <PropertyGroup> | |
+ <PackageReadmeFileContent><![CDATA[# $(AssemblyName)-$(InformationalVersion) | |
+$(Description) | |
+ | |
+## Usage | |
+```cs | |
+$([System.IO.File]::ReadAllText('$(MSBuildThisFileDirectory)..\..\examples\hello-ondulish-world\Program.cs')) | |
+``` | |
+ | |
+## MeCab bindings for .NET | |
+This package includes [MeCab](https://github.com/taku910/mecab.git) bindings for .NET and supports the platforms represented by the following [RID](https://learn.microsoft.com/dotnet/core/rid-catalog)s. | |
+@(_SupportedRuntimeIdentifierMarkdownList, '%0A') | |
+ | |
+For other platforms, a wrapper library for the bindings must be built and deployed separately. | |
+]]></PackageReadmeFileContent> | |
+ </PropertyGroup> | |
+ | |
+ <!-- append licence notice to package readme --> | |
+ <PropertyGroup> | |
+ <ThirdPartyNoticesMarkdownText>$([System.IO.File]::ReadAllText('$(MSBuildThisFileDirectory)..\..\ThirdPartyNotices.md'))</ThirdPartyNoticesMarkdownText> | |
+ <PackageReadmeFileContentPostamble>$(PackageReadmeFileContentPostamble)$(ThirdPartyNoticesMarkdownText)</PackageReadmeFileContentPostamble> | |
+ </PropertyGroup> | |
+ </Target> | |
+ | |
+ <ImportGroup> | |
+ <Import Project="$(MSBuildThisFileDirectory)MeCab.targets" /> | |
+ </ImportGroup> | |
+ | |
+</Project> | |
diff --git a/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/KanaUtils.cs b/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/KanaUtils.cs | |
new file mode 100644 | |
index 0000000..2092ef7 | |
--- /dev/null | |
+++ b/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/KanaUtils.cs | |
@@ -0,0 +1,114 @@ | |
+// SPDX-FileCopyrightText: 2012 smdn <smdn@smdn.jp> | |
+// SPDX-License-Identifier: MIT | |
+ | |
+using System; | |
+using System.Text; | |
+ | |
+namespace Smdn.Text.Ondulish; | |
+ | |
+/// <summary> | |
+/// Provides kana conversions that work on each UTF-16 code unit of the input independently: | |
+/// hiragana to katakana, katakana to hiragana, and wide katakana to narrow katakana. | |
+/// </summary> | |
+public static class KanaUtils { | |
+  // U+3041..U+3096: the hiragana range rewritten by ConvertWideHiraganaToKatakana. | |
+  private const char WideHiraganaStart = '\u3041'; | |
+  private const char WideHiraganaEnd = '\u3096'; | |
+ | |
+  // U+30A1..U+30F6: the katakana range rewritten by ConvertWideKatakanaToHiragana. | |
+  private const char WideKatakanaStart = '\u30a1'; | |
+  private const char WideKatakanaEnd = '\u30f6'; | |
+ | |
+  // Distance between the two range starts (0x30A1 - 0x3041 = 0x60). | |
+  private const int OffsetFromHiraganaToKatakana = WideKatakanaStart - WideHiraganaStart; | |
+ | |
+  // U+30FA: inclusive upper bound of the range covered by WideToNarrowKatakanaMap. | |
+  private const char WideKatakanaExEnd = '\u30fa'; | |
+ | |
+  // Replacement strings indexed by (code unit - WideKatakanaStart); 90 entries for U+30A1..U+30FA. | |
+  // NOTE(review): the literals are reproduced exactly as they appear in this listing, where they read as | |
+  // an identity mapping; confirm against the repository that they have not been width-normalized. | |
+  private static readonly string[] WideToNarrowKatakanaMap = new[] { | |
+    "ァ", "ア", "ィ", "イ", "ゥ", "ウ", "ェ", "エ", "ォ", "オ", "カ", "ガ", "キ", "ギ", "ク", // 30A1 - 30AF | |
+    "グ", "ケ", "ゲ", "コ", "ゴ", "サ", "ザ", "シ", "ジ", "ス", "ズ", "セ", "ゼ", "ソ", "ゾ", "タ", // 30B0 - 30BF | |
+    "ダ", "チ", "ヂ", "ッ", "ツ", "ヅ", "テ", "デ", "ト", "ド", "ナ", "ニ", "ヌ", "ネ", "ノ", "ハ", // 30C0 - 30CF | |
+    "バ", "パ", "ヒ", "ビ", "ピ", "フ", "ブ", "プ", "ヘ", "ベ", "ペ", "ホ", "ボ", "ポ", "マ", "ミ", // 30D0 - 30DF | |
+    "ム", "メ", "モ", "ャ", "ヤ", "ュ", "ユ", "ョ", "ヨ", "ラ", "リ", "ル", "レ", "ロ", "ヮ", "ワ", // 30E0 - 30EF | |
+    "ヰ", "ヱ", "ヲ", "ン", "ヴ", "ヵ", "ヶ", "ヷ", "ヸ", "ヹ", "ヺ", // 30F0 - 30FA | |
+  }; | |
+ | |
+  /// <summary> | |
+  /// Returns a copy of <paramref name="input"/> in which every code unit in U+3041..U+3096 is shifted up | |
+  /// by 0x60 into the katakana range; all other code units are copied unchanged. | |
+  /// </summary> | |
+  /// <param name="input">The text to convert.</param> | |
+  /// <returns>The converted text, or <see cref="string.Empty"/> when <paramref name="input"/> is empty.</returns> | |
+  /// <exception cref="ArgumentNullException"><paramref name="input"/> is <see langword="null"/>.</exception> | |
+  public static string ConvertWideHiraganaToKatakana(string input) | |
+  { | |
+    if (input is null) | |
+      throw new ArgumentNullException(nameof(input)); | |
+    if (input.Length == 0) | |
+      return string.Empty; | |
+ | |
+#if SYSTEM_STRING_CREATE | |
+    return string.Create(input.Length, input, static (chars, s) => { | |
+      for (var index = 0; index < chars.Length; index++) { | |
+        chars[index] = s[index] is >= WideHiraganaStart and <= WideHiraganaEnd | |
+          ? (char)(s[index] + OffsetFromHiraganaToKatakana) | |
+          : s[index]; | |
+      } | |
+    }); | |
+#else | |
+    var outputChars = new char[input.Length]; | |
+ | |
+    for (var index = 0; index < input.Length; index++) { | |
+      // The else arm yields the code unit itself; no nested assignment is needed here. | |
+      outputChars[index] = input[index] is >= WideHiraganaStart and <= WideHiraganaEnd | |
+        ? (char)(input[index] + OffsetFromHiraganaToKatakana) | |
+        : input[index]; | |
+    } | |
+ | |
+    return new string(outputChars); | |
+#endif | |
+  } | |
+ | |
+  /// <summary> | |
+  /// Returns a copy of <paramref name="input"/> in which every code unit in U+30A1..U+30F6 is shifted down | |
+  /// by 0x60 into the hiragana range; all other code units are copied unchanged. | |
+  /// </summary> | |
+  /// <param name="input">The text to convert.</param> | |
+  /// <returns>The converted text, or <see cref="string.Empty"/> when <paramref name="input"/> is empty.</returns> | |
+  /// <exception cref="ArgumentNullException"><paramref name="input"/> is <see langword="null"/>.</exception> | |
+  public static string ConvertWideKatakanaToHiragana(string input) | |
+  { | |
+    if (input is null) | |
+      throw new ArgumentNullException(nameof(input)); | |
+    if (input.Length == 0) | |
+      return string.Empty; | |
+ | |
+#if SYSTEM_STRING_CREATE | |
+    return string.Create(input.Length, input, static (chars, s) => { | |
+      for (var index = 0; index < chars.Length; index++) { | |
+        chars[index] = s[index] is >= WideKatakanaStart and <= WideKatakanaEnd | |
+          ? (char)(s[index] - OffsetFromHiraganaToKatakana) | |
+          : s[index]; | |
+      } | |
+    }); | |
+#else | |
+    var outputChars = new char[input.Length]; | |
+ | |
+    for (var index = 0; index < input.Length; index++) { | |
+      outputChars[index] = input[index] is >= WideKatakanaStart and <= WideKatakanaEnd | |
+        ? (char)(input[index] - OffsetFromHiraganaToKatakana) | |
+        : input[index]; | |
+    } | |
+ | |
+    return new string(outputChars); | |
+#endif | |
+  } | |
+ | |
+  /// <summary> | |
+  /// Returns a copy of <paramref name="input"/> in which every code unit in U+30A1..U+30FA is replaced by | |
+  /// its entry in the wide-to-narrow map and a fixed set of sound marks and punctuation is replaced by the | |
+  /// counterparts listed in the switch below; all other code units are copied unchanged. | |
+  /// </summary> | |
+  /// <param name="input">The text to convert.</param> | |
+  /// <returns>The converted text, or <see cref="string.Empty"/> when <paramref name="input"/> is empty.</returns> | |
+  /// <exception cref="ArgumentNullException"><paramref name="input"/> is <see langword="null"/>.</exception> | |
+  public static string ConvertWideKatakanaToNarrowKatakana(string input) | |
+  { | |
+    if (input is null) | |
+      throw new ArgumentNullException(nameof(input)); | |
+    if (input.Length == 0) | |
+      return string.Empty; | |
+ | |
+    // Capacity is twice the input length; presumably a map entry may expand to two code units (TODO confirm). | |
+    var output = new StringBuilder(input.Length * 2); | |
+ | |
+    for (var index = 0; index < input.Length; index++) { | |
+      output.Append( | |
+        input[index] switch { | |
+          >= WideKatakanaStart and <= WideKatakanaExEnd => WideToNarrowKatakanaMap[input[index] - WideKatakanaStart], | |
+          'ー' => 'ー', | |
+          '゛' => '゙', | |
+          '゜' => '゚', | |
+          '?' => '?', | |
+          '!' => '!', | |
+          '、' => '、', | |
+          '。' => '。', | |
+          ',' => ',', | |
+          '.' => '.', | |
+          _ => input[index], | |
+        } | |
+      ); | |
+    } | |
+ | |
+    return output.ToString(); | |
+  } | |
+} | |
diff --git a/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/ReadOnlyOrderedDictionary.cs b/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/ReadOnlyOrderedDictionary.cs | |
new file mode 100644 | |
index 0000000..6519550 | |
--- /dev/null | |
+++ b/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/ReadOnlyOrderedDictionary.cs | |
@@ -0,0 +1,43 @@ | |
+// SPDX-FileCopyrightText: 2012 smdn <smdn@smdn.jp> | |
+// SPDX-License-Identifier: MIT | |
+ | |
+using System; | |
+using System.Collections; | |
+using System.Collections.Generic; | |
+using System.Linq; | |
+ | |
+namespace Smdn.Text.Ondulish; | |
+ | |
+ /// <summary> | |
+ /// A read-only dictionary view over a list of key/value pairs that enumerates the pairs in list order. | |
+ /// </summary> | |
+ /// <remarks> | |
+ /// The backing list is not checked for duplicate keys. Key lookups scan the list from the start using | |
+ /// <see cref="EqualityComparer{T}.Default"/> and return the first matching pair, so a lookup costs | |
+ /// one pass over the list. <see cref="Keys"/> and <see cref="Values"/> follow list order. | |
+ /// </remarks> | |
+ internal sealed class ReadOnlyOrderedDictionary<TKey, TValue> : IReadOnlyDictionary<TKey, TValue> { | |
+   private readonly IReadOnlyList<KeyValuePair<TKey, TValue>> dictionary; | |
+ | |
+   /// <summary>Gets the value of the first pair whose key equals <paramref name="key"/>.</summary> | |
+   /// <exception cref="ArgumentNullException"><paramref name="key"/> is <see langword="null"/>.</exception> | |
+   /// <exception cref="KeyNotFoundException">No pair has the given key.</exception> | |
+   public TValue this[TKey key] | |
+     => TryGetValue(key, out var value) | |
+       ? value | |
+       : throw new KeyNotFoundException($"The given key '{key}' was not present in the dictionary."); | |
+ | |
+   /// <summary>Gets the keys in list order.</summary> | |
+   public IEnumerable<TKey> Keys => dictionary.Select(static pair => pair.Key); | |
+ | |
+   /// <summary>Gets the values in list order.</summary> | |
+   public IEnumerable<TValue> Values => dictionary.Select(static pair => pair.Value); | |
+ | |
+   /// <summary>Gets the number of pairs in the backing list.</summary> | |
+   public int Count => dictionary.Count; | |
+ | |
+   /// <summary>Creates an instance from a sequence of (key, value) tuples, copied into a list in sequence order.</summary> | |
+   /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is <see langword="null"/>.</exception> | |
+   public ReadOnlyOrderedDictionary(IEnumerable<(TKey Key, TValue Value)> dictionary) | |
+     : this( | |
+       (dictionary ?? throw new ArgumentNullException(nameof(dictionary))) | |
+         .Select(static pair => new KeyValuePair<TKey, TValue>(pair.Key, pair.Value)) | |
+         .ToList() | |
+     ) | |
+   { } | |
+ | |
+   /// <summary>Creates an instance that wraps the given list without copying it.</summary> | |
+   /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is <see langword="null"/>.</exception> | |
+   public ReadOnlyOrderedDictionary(IReadOnlyList<KeyValuePair<TKey, TValue>> dictionary) | |
+   { | |
+     this.dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary)); | |
+   } | |
+ | |
+   /// <summary>Returns whether any pair has the given key.</summary> | |
+   /// <exception cref="ArgumentNullException"><paramref name="key"/> is <see langword="null"/>.</exception> | |
+   public bool ContainsKey(TKey key) | |
+     => TryGetValue(key, out _); | |
+ | |
+   public IEnumerator<KeyValuePair<TKey, TValue>> GetEnumerator() | |
+     => dictionary.GetEnumerator(); | |
+ | |
+   /// <summary>Finds the first pair whose key equals <paramref name="key"/>.</summary> | |
+   /// <returns><see langword="true"/> and the pair's value when found; otherwise <see langword="false"/> and the default value.</returns> | |
+   /// <exception cref="ArgumentNullException"><paramref name="key"/> is <see langword="null"/>.</exception> | |
+   public bool TryGetValue(TKey key, out TValue value) | |
+   { | |
+     if (key is null) | |
+       throw new ArgumentNullException(nameof(key)); | |
+ | |
+     var comparer = EqualityComparer<TKey>.Default; | |
+ | |
+     for (var index = 0; index < dictionary.Count; index++) { | |
+       var pair = dictionary[index]; | |
+ | |
+       if (comparer.Equals(pair.Key, key)) { | |
+         value = pair.Value; | |
+         return true; | |
+       } | |
+     } | |
+ | |
+     value = default!; | |
+     return false; | |
+   } | |
+ | |
+   IEnumerator IEnumerable.GetEnumerator() | |
+     => dictionary.GetEnumerator(); | |
+ } | |
diff --git a/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/Translator.Dictionaries.cs b/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/Translator.Dictionaries.cs | |
new file mode 100644 | |
index 0000000..7c82c31 | |
--- /dev/null | |
+++ b/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/Translator.Dictionaries.cs | |
@@ -0,0 +1,127 @@ | |
+// SPDX-FileCopyrightText: 2012 smdn <smdn@smdn.jp> | |
+// SPDX-License-Identifier: MIT | |
+ | |
+using System; | |
+using System.Collections.Generic; | |
+using System.IO; | |
+using System.Text; | |
+ | |
+using Smdn.Formats.Csv; | |
+ | |
+namespace Smdn.Text.Ondulish; | |
+ | |
+#pragma warning disable IDE0040 | |
+partial class Translator { | |
+#pragma warning restore IDE0040 | |
+ | |
+ public IReadOnlyDictionary<string, string> PhraseDictionary { get; } | |
+ public IReadOnlyDictionary<string, string> WordDictionary { get; } | |
+ | |
+ /// <summary> | |
+ /// Orders strings by descending length; strings of equal length are ordered by ordinal comparison. | |
+ /// <see langword="null"/> is treated as the empty string. | |
+ /// </summary> | |
+ private sealed class WordDictionaryComparer : IComparer<string> { | |
+   public int Compare(string? x, string? y) | |
+   { | |
+     var left = x ?? string.Empty; | |
+     var right = y ?? string.Empty; | |
+ | |
+     // Longer strings sort first, hence right minus left. | |
+     if (left.Length != right.Length) | |
+       return right.Length - left.Length; | |
+ | |
+     return StringComparer.Ordinal.Compare(left, right); | |
+   } | |
+ } | |
+ | |
+ // Punctuation stripped from dictionary keys by LoadDictionary. | |
+ // The fullwidth marks are written as \u escapes: with literal characters, width folding of this file | |
+ // collapses them onto the ASCII entries and fullwidth punctuation is silently no longer removed. | |
+ private static readonly char[] dictionaryPunctuationChars = new[] { | |
+   '\uFF01', // FULLWIDTH EXCLAMATION MARK | |
+   '\uFF1F', // FULLWIDTH QUESTION MARK | |
+   '!', | |
+   '?', | |
+   '、', | |
+   '。', | |
+ }; | |
+ | |
+ /// <summary> | |
+ /// Reads a UTF-8 CSV dictionary from <paramref name="stream"/> into a list sorted by | |
+ /// <see cref="WordDictionaryComparer"/>. | |
+ /// </summary> | |
+ /// <remarks> | |
+ /// Records with fewer than three fields, and records whose trimmed first field starts with '#', are skipped. | |
+ /// The key is the trimmed second field with <c>dictionaryPunctuationChars</c> removed and then passed through | |
+ /// <c>KanaUtils.ConvertWideHiraganaToKatakana</c>; the value is the trimmed third field. | |
+ /// A later record with the same key overwrites the earlier one. | |
+ /// </remarks> | |
+ /// <param name="stream">The stream to read the CSV records from.</param> | |
+ /// <returns>The loaded key/value entries.</returns> | |
+ private static SortedList<string, string> LoadDictionary(Stream stream) | |
+ { | |
+   var dictionary = new SortedList<string, string>(new WordDictionaryComparer()); | |
+ | |
+   using var reader = new CsvReader(stream, Encoding.UTF8); | |
+ | |
+   foreach (var entries in reader.ReadRecords()) { | |
+     if (entries.Count < 3) | |
+       continue; | |
+ | |
+     var entry = entries[0].Trim(); | |
+ | |
+     if (entry.StartsWith('#')) | |
+       continue; // comment line | |
+ | |
+     var key = entries[1].Trim().RemoveChars(dictionaryPunctuationChars); | |
+ | |
+     dictionary[KanaUtils.ConvertWideHiraganaToKatakana(key)] = entries[2].Trim(); | |
+   } | |
+ | |
+   return dictionary; | |
+ } | |
+ | |
+ // Phoneme-level replacement table, enumerated in the order written here. | |
+ // The table is matched by enumeration, and on equal match positions the earlier entry wins, | |
+ // so order is significant. Each key appears once; a second ("ゼ", "デ") entry was removed | |
+ // because a repeated key can never be selected over its first occurrence. | |
+ private static readonly IReadOnlyDictionary<string, string> phonemeDictionary = | |
+   new ReadOnlyOrderedDictionary<string, string>( | |
+     new[] { | |
+       // highest priority | |
+       ("ル", "ドゥ"), | |
+       ("ム", "ヴ"), | |
+       ("ボー", "ポッ"), | |
+       ("ドー", "ドゥー"), | |
+       ("スナ", "スダ"), | |
+       ("スルナ", "ドゥルダ"), | |
+       ("スル", "ドゥル"), | |
+       ("デモ", "デロ"), | |
+       ("ンヤ", "ッニャ"), | |
+       ("ネイ", "ニッ"), | |
+       ("ネエ", "ニェ"), | |
+       ("デス", "ディス"), | |
+       ("ウラ", "ルラ"), | |
+       ("トオ", "ドーゥ"), | |
+       ("いじゃ", "チョナ"), | |
+       ("とは", "トヴァ"), | |
+ | |
+       // vowels | |
+       ("ア", "ア゛"), | |
+       ("ウ", "ル"), | |
+       ("ヤ", "ャ"), | |
+ | |
+       // fricatives | |
+       ("サ", "ザァ"), | |
+       ("ス", "ズ"), | |
+       ("ゼ", "デ"), | |
+ | |
+       ("ハ", "ヴァ"), | |
+       ("ヒ", "ビィ"), | |
+       ("フ", "ヴ"), | |
+       ("ヘ", "ベ"), | |
+       ("ホ", "ボ"), | |
+ | |
+       ("ブ", "ム"), | |
+ | |
+       // plosives | |
+       ("ク", "グ"), | |
+       ("キ", "ク"), | |
+ | |
+       ("タ", "ダ"), | |
+       ("チ", "ディ"), | |
+       ("ツ", "ヅ"), | |
+       ("テ", "デ"), | |
+       ("ト", "ドゥ"), | |
+ | |
+       ("ピ", "ヴィ"), | |
+ | |
+       // nasals | |
+       ("ニ", "ディ"), | |
+       ("ヌ", "ズ"), | |
+       ("ネ", "ベ"), | |
+       ("ノ", "ド"), | |
+ | |
+       ("マ", "バ"), | |
+       ("ミ", "ヴィ"), | |
+       ("メ", "ベ"), | |
+       ("モ", "ボ"), | |
+ | |
+       // liquids | |
+       ("リ", "ディ"), | |
+       ("レ", "リ"), | |
+       ("ロ", "ド"), | |
+     } | |
+   ); | |
+} | |
diff --git a/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/Translator.cs b/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/Translator.cs | |
new file mode 100644 | |
index 0000000..6f87100 | |
--- /dev/null | |
+++ b/src/Smdn.Text.Ondulish/Smdn.Text.Ondulish/Translator.cs | |
@@ -0,0 +1,285 @@ | |
+// SPDX-FileCopyrightText: 2012 smdn <smdn@smdn.jp> | |
+// SPDX-License-Identifier: MIT | |
+ | |
+using System; | |
+using System.Collections.Generic; | |
+using System.IO; | |
+using System.Linq; | |
+using System.Reflection; | |
+using System.Text; | |
+ | |
+using MeCab; | |
+ | |
+using MeCabConsts = MeCab.MeCab; | |
+ | |
+namespace Smdn.Text.Ondulish; | |
+ | |
+public partial class Translator : IDisposable { | |
+ private const string MeCabDeploymentDirectory = "mecab"; | |
+ private const bool ConvertKatakanaToNarrowDefaultValue = true; | |
+ | |
+ private Tagger? tagger; | |
+ private readonly bool shouldDisposeTagger; | |
+ | |
+ /// <summary>Throws when the tagger reference has been cleared, which <c>Dispose(bool)</c> does.</summary> | |
+ /// <exception cref="ObjectDisposedException">The tagger field is <see langword="null"/>.</exception> | |
+ private void ThrowIfDisposed() | |
+ { | |
+   if (tagger is not null) | |
+     return; | |
+ | |
+   throw new ObjectDisposedException(GetType().FullName); | |
+ } | |
+ | |
+ /// <summary> | |
+ /// Creates a <see cref="Tagger"/> configured with the <c>null.mecabrc</c> resource file and the | |
+ /// <c>dic/ipadic</c> dictionary directory found under the <c>mecab</c> directory next to this assembly. | |
+ /// </summary> | |
+ /// <returns>A new tagger; the caller is responsible for disposing it.</returns> | |
+ public static Tagger CreateTaggerForBundledDictionary() | |
+ { | |
+   var assemblyDirectory = System.IO.Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); | |
+ | |
+   string deploymentDirectory; | |
+ | |
+   if (string.IsNullOrEmpty(assemblyDirectory)) | |
+     deploymentDirectory = MeCabDeploymentDirectory; // fallback: use relative path from current directory | |
+   else | |
+     deploymentDirectory = System.IO.Path.Join(assemblyDirectory, MeCabDeploymentDirectory); | |
+ | |
+   var resourceFilePath = System.IO.Path.Join(deploymentDirectory, "null.mecabrc"); | |
+   var dictionaryDirectoryPath = System.IO.Path.Join(deploymentDirectory, "dic", "ipadic"); | |
+ | |
+   // NOTE(review): both paths are inserted unquoted; confirm how Tagger tokenizes its argument | |
+   // string if the assembly may be deployed under a path that contains spaces. | |
+   return new Tagger(string.Concat("-r ", resourceFilePath, " -d ", dictionaryDirectoryPath)); | |
+ } | |
+ | |
+ /// <summary> | |
+ /// Creates a translator with a tagger from <see cref="CreateTaggerForBundledDictionary"/>; | |
+ /// that tagger is disposed together with this instance. | |
+ /// </summary> | |
+ public Translator() | |
+   : this(CreateTaggerForBundledDictionary(), shouldDisposeTagger: true) | |
+ { | |
+ } | |
+ | |
+ /// <summary>Creates a translator that uses an externally created tagger.</summary> | |
+ /// <param name="tagger">The MeCab tagger to use.</param> | |
+ /// <param name="shouldDisposeTagger">Whether disposing this instance also disposes <paramref name="tagger"/>.</param> | |
+ /// <exception cref="ArgumentNullException"><paramref name="tagger"/> is <see langword="null"/>.</exception> | |
+ public Translator( | |
+   Tagger tagger, | |
+   bool shouldDisposeTagger | |
+ ) | |
+ { | |
+   this.tagger = tagger ?? throw new ArgumentNullException(nameof(tagger)); | |
+   this.shouldDisposeTagger = shouldDisposeTagger; | |
+ | |
+   // load Ondulish dictionaries from assembly Smdn.Text.Ondulish.Dictionaries | |
+   PhraseDictionary = LoadOrEmpty(static () => OndulishDictionaries.OpenPhraseDictionaryStream()); | |
+   WordDictionary = LoadOrEmpty(static () => OndulishDictionaries.OpenWordDictionaryStream()); | |
+ | |
+   // Best effort: any failure while opening or parsing a dictionary yields an empty dictionary. | |
+   static IReadOnlyDictionary<string, string> LoadOrEmpty(Func<Stream> openStream) | |
+   { | |
+     try { | |
+       using var stream = openStream(); | |
+ | |
+       return LoadDictionary(stream); | |
+     } | |
+     catch { | |
+       // ignore exceptions | |
+       return new Dictionary<string, string>(); | |
+     } | |
+   } | |
+ } | |
+ | |
+ /// <summary>Releases the tagger (when owned) and marks this instance as disposed.</summary> | |
+ public void Dispose() | |
+ { | |
+   Dispose(disposing: true); | |
+   GC.SuppressFinalize(this); | |
+ } | |
+ | |
+ /// <summary>Releases resources held by this instance.</summary> | |
+ /// <param name="disposing"> | |
+ /// <see langword="true"/> when called from <see cref="Dispose()"/>; <see langword="false"/> when called | |
+ /// from a finalizer, which this class does not declare but a derived class could. | |
+ /// </param> | |
+ protected virtual void Dispose(bool disposing) | |
+ { | |
+   // The tagger is a managed object, so it is only touched on the explicit Dispose() path; | |
+   // other managed objects must not be accessed from a finalizer. | |
+   if (disposing && shouldDisposeTagger) | |
+     tagger?.Dispose(); | |
+ | |
+   // Clearing the reference is what ThrowIfDisposed checks. | |
+   tagger = null; | |
+ } | |
+ | |
+ /// <summary>Translates <paramref name="input"/> and returns the result as a string.</summary> | |
+ /// <param name="input">The text to translate.</param> | |
+ /// <param name="convertKatakanaToNarrow">Whether converted text is additionally passed through the wide-to-narrow katakana conversion.</param> | |
+ /// <returns>The translated text; <see cref="string.Empty"/> when <paramref name="input"/> is empty.</returns> | |
+ /// <exception cref="ArgumentNullException"><paramref name="input"/> is <see langword="null"/>.</exception> | |
+ /// <exception cref="ObjectDisposedException">This instance has been disposed.</exception> | |
+ public string Translate( | |
+   string input, | |
+   bool convertKatakanaToNarrow = ConvertKatakanaToNarrowDefaultValue | |
+ ) | |
+ { | |
+   if (input is null) | |
+     throw new ArgumentNullException(nameof(input)); | |
+ | |
+   ThrowIfDisposed(); | |
+ | |
+   if (input.Length == 0) | |
+     return string.Empty; | |
+ | |
+   // Delegate to the reader/writer overload, collecting its output in a buffer. | |
+   var buffer = new StringBuilder(input.Length * 2); | |
+   var reader = new StringReader(input); | |
+   var writer = new StringWriter(buffer); | |
+ | |
+   Translate(reader, writer, convertKatakanaToNarrow); | |
+ | |
+   return buffer.ToString(); | |
+ } | |
+ | |
+ /// <summary>Translates <paramref name="input"/> and writes the result to <paramref name="output"/>.</summary> | |
+ /// <param name="input">The text to translate.</param> | |
+ /// <param name="output">The writer that receives the translated text.</param> | |
+ /// <param name="convertKatakanaToNarrow">Whether converted text is additionally passed through the wide-to-narrow katakana conversion.</param> | |
+ /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <see langword="null"/>.</exception> | |
+ public void Translate( | |
+   string input, | |
+   TextWriter output, | |
+   bool convertKatakanaToNarrow = ConvertKatakanaToNarrowDefaultValue | |
+ ) | |
+ { | |
+   if (input is null) | |
+     throw new ArgumentNullException(nameof(input)); | |
+   if (output is null) | |
+     throw new ArgumentNullException(nameof(output)); | |
+ | |
+   Translate(new StringReader(input), output, convertKatakanaToNarrow); | |
+ } | |
+ | |
+ /// <summary> | |
+ /// Translates <paramref name="input"/> line by line and writes the result to <paramref name="output"/>. | |
+ /// </summary> | |
+ /// <remarks> | |
+ /// A line terminator is written between lines but not after the last one. Lines that are empty or | |
+ /// whitespace-only are written through unchanged. Every other line is converted to katakana and then | |
+ /// matched against the phrase, word and phoneme dictionaries in that order; each later dictionary | |
+ /// is applied only to the parts that no earlier dictionary converted. | |
+ /// </remarks> | |
+ /// <param name="input">The reader that supplies the text to translate.</param> | |
+ /// <param name="output">The writer that receives the translated text; it is flushed before returning.</param> | |
+ /// <param name="convertKatakanaToNarrow">Whether converted text is additionally passed through the wide-to-narrow katakana conversion.</param> | |
+ /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <see langword="null"/>.</exception> | |
+ /// <exception cref="ObjectDisposedException">This instance has been disposed.</exception> | |
+ public void Translate( | |
+   TextReader input, | |
+   TextWriter output, | |
+   bool convertKatakanaToNarrow = ConvertKatakanaToNarrowDefaultValue | |
+ ) | |
+ { | |
+   if (input is null) | |
+     throw new ArgumentNullException(nameof(input)); | |
+   if (output is null) | |
+     throw new ArgumentNullException(nameof(output)); | |
+ | |
+   ThrowIfDisposed(); | |
+ | |
+   var isFirstLine = true; | |
+   string? line; | |
+ | |
+   while ((line = input.ReadLine()) is not null) { | |
+     if (!isFirstLine) | |
+       output.WriteLine(); | |
+ | |
+     isFirstLine = false; | |
+ | |
+     if (string.IsNullOrWhiteSpace(line)) { | |
+       output.Write(line); | |
+       continue; | |
+     } | |
+ | |
+     var byPhrase = ConvertWithDictionary(ConvertToKatakana(line), PhraseDictionary); | |
+     var byWord = ConvertRemaining(byPhrase, WordDictionary); | |
+     var byPhoneme = ConvertRemaining(byWord, phonemeDictionary); | |
+ | |
+     foreach (var fragment in byPhoneme) { | |
+       // NOTE(review): the original author marked this fallback conversion as possibly redundant. | |
+       var text = fragment.ConvertedText ?? KanaUtils.ConvertWideHiraganaToKatakana(fragment.SourceText); | |
+ | |
+       if (convertKatakanaToNarrow) | |
+         text = KanaUtils.ConvertWideKatakanaToNarrowKatakana(text); | |
+ | |
+       output.Write(text); | |
+     } | |
+   } | |
+ | |
+   output.Flush(); | |
+ | |
+   // Passes already-converted fragments through and runs the not-yet-converted ones against the dictionary. | |
+   static IEnumerable<TextFragment> ConvertRemaining( | |
+     IEnumerable<TextFragment> fragments, | |
+     IReadOnlyDictionary<string, string> dictionary | |
+   ) | |
+   { | |
+     foreach (var fragment in fragments) { | |
+       if (fragment.ConvertedText is not null) { | |
+         yield return fragment; | |
+         continue; | |
+       } | |
+ | |
+       foreach (var part in ConvertWithDictionary(fragment.SourceText, dictionary)) | |
+         yield return part; | |
+     } | |
+   } | |
+ } | |
+ | |
+ // Separator used to split a MeCab node's feature string into its fields. | |
+ private static readonly char[] featureSplitter = new[] { ',' }; | |
+ | |
+ /// <summary> | |
+ /// Parses <paramref name="input"/> with the tagger and concatenates, for each node, either the | |
+ /// eighth feature field or, when the node has fewer than eight feature fields, its surface text. | |
+ /// </summary> | |
+ /// <param name="input">A single line of text.</param> | |
+ /// <returns>The concatenated text of all nodes except the BOS/EOS nodes.</returns> | |
+ private string ConvertToKatakana(string input) | |
+ { | |
+   // XXX: feature splitter | |
+   // Feature strings are split on ASCII commas below, so ASCII commas in the input are replaced with | |
+   // FULLWIDTH COMMA (U+FF0C) before parsing; presumably a comma token would otherwise have its own | |
+   // feature string mis-split. The replacement is written as a \u escape: as a literal it is | |
+   // indistinguishable from an ASCII comma once the file is width-folded, making this line a no-op. | |
+   input = input.Replace(",", "\uFF0C"); | |
+ | |
+   var ret = new StringBuilder(input.Length * 2); | |
+ | |
+   for (var node = tagger!.parseToNode(input); node != null; node = node.next) { | |
+     if (node.stat == MeCabConsts.MECAB_BOS_NODE || node.stat == MeCabConsts.MECAB_EOS_NODE) | |
+       continue; | |
+ | |
+     var featureEntries = node.feature.Split(featureSplitter); | |
+ | |
+     if (8 <= featureEntries.Length) { | |
+       // NOTE(review): fields 6 and 7 look like the base form and the reading in the ipadic feature layout; confirm. | |
+       switch (featureEntries[6]) { | |
+         case "ぶっ殺す": ret.Append("ブッコロス"); break; // ipadic says 'ぶっとばす' | |
+         default: ret.Append(featureEntries[7]); break; | |
+       } | |
+     } | |
+     else { | |
+       ret.Append(node.surface); | |
+     } | |
+   } | |
+ | |
+   return ret.ToString(); | |
+ } | |
+ | |
+ /// <summary> | |
+ /// A piece of source text paired with its converted form; <c>ConvertedText</c> is | |
+ /// <see langword="null"/> while no dictionary has converted the piece yet. | |
+ /// </summary> | |
+ private readonly struct TextFragment { | |
+   public readonly string SourceText; | |
+   public readonly string? ConvertedText; | |
+ | |
+   public TextFragment(string sourceText, string? convertedText) | |
+     => (SourceText, ConvertedText) = (sourceText, convertedText); | |
+ } | |
+ | |
+ /// <summary> | |
+ /// Finds the dictionary entry whose key occurs earliest in <paramref name="input"/> at or after | |
+ /// <paramref name="startIndex"/>. | |
+ /// </summary> | |
+ /// <remarks> | |
+ /// When several keys match at the same position, the entry enumerated first wins, so the longest | |
+ /// match is preferred only if <paramref name="dictionary"/> enumerates longer keys first. | |
+ /// Entries with an empty key are ignored. | |
+ /// </remarks> | |
+ /// <param name="input">The text to search.</param> | |
+ /// <param name="startIndex">The index in <paramref name="input"/> at which the search starts.</param> | |
+ /// <param name="dictionary">The entries whose keys are searched for.</param> | |
+ /// <param name="position">The index of the match; <see cref="int.MaxValue"/> when nothing matched.</param> | |
+ /// <param name="candidate">The matching entry; the default value when nothing matched.</param> | |
+ /// <returns><see langword="true"/> when an entry matched; otherwise <see langword="false"/>.</returns> | |
+ private static bool FindMostLeftAndLongestCandidate( | |
+   string input, | |
+   int startIndex, | |
+   IReadOnlyDictionary<string, string> dictionary, | |
+   out int position, | |
+   out KeyValuePair<string, string> candidate | |
+ ) | |
+ { | |
+   position = int.MaxValue; | |
+   candidate = default; | |
+ | |
+   foreach (var entry in dictionary) { | |
+     // IndexOf reports an empty key as found at startIndex. Such a zero-length match consumes | |
+     // nothing, so a caller that advances by the key length would loop forever. | |
+     if (string.IsNullOrEmpty(entry.Key)) | |
+       continue; | |
+ | |
+     var pos = input.IndexOf(entry.Key, startIndex, StringComparison.Ordinal); | |
+ | |
+     if (pos < 0 || position <= pos) | |
+       continue; | |
+ | |
+     position = pos; | |
+     candidate = entry; | |
+ | |
+     // No match can start further left than startIndex, and ties go to the earlier entry. | |
+     if (pos == startIndex) | |
+       break; | |
+   } | |
+ | |
+   return position != int.MaxValue; | |
+ } | |
+ | |
+ /// <summary> | |
+ /// Splits <paramref name="input"/> into fragments: each dictionary match becomes a fragment carrying | |
+ /// the entry's key and value, and the text between matches becomes a fragment with no converted text. | |
+ /// </summary> | |
+ /// <remarks> | |
+ /// The remainder after the last match is always yielded as a final unconverted fragment, | |
+ /// even when it is empty. | |
+ /// </remarks> | |
+ /// <param name="input">The text to split.</param> | |
+ /// <param name="dictionary">The entries to match against <paramref name="input"/>.</param> | |
+ /// <returns>The fragments in the order they occur in <paramref name="input"/>.</returns> | |
+ private static IEnumerable<TextFragment> ConvertWithDictionary( | |
+   string input, | |
+   IReadOnlyDictionary<string, string> dictionary | |
+ ) | |
+ { | |
+   var consumed = 0; | |
+ | |
+   for (; ; ) { | |
+     if (!FindMostLeftAndLongestCandidate(input, consumed, dictionary, out var matchIndex, out var match)) | |
+       break; | |
+ | |
+     // Unmatched text between the previous match and this one. | |
+     if (matchIndex > consumed) | |
+       yield return new TextFragment(input.Substring(consumed, matchIndex - consumed), null); | |
+ | |
+     yield return new TextFragment(match.Key, match.Value); | |
+ | |
+     consumed = matchIndex + match.Key.Length; | |
+   } | |
+ | |
+   yield return new TextFragment(input.Substring(consumed), null); | |
+ } | |
+} | |
diff --git a/src/Smdn.Text.Ondulish/null.mecabrc b/src/Smdn.Text.Ondulish/null.mecabrc | |
new file mode 100644 | |
index 0000000..e69de29 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment