Created
September 1, 2012 08:14
-
-
Save zhuth/3567022 to your computer and use it in GitHub Desktop.
尝试用这篇post: http://www.matrix67.com/blog/archives/5044 中的方法实现的一个自动中文抽词算法的 C# 程序,大量参考了 @lastland 兄的 Python 实现,速度应该可以快一点。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// 这是一个 CsSC 脚本文件,要使用标准 C#,请去掉 #function 和 #endfunction 这两个指令,并将其余代码放在 Main 函数中,接受 string[] args 为其参数。
// 要直接运行,请移步 https://github.com/zhuth/Tools/ ,下载后解压 bin\CsSC.exe 和相关文件。
#reference System.Core.dll | |
#function | |
/// <summary>
/// Extracts candidate words from a text file without a dictionary.
/// A substring becomes a candidate when it is frequent enough and much more
/// probable than the best product of the probabilities of its two-part splits;
/// a candidate is kept as a final word when the entropy of the characters seen
/// on its left and on its right both reach <see cref="EntropyThreshold"/>.
/// </summary>
/// <remarks>
/// The input file is opened by the constructor and consumed by a single call to
/// <see cref="Process"/> (or <see cref="StartProcess"/>); results are then exposed
/// through <see cref="FinalWords"/> and <see cref="Freq"/>.
/// </remarks>
public class WordDetector {
    /// <summary>Optional callback invoked once at the very end of <see cref="Process"/>.</summary>
    public Action ProcessOver = null;

    /// <summary>
    /// A character observed next to a candidate word, together with the side
    /// (left or right) on which it was seen. Used as a dictionary key.
    /// </summary>
    internal struct CharPos : IEquatable<CharPos> {
        public char ThisChar;
        public bool PositionOnRight;

        public CharPos(char value, bool positionOnRight) {
            this.ThisChar = value;
            this.PositionOnRight = positionOnRight;
        }

        // Explicit equality members keep dictionary lookups from falling back to
        // the generic ValueType implementation.
        public bool Equals(CharPos other) {
            return ThisChar == other.ThisChar && PositionOnRight == other.PositionOnRight;
        }

        public override bool Equals(object obj) {
            return obj is CharPos && Equals((CharPos)obj);
        }

        public override int GetHashCode() {
            return (ThisChar << 1) | (PositionOnRight ? 1 : 0);
        }
    }

    /// <summary>Longest candidate length (in chars) and minimum occurrence count of a candidate.</summary>
    public const int MaxWordLength = 5,
                     MinFreq = 10;

    /// <summary>
    /// PSvPThreshold: minimum ratio between a candidate's probability and the largest
    /// product of the probabilities of its two parts.
    /// EntropyThreshold: minimum entropy required of both the left and the right neighbours.
    /// </summary>
    public const double PSvPThreshold = 100,
                        EntropyThreshold = 1.0;

    HashSet<string> finalWords = new HashSet<string>();
    // candidate word -> (neighbouring character + side -> weighted count)
    Dictionary<string, Dictionary<CharPos, int>> words = new Dictionary<string, Dictionary<CharPos, int>>();
    // substring -> number of occurrences (substrings of length 1 .. MaxWordLength + 2)
    Dictionary<string, int> freq = new Dictionary<string, int>();
    // substring -> occurrences / total counted occurrences
    Dictionary<string, double> ps = new Dictionary<string, double>();
    // Splits a line on runs of non-word characters and on runs of ASCII letters/digits.
    Regex regSplit = new Regex(@"\W+|[a-zA-Z0-9]+", RegexOptions.Compiled | RegexOptions.Multiline);
    StreamReader sr = null;
    int total = 0;
    string _filename = "";

    /// <summary>Words accepted by the last call to <see cref="Process"/>.</summary>
    public HashSet<string> FinalWords {
        get {
            return finalWords;
        }
    }

    /// <summary>Occurrence count of every substring counted while reading the input.</summary>
    public Dictionary<string, int> Freq {
        get {
            return freq;
        }
    }

    /// <summary>Creates a detector and opens <paramref name="filename"/> for reading.</summary>
    /// <param name="filename">Path of the text file to analyse.</param>
    public WordDetector (string filename)
    {
        _filename = filename;
        renewStreamReader();
    }

    // Opens the input file with the encoding reported by CsSC.EncodingType for it.
    private void renewStreamReader () {
        sr = new StreamReader(_filename, CsSC.EncodingType.GetType(_filename));
    }

    /// <summary>Runs <see cref="Process"/> on a new thread and returns immediately.</summary>
    public void StartProcess ()
    {
        System.Threading.Thread thr = new System.Threading.Thread(new System.Threading.ThreadStart(Process));
        thr.Start();
    }

    /// <summary>
    /// Computes the entropy (natural logarithm) of the characters recorded on the
    /// left and on the right of <paramref name="word"/>.
    /// A side with no recorded neighbours yields <see cref="double.MaxValue"/>.
    /// </summary>
    /// <param name="word">A key of the candidate table.</param>
    /// <param name="leftEntropy">Entropy of the left-hand neighbours.</param>
    /// <param name="rightEntropy">Entropy of the right-hand neighbours.</param>
    private void wordInfoEntropy (string word, out double leftEntropy, out double rightEntropy)
    {
        Dictionary<CharPos, int> neighbours = words[word];
        leftEntropy = rightEntropy = 0;

        double totalL = 0, totalR = 0;
        foreach (KeyValuePair<CharPos, int> pair in neighbours) {
            if (pair.Key.PositionOnRight) {
                totalR += pair.Value;
            } else {
                totalL += pair.Value;
            }
        }

        // A side without any neighbour is treated as unconstrained, so it always
        // passes the threshold; the loop below then never touches that side.
        if (totalL <= 0) {
            leftEntropy = double.MaxValue;
        }
        if (totalR <= 0) {
            rightEntropy = double.MaxValue;
        }

        foreach (KeyValuePair<CharPos, int> pair in neighbours) {
            if (pair.Key.PositionOnRight) {
                double p = (double)pair.Value / totalR;
                rightEntropy -= p * Math.Log(p);
            } else {
                double p = (double)pair.Value / totalL;
                leftEntropy -= p * Math.Log(p);
            }
        }
    }

    /// <summary>
    /// Reads the whole input, builds the candidate list, gathers the neighbouring
    /// characters of each candidate and fills <see cref="FinalWords"/>.
    /// Progress messages are written to the console; <see cref="ProcessOver"/> is
    /// invoked at the end when it is set. The input reader is closed when reading
    /// finishes, so this method is meant to be called once per instance.
    /// </summary>
    public void Process ()
    {
        Console.WriteLine("Reading input...");
        readInput ();

        Console.WriteLine("Building candidate word list...");
        buildCandidates ();

        Console.WriteLine("Preparing word/adjacent character list...");
        collectNeighbours ();

        Console.WriteLine("Calculating word information entropy...");
        selectFinalWords ();

        Console.WriteLine("Done. Writing results.");
        Action callback = ProcessOver;
        if (callback != null) {
            callback.Invoke();
        }
    }

    // Counts the substrings of every input line, then derives their probabilities.
    // The reader is closed even when reading fails, and it is not reopened: nothing
    // reads the file a second time.
    private void readInput ()
    {
        try {
            string line;
            while ((line = sr.ReadLine()) != null) {
                total += addParagraph (line);
            }
        } finally {
            sr.Close ();
        }
        finalizeParagraph ();
    }

    // Registers every substring of length 2..MaxWordLength that is frequent enough and
    // whose probability exceeds PSvPThreshold times its most probable two-part split.
    private void buildCandidates ()
    {
        foreach (KeyValuePair<string, double> pair in ps) {
            string candidate = pair.Key;
            if (candidate.Length < 2 || candidate.Length > MaxWordLength) {
                continue;
            }
            if (freq [candidate] < MinFreq) {
                continue;
            }

            double bestSplit = 0;
            for (int i = 1; i < candidate.Length; ++i) {
                double t = ps [candidate.Substring (0, i)] * ps [candidate.Substring (i)];
                bestSplit = Math.Max (bestSplit, t);
            }

            if (pair.Value / bestSplit > PSvPThreshold) {
                words.Add (candidate, new Dictionary<CharPos, int>());
            }
        }
    }

    // For every counted substring, credits its first/last character as a neighbour of
    // the candidate obtained by stripping that character (or both of them).
    private void collectNeighbours ()
    {
        foreach (KeyValuePair<string, int> entry in freq) {
            string cword = entry.Key;
            if (cword.Length < 2) {
                continue; // nothing is left after stripping a character
            }

            int frq = entry.Value;
            char first = cword [0];
            char last = cword [cword.Length - 1];
            Dictionary<CharPos, int> neighbours;

            // Candidate preceded by `first`.
            if (words.TryGetValue (cword.Substring (1), out neighbours)) {
                addNeighbour (neighbours, first, false, frq);
            }

            // Candidate followed by `last`.
            if (words.TryGetValue (cword.Substring (0, cword.Length - 1), out neighbours)) {
                addNeighbour (neighbours, last, true, frq);
            }

            // Candidate surrounded by `first` and `last`. The length must be Length - 2:
            // with Length - 1 this is the same string as the "preceded by" case above,
            // which double-counted the left neighbour and recorded the candidate's own
            // last character as its right neighbour.
            if (cword.Length > 2
                && words.TryGetValue (cword.Substring (1, cword.Length - 2), out neighbours)) {
                addNeighbour (neighbours, first, false, frq);
                addNeighbour (neighbours, last, true, frq);
            }
        }
    }

    // Adds `count` to the tally of character `c` on the given side.
    private static void addNeighbour (Dictionary<CharPos, int> neighbours, char c, bool onRight, int count)
    {
        CharPos key = new CharPos(c, onRight);
        int current;
        neighbours.TryGetValue (key, out current); // leaves 0 when the key is absent
        neighbours [key] = current + count;
    }

    // Keeps the candidates whose left and right neighbour entropies both reach the threshold.
    private void selectFinalWords ()
    {
        foreach (string word in words.Keys) {
            double leftEntropy, rightEntropy;
            wordInfoEntropy (word, out leftEntropy, out rightEntropy);
            if (leftEntropy >= EntropyThreshold && rightEntropy >= EntropyThreshold) {
                finalWords.Add (word);
            }
        }
    }

    /// <summary>
    /// Splits <paramref name="paragraph"/> with the separator regex and counts, for
    /// every piece of at least two characters, all its substrings of length
    /// 1 to MaxWordLength + 2 (the two extra characters provide the neighbours
    /// of the longest candidates).
    /// </summary>
    /// <returns>The number of substring occurrences counted for this paragraph.</returns>
    private int addParagraph (string paragraph)
    {
        int counted = 0;
        foreach (string sentence in regSplit.Split(paragraph)) {
            if (sentence.Length < 2) {
                continue;
            }
            for (int start = 0; start < sentence.Length; ++start) {
                for (int len = 1; len <= MaxWordLength + 2 && start + len <= sentence.Length; ++len) {
                    string word = sentence.Substring (start, len);
                    int seen;
                    freq.TryGetValue (word, out seen); // leaves 0 for a new substring
                    freq [word] = seen + 1;
                    ++counted;
                }
            }
        }
        return counted;
    }

    // Converts the occurrence counts into probabilities relative to the total count.
    private void finalizeParagraph ()
    {
        foreach (KeyValuePair<string, int> entry in freq) {
            ps [entry.Key] = (double)entry.Value / total;
        }
    }

    /// <summary>Closes the input reader; safe to call after <see cref="Process"/>.</summary>
    public void Close ()
    {
        sr.Close();
    }
}
static WordDetector wordDetector = null; | |
static StreamWriter sw = null; | |
/// <summary>
/// Writes one line per detected word to the output writer, formatted as
/// "word&lt;TAB&gt;frequency". Does nothing when no writer has been opened.
/// </summary>
private static void PrintResults ()
{
    if (sw != null) {
        Dictionary<string, int> counts = wordDetector.Freq;
        foreach (string detected in wordDetector.FinalWords) {
            int occurrences = counts[detected];
            sw.WriteLine("{0}\t{1}", detected, occurrences);
        }
    }
}
#endfunction | |
// Script entry point: worddetector <input> <freq output>
// Analyses the input file and writes "word<TAB>frequency" lines to the output file.
if (args.Length < 2) {
    Console.WriteLine("Usage: worddetector <input> <freq output>");
    return;
}
wordDetector = new WordDetector(args[0]);
try {
    sw = new StreamWriter(args[1]);
    wordDetector.Process();
    PrintResults();
} finally {
    // Release both files even when processing fails. Closing the writer also
    // flushes its buffer, so no separate Flush() call is needed.
    if (sw != null) {
        sw.Close();
    }
    wordDetector.Close();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment