Created
September 20, 2012 12:37
-
-
Save zhuth/3755629 to your computer and use it in GitHub Desktop.
Use a Bayesian average to find hot words in word-frequency data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// written in CsSC
//
// Finds the "hot" words of each word-frequency file by ranking them with a
// Bayesian average.
//
// Input:  every *.fq file in D:\temp\fq\, one "word<TAB>frequency" pair per line.
// Output: D:\Temp\bavg.txt. For each input file it holds a "* <file>" header
//         line followed by one line with up to 30 of that file's highest-scoring
//         words, best first, each followed by a single space.
//
// Throws FormatException / OverflowException (from int.Parse) when the second
// column of a line is not a valid integer.
#reference System.Core.dll;
#using System.Linq;

var names = Directory.GetFiles("D:\\temp\\fq\\", "*.fq");

// Pass 1: corpus-wide frequency of every word, summed over all files.
var total_words = new Dictionary<string, int>();
Console.WriteLine("Reading...");
foreach (string name in names) {
    Console.WriteLine(" " + name);
    foreach (string line in File.ReadAllLines(name)) {
        string[] cols = line.Split('\t');
        if (cols.Length < 2) continue;     // not a "word<TAB>frequency" line
        if (cols[0].Length < 2) continue;  // words shorter than 2 chars are ignored
        string word = cols[0]; int freq = int.Parse(cols[1]);
        int seen;
        total_words.TryGetValue(word, out seen);  // seen stays 0 for a new word
        total_words[word] = seen + freq;
    }
}

// Pass 2: score every file's words against the corpus and write the top ones.
using (var sw = new StreamWriter(@"D:\Temp\bavg.txt")) {
    for (int fid = 0; fid < names.Length; ++fid) {
        Console.WriteLine(".." + names[fid]);

        // Frequency of every word within this file alone (same filters as pass 1,
        // so every key here is guaranteed to exist in total_words).
        var words = new Dictionary<string, double>();
        foreach (string line in File.ReadAllLines(names[fid])) {
            string[] cols = line.Split('\t');
            if (cols.Length < 2) continue;
            if (cols[0].Length < 2) continue;
            string word = cols[0]; int freq = int.Parse(cols[1]);
            double seen;
            words.TryGetValue(word, out seen);
            words[word] = seen + freq;
        }
        sw.WriteLine("* " + names[fid]);

        // Priors for the Bayesian average, taken over this file's distinct words:
        //   total = mean corpus-wide frequency (the weight of the prior)
        //   avg   = mean share of a word's corpus frequency that falls in this file
        // For a file with no usable lines cnt is 0 and both become NaN, which is
        // harmless because there are then no words to score.
        double cnt = 0.0, total = 0.0, avg = 0.0;
        foreach (KeyValuePair<string, double> pair in words) {
            cnt++;
            total += total_words[pair.Key];
            avg += pair.Value / total_words[pair.Key];
        }
        total /= cnt;
        avg /= cnt;
        var avg_times_total = total * avg;

        // Bayesian average: the word's in-file share, pulled towards the file-wide
        // mean share so that rare words cannot win on a handful of occurrences.
        var ps = new Dictionary<string, double>();
        foreach (KeyValuePair<string, double> pair in words) {
            ps.Add(pair.Key, (pair.Value + avg_times_total) / (total_words[pair.Key] + total));
        }

        // Rank by the Bayesian score, highest first. (Ranking the raw counts in
        // ascending order would list the file's rarest words and ignore ps.)
        var word_list = (from scored in ps
                         orderby scored.Value descending
                         select scored.Key).Take(30);
        foreach (string hot_word in word_list) {
            sw.Write(hot_word + " ");
        }
        sw.WriteLine();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment