Skip to content

Instantly share code, notes, and snippets.

@zhuth
Created September 20, 2012 12:37
Show Gist options
  • Save zhuth/3755629 to your computer and use it in GitHub Desktop.
Save zhuth/3755629 to your computer and use it in GitHub Desktop.
Use a Bayesian average to find hot words from frequency data.
// written in CsSC
#reference System.Core.dll;
#using System.Linq;
// Pass 1: build the corpus-wide frequency table.
// Every *.fq file is read as one "word<TAB>frequency" record per line; the
// frequencies of the same word are summed across all files into total_words.
var names = Directory.GetFiles("D:\\temp\\fq\\", "*.fq");
var words = new Dictionary<string, double>();    // per-file counts, rebuilt for each file in the second pass
var ps = new Dictionary<string, double>();       // per-file scores, rebuilt for each file in the second pass
var total_words = new Dictionary<string, int>(); // word -> frequency summed over every input file
int fid = 0;                                     // file index; the second pass resets it before use
Console.WriteLine("Reading...");
foreach (string name in names) {
    Console.WriteLine(" " + name);
    foreach (string line in File.ReadAllLines(name)) {
        string[] cols = line.Split('\t');

        // Skip records without a frequency column and words shorter than two characters.
        if (cols.Length < 2 || cols[0].Length < 2) continue;

        // Skip records whose frequency is not a valid integer instead of aborting
        // the whole run. TryParse accepts exactly the inputs int.Parse accepts, so
        // well-formed files produce the same table as before.
        int freq;
        if (!int.TryParse(cols[1], out freq)) continue;

        // Single lookup: 'running' is left at 0 when the word has not been seen yet.
        int running;
        total_words.TryGetValue(cols[0], out running);
        total_words[cols[0]] = running + freq;
    }
}
// Pass 2: for every input file, score each of its words with a Bayesian average
// of the ratio (frequency in this file) / (frequency over all files), then write
// the 30 highest-scoring ("hot") words of that file on one line of bavg.txt.
using (var sw = new StreamWriter(@"D:\Temp\bavg.txt")) {
    for (fid = 0; fid < names.Length; ++fid) {
        Console.WriteLine(".." + names[fid]);

        // Per-file frequency table, filtered the same way as the corpus-wide table.
        words = new Dictionary<string, double>();
        foreach (string line in File.ReadAllLines(names[fid])) {
            string[] cols = line.Split('\t');
            if (cols.Length < 2 || cols[0].Length < 2) continue;

            // Skip records whose frequency is not a valid integer rather than crashing.
            int freq;
            if (!int.TryParse(cols[1], out freq)) continue;

            // Single lookup: 'seen' is left at 0.0 when the word is new to this file.
            double seen;
            words.TryGetValue(cols[0], out seen);
            words[cols[0]] = seen + freq;
        }

        sw.WriteLine("* " + names[fid]);
        ps = new Dictionary<string, double>();

        // A file with no usable records has nothing to rank; emit the empty result
        // line and avoid dividing by a zero word count below.
        if (words.Count == 0) {
            sw.WriteLine();
            continue;
        }

        // Prior for the Bayesian average, taken over the words of this file:
        //   prior_weight = mean corpus-wide frequency of those words
        //   prior_mean   = mean of (frequency in this file / corpus-wide frequency)
        double prior_weight = 0.0, prior_mean = 0.0;
        foreach (KeyValuePair<string, double> pair in words) {
            prior_weight += total_words[pair.Key];
            prior_mean += pair.Value / total_words[pair.Key];
        }
        prior_weight /= words.Count;
        prior_mean /= words.Count;
        double pseudo_count = prior_weight * prior_mean;

        // score = (file frequency + prior_weight * prior_mean) / (corpus frequency + prior_weight)
        foreach (KeyValuePair<string, double> pair in words) {
            ps.Add(pair.Key, (pair.Value + pseudo_count) / (total_words[pair.Key] + prior_weight));
        }

        // Rank by the Bayesian score, highest first. The previous code sorted the raw
        // per-file counts in ascending order and never read the scores, so it listed
        // the 30 least frequent words instead of the hot ones.
        var hot_words = (from entry in ps orderby entry.Value descending select entry.Key).Take(30);
        foreach (string word in hot_words) {
            sw.Write(word + " ");
        }
        sw.WriteLine();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment