Last active
August 29, 2015 14:21
-
-
Save qianlifeng/cd43f829958289adbcea to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
namespace Bayes | |
{ | |
/// <summary>
/// Naive Bayes gender guesser for names, driven by per-character male/female
/// occurrence counts loaded from <c>charfreq.csv</c>.
/// </summary>
/// <remarks>
/// Call <see cref="Initialize"/> once before <see cref="Guess"/>. The class keeps
/// its model in static state and makes no attempt at thread-safety.
/// </remarks>
public class Bayes
{
    // Lookup from a name entry (first CSV column) to its counts. When the CSV
    // contains the same entry twice, the first row wins for lookup.
    private static Dictionary<string, NameFrequency> frequencyByName = new Dictionary<string, NameFrequency>();

    // Column totals over every CSV row (duplicates included). Kept as long so
    // large data sets cannot overflow the running sum.
    private static long totalMaleCount;
    private static long totalFemaleCount;

    /// <summary>
    /// Loads the frequency table from <c>charfreq.csv</c> in the current directory,
    /// replacing any previously loaded table.
    /// </summary>
    /// <remarks>
    /// The first line of the file is skipped; every other non-empty line is read as
    /// <c>name,maleCount,femaleCount</c>. A line with fewer than three columns or a
    /// non-numeric count still throws (IndexOutOfRangeException / FormatException).
    /// The file is available from https://gist.github.com/qianlifeng/c4470544c8f953043ac9
    /// </remarks>
    public static void Initialize()
    {
        var loaded = new Dictionary<string, NameFrequency>();
        long maleTotal = 0;
        long femaleTotal = 0;

        // 'using' guarantees the file handle is released even if parsing throws.
        using (var reader = new StreamReader(File.OpenRead(@"charfreq.csv")))
        {
            // Skip the first line.
            reader.ReadLine();

            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (string.IsNullOrEmpty(line))
                {
                    continue;
                }

                var values = line.Split(',');
                var entry = new NameFrequency(
                    values[0],
                    int.Parse(values[1], System.Globalization.CultureInfo.InvariantCulture),
                    int.Parse(values[2], System.Globalization.CultureInfo.InvariantCulture));

                maleTotal += entry.MaleCount;
                femaleTotal += entry.FeMaleCount;

                if (!loaded.ContainsKey(entry.Name))
                {
                    loaded.Add(entry.Name, entry);
                }
            }
        }

        // Publish only after the whole file parsed, so a failed load leaves the
        // previous table intact and a repeated call does not duplicate rows.
        frequencyByName = loaded;
        totalMaleCount = maleTotal;
        totalFemaleCount = femaleTotal;
    }

    /// <summary>
    /// Guesses the gender of <paramref name="name"/> from the characters that follow
    /// its first character.
    /// </summary>
    /// <param name="name">
    /// Full name. The first character is dropped as the surname, so multi-character
    /// surnames are not handled specially.
    /// </param>
    /// <returns>
    /// The original name, whether the male score is strictly greater than the female
    /// score, and the winning score normalised by the sum of both scores. When both
    /// scores are zero the probability is reported as 0.5.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
    /// <exception cref="InvalidOperationException">No frequency data has been loaded.</exception>
    public static GuessResult Guess(string name)
    {
        if (name == null)
        {
            throw new ArgumentNullException("name");
        }

        long maleTotal = totalMaleCount;
        long femaleTotal = totalFemaleCount;
        long grandTotal = maleTotal + femaleTotal;
        if (grandTotal == 0)
        {
            throw new InvalidOperationException(
                "No frequency data loaded; call Bayes.Initialize() before Bayes.Guess().");
        }

        // P(gender=male | name=本山)
        //   = P(name=本山 | gender=male) * P(gender=male) / P(name=本山)
        //   = P(name has 本 | gender=male) * P(name has 山 | gender=male) * P(gender=male) / P(name=本山)
        // The shared denominator cancels when the two scores are normalised below.

        // Drop the surname so it does not influence the result. Empty and
        // one-character inputs leave no given-name characters; only the priors apply.
        string givenName = name.Length > 1 ? name.Substring(1) : string.Empty;

        double pMale = 1;
        double pFeMale = 1;
        foreach (char c in givenName)
        {
            NameFrequency frequency;
            // Characters missing from the table contribute nothing to either score.
            if (frequencyByName.TryGetValue(c.ToString(), out frequency))
            {
                pMale *= Ratio(frequency.MaleCount, maleTotal);
                pFeMale *= Ratio(frequency.FeMaleCount, femaleTotal);
            }
        }

        // Multiply in the class priors P(gender).
        pMale *= maleTotal / (double)grandTotal;
        pFeMale *= femaleTotal / (double)grandTotal;

        bool isMale = pMale > pFeMale;
        double evidence = pMale + pFeMale;

        return new GuessResult()
        {
            Name = name,
            IsMale = isMale,
            // Guard the 0/0 case (every matched character unseen for both genders).
            Probability = evidence > 0 ? (isMale ? pMale : pFeMale) / evidence : 0.5
        };
    }

    // Returns count / total as a double, or 0 when total is 0 (avoids NaN from 0/0).
    private static double Ratio(int count, long total)
    {
        return total == 0 ? 0d : (double)count / total;
    }
}
/// <summary>
/// One row of the frequency table: a name entry together with how often it was
/// counted for male and for female names.
/// </summary>
public class NameFrequency
{
    /// <summary>The name entry these counts belong to.</summary>
    public string Name { get; set; }

    /// <summary>Number of male occurrences recorded for <see cref="Name"/>.</summary>
    public int MaleCount { get; set; }

    /// <summary>Number of female occurrences recorded for <see cref="Name"/>.</summary>
    public int FeMaleCount { get; set; }

    /// <summary>Creates an empty record; all properties keep their default values.</summary>
    public NameFrequency()
    {
    }

    /// <summary>Creates a record with every property populated.</summary>
    /// <param name="name">Value for <see cref="Name"/>.</param>
    /// <param name="maleCount">Value for <see cref="MaleCount"/>.</param>
    /// <param name="femaleCount">Value for <see cref="FeMaleCount"/>.</param>
    public NameFrequency(string name, int maleCount, int femaleCount)
    {
        this.FeMaleCount = femaleCount;
        this.MaleCount = maleCount;
        this.Name = name;
    }
}
/// <summary>
/// Outcome of a gender guess: the name that was evaluated, the guessed gender and
/// the confidence attached to that guess.
/// </summary>
public class GuessResult
{
    /// <summary>The name the guess was made for.</summary>
    public string Name { get; set; }

    /// <summary>True when the guess is male, false when it is female.</summary>
    public bool IsMale { get; set; }

    /// <summary>Confidence of the guess as a fraction; formatted as a percentage by <see cref="ToString"/>.</summary>
    public double Probability { get; set; }

    /// <summary>
    /// Formats the result as name, gender and probability rounded to a whole percent.
    /// </summary>
    /// <returns>The formatted, human-readable result line.</returns>
    public override string ToString()
    {
        // Scale to percent BEFORE rounding. Rounding to two decimals and then
        // multiplying by 100 reintroduces binary floating-point error
        // (0.57 * 100 == 56.99999999999999), which leaks into the output.
        double percent = Math.Round(Probability * 100);
        return "姓名:" + Name + ",性别:" + (IsMale ? "男性" : "女性") + ",概率:" + percent + "%";
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment