Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save lovemyliwu/13649606b497293daf24d8b60b267dfa to your computer and use it in GitHub Desktop.
Save lovemyliwu/13649606b497293daf24d8b60b267dfa to your computer and use it in GitHub Desktop.
// 基尼指数
/*
>基尼指数度量的是数据集的不纯度,变量x,取值有n种,每一种概率Pi,那么X的基尼指数Gini(x)=1-∑Pi^2
>分类系统中类别就是变量,那么分类系统的基尼指数就是上面的Gini(x),Pi就是每种类别出现的概率
>特征T有n种取值,当T取Ti的值时,分类系统的基尼指数表示为Gini(x|T=Ti)=1-∑Pi^2,Pi是使用Ti过滤出的分类子集中类别取值的概率
>特征T在分类系统中总的基尼指数Gini(x|T)=Pi*Gini(x|T=Ti),Pi是Ti的取值概率
*/
var getGiniIndex = function (list) {
// `list` must be like this:
// [6, 8] or [1, 2, 3]
// the length means value types, exp. [1, 2] means there are 2 value types
// the first type appear 1 time, and the second type appear 2 times
// Returns gini index
var timesCount = 0, gini = 1, p;
for (var idx = 0; idx < list.length; idx++) {
timesCount += list[idx];
}
for (var idx = 0; idx < list.length; idx++) {
p = list[idx] / timesCount;
gini -= p * p;
}
return gini;
};
var getListGini = function(list) {
// `list` must be like this:
// [x1, x2, x1, x1, x2]
// Returns list gini index
var valueCount = {}
, timesList = [];
for (var idx = 0; idx < list.length; idx++) {
if (valueCount[list[idx]]) {
valueCount[list[idx]] += 1;
} else {
valueCount[list[idx]] = 1;
}
}
for (var value in valueCount) {
timesList.push(valueCount[value]);
}
return getGiniIndex(timesList);
};
var getCGiniForLastFeature = function(matrix, subIdx) {
// `matrix` must be like this:
// [
// [1, 1, 2, 3],
// [1, 1, 1, 3]
// ]
// `subIdx` is the feature idx
// Returns Conditional Gini Index on the `subIdx` feature
var cGini = 0, valueCount = {}, value, p, subMatrix, subList, gini;
for (var idx = 0; idx < matrix.length; idx++) {
value = matrix[idx][subIdx];
if (valueCount[value]) {
valueCount[value] += 1;
} else {
valueCount[value] = 1;
}
}
for (value in valueCount) {
p = valueCount[value] / matrix.length;
subMatrix = getMatrixByConditional(matrix, subIdx, value);
subList = getListFromMatrix(subMatrix, subMatrix[0].length - 1);
gini = getListGini(subList);
cGini += p * gini;
}
return cGini;
};
// 信息增益值
/*
>信息熵就是信息量,变量x,取值有n种,每一种概率Pi,那么X的熵就定义为:H(x)=-∑Pi*log2^Pi
>分类系统中类别就是变量,那么分类系统的信息熵就是上面的H(x),Pi就是每种类别出现的概率
>特征T有n种取值,当T取Ti的值时,分类系统的信息熵表示为H(x|T=Ti),Ti出现的概率为Pi,则有条件熵H(x|T)=∑Pi*H(x|Ti)
>特征T的信息增益表示为IG(T)=H(x)-H(x|T)
*/
var getInformationEntropy = function(list) {
// `list` must be like this:
// [6, 8] or [1, 2, 3]
// the length means value types, exp. [1, 2] means there are 2 value types
// the first type appear 1 time, and the second type appear 2 times
// Returns information entropy
var timesCount = 0, ie = 0, p;
for (var idx = 0; idx < list.length; idx++) {
timesCount += list[idx];
}
for (var idx = 0; idx < list.length; idx++) {
p = list[idx] / timesCount;
ie -= p * Math.log2(p);
}
return ie;
};
var getListIE = function(list) {
// `list` must be like this:
// [x1, x2, x1, x1, x2]
// Returns list Information Entropy
var valueCount = {}
, timesList = [];
for (var idx = 0; idx < list.length; idx++) {
if (valueCount[list[idx]]) {
valueCount[list[idx]] += 1;
} else {
valueCount[list[idx]] = 1;
}
}
for (var value in valueCount) {
timesList.push(valueCount[value]);
}
return getInformationEntropy(timesList);
};
var getCIEForLastFeature = function(matrix, subIdx) {
// `matrix` must be like this:
// [
// [1, 1, 2, 3],
// [1, 1, 1, 3]
// ]
// `subIdx` is the feature idx
// Returns Conditional Information Entropy on the `subIdx` feature
var cIE = 0, valueCount = {}, value, p, subMatrix, subList, ie;
for (var idx = 0; idx < matrix.length; idx++) {
value = matrix[idx][subIdx];
if (valueCount[value]) {
valueCount[value] += 1;
} else {
valueCount[value] = 1;
}
}
for (value in valueCount) {
p = valueCount[value] / matrix.length;
subMatrix = getMatrixByConditional(matrix, subIdx, value);
subList = getListFromMatrix(subMatrix, subMatrix[0].length - 1);
ie = getListIE(subList);
cIE += p * ie;
}
return cIE;
};
// ===tools===
var getMatrixByConditional = function(matrix, subIdx, value) {
// `matrix` must be like this:
// [[1, 1], [1, 0]] which means first feature only got 1 value type:1,
// and second feature got 2 value type: 1 or 0
// `subIdx` must be a idx number to fingure out which feature you want to filtering
// `value` must be a valid type to you specifield feature
// Returns sub matrix filter by conditional
var subMatrix = [];
for (var idx = 0; idx < matrix.length; idx++) {
if (value == matrix[idx][subIdx]) {
subMatrix.push(matrix[idx]);
}
}
return subMatrix;
};
var getListFromMatrix = function(matrix, subIdx) {
// `matrix` must be like this:
// [
// [1, 1, 2, 3],
// [1, 1, 1, 3]
// ]
// `subIdx` is the number which you want to filtering out from each child list of matrix
// exp. idx = 2
// Returns [2, 1]
var list = [];
for (var idx = 0; idx < matrix.length; idx++) {
list.push(matrix[idx][subIdx]);
}
return list;
};
// ===test===
var testMatrix = [
[0, 0, 1, 0, 0, 1],
[1, 0, 0, 0, 1, 1],
[0, 1, 1, 1, -1, 0],
[0, 0, 0, 1, 1, 1],
[1, 1, 1, -1, 0, 0],
[0, 1, -1, -1, -1, 0],
[1, 0, -1, 1, -1, 0],
[1, 0, 1, -1, 1, 0],
[0, 1, 0, 0, -1, 0],
[0, 1, -1, -1, 0, 0],
[1, 1, 0, 1, 1, 1],
[1, 1, 1, 0, 0, 1],
[0, 0, 1, -1, -1, 0],
[1, 1, 0, 0, 0, 1]
];
var featureLables = ['年龄', '身高', '长相', '收入', '家庭条件', '是否见面'],
minCIE = minGini = 1, tmpCIE, tmpGini, rootIdx,
ie = getListIE(getListFromMatrix(testMatrix, testMatrix[0].length - 1));
console.log('ie:' + ie);
for (var idx = 0; idx < featureLables.length - 1; idx ++) {
tmpCIE = getCIEForLastFeature(testMatrix, idx);
console.log('feature:' + featureLables[idx] + ' cie:' + tmpCIE + ' ieOffset:' + (ie - tmpCIE));
if (tmpCIE < minCIE) {
minCIE = tmpCIE;
rootIdx = idx;
}
}
console.log('base on ieOffset root feature:' + featureLables[rootIdx]);
for (var idx = 0; idx < featureLables.length - 1; idx ++) {
tmpGini = getCGiniForLastFeature(testMatrix, idx);
console.log('feature:' + featureLables[idx] + ' gini:' + tmpGini);
if (tmpGini < minGini) {
minGini = tmpGini;
rootIdx = idx;
}
}
console.log('base on gini root feature:' + featureLables[rootIdx]);
// 贝叶斯分类
/*
>贝叶斯定理,P(Ci|X)= P(X|Ci)*P(Ci),X被分类为Ci的概率表示为P(Ci|X),即X中出现Ci的概率,称为后验概率(还未发生的事件概率)
>P(X|Ci)是以Ci过滤出的数据集中X出现的概率,称为先验概率(已经发生的事件概率),P(Ci)是Ci在所有数据集中出现的概率
>贝叶斯分类,当有数据D,特征值F有n种
>则考察Fi时,它的先验概率为P(Fi|Ci),即以Ci过滤出的数据中Fi出现的概率,综合考察所有特征时有先验概率P(D|Ci)=P(F1|Ci)*P(F2|Ci)***P(Fn|Ci)
>D分类为Ci的后验概率为P(Ci|D),根据贝叶斯定理,P(Ci|D)=P(D|Ci)*P(Ci)
*/
var getPOfList = function (list) {
// `list` must be like this:
// [1, 1, 2, 1, 2]
// Returns each type p value as a object
var valueCount = {}, pValueCount = {};
for (var idx = 0; idx < list.length; idx ++) {
if (valueCount[list[idx]]) {
valueCount[list[idx]] += 1;
} else {
valueCount[list[idx]] = 1;
}
}
for (var value in valueCount) {
pValueCount[value] = valueCount[value] / list.length;
}
return pValueCount;
};
var getFeaturePOfMatrix = function (matrix, subIdx, value) {
// `matrix` must be like this:
// [
// [1, 1],
// [2, 2]
// ]
// `subIdx` is the idx of feature
// `value` specifield the feature type of you want computed P
// Returns P of value appear on this matrix
var list = getListFromMatrix(matrix, subIdx);
return getPOfList(list)[value];
};
var typeAValue = 1, typeBValue = 0,
typeASubMatrix = getMatrixByConditional(testMatrix, testMatrix[0].length - 1, typeAValue),
typeBSubMatrix = getMatrixByConditional(testMatrix, testMatrix[0].length - 1, typeBValue),
data = [0, 0, 0, 0, 1, null],
testMatrixP = getPOfList(getListFromMatrix(testMatrix, testMatrix[0].length - 1)),
pDataOfTypeA = pDataOfTypeB = 1, pTypeAOfData, pTypeBOfData;
for (var idx = 0; idx < data.length - 1; idx ++) {
pDataOfTypeA *= getFeaturePOfMatrix(typeASubMatrix, idx, data[idx]);
pDataOfTypeB *= getFeaturePOfMatrix(typeBSubMatrix, idx, data[idx]);
}
console.log('先验概率P(D|yes)=' + pDataOfTypeA);
console.log('先验概率P(D|no)=' + pDataOfTypeB);
console.log('后验概率P(yes|D)=' + (pTypeAOfData = pDataOfTypeA * testMatrixP[typeAValue]));
console.log('后验概率P(no|D)=' + (pTypeBOfData = pDataOfTypeB * testMatrixP[typeBValue]));
if (pDataOfTypeA > pDataOfTypeB) {
console.log('base on Bayes Data category to typeAValue: yes');
} else {
console.log('base on Bayes Data category to typeBValue: no');
}
@lovemyliwu
Copy link
Author

第1小题答案:
image

@lovemyliwu
Copy link
Author

第二小题答案:
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment