Last active
April 9, 2016 05:06
-
-
Save lovemyliwu/13649606b497293daf24d8b60b267dfa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// 基尼指数 | |
/* | |
>基尼指数度量的是数据集的不纯度,变量x,取值有n种,每一种概率Pi,那么X的基尼指数Gini(x)=1-∑Pi^2 | |
>分类系统中类别就是变量,那么分类系统的基尼指数就是上面的Gini(x),Pi就是每种类别出现的概率 | |
>特征T有n种取值,当T取Ti的值时,分类系统的基尼指数表示为Gini(x|T=Ti)=1-∑Pi^2,Pi是使用Ti过滤出的分类子集中类别取值的概率 | |
>特征T在分类系统中总的基尼指数Gini(x|T)=Pi*Gini(x|T=Ti),Pi是Ti的取值概率 | |
*/ | |
var getGiniIndex = function (list) { | |
// `list` must be like this: | |
// [6, 8] or [1, 2, 3] | |
// the length means value types, exp. [1, 2] means there are 2 value types | |
// the first type appear 1 time, and the second type appear 2 times | |
// Returns gini index | |
var timesCount = 0, gini = 1, p; | |
for (var idx = 0; idx < list.length; idx++) { | |
timesCount += list[idx]; | |
} | |
for (var idx = 0; idx < list.length; idx++) { | |
p = list[idx] / timesCount; | |
gini -= p * p; | |
} | |
return gini; | |
}; | |
var getListGini = function(list) { | |
// `list` must be like this: | |
// [x1, x2, x1, x1, x2] | |
// Returns list gini index | |
var valueCount = {} | |
, timesList = []; | |
for (var idx = 0; idx < list.length; idx++) { | |
if (valueCount[list[idx]]) { | |
valueCount[list[idx]] += 1; | |
} else { | |
valueCount[list[idx]] = 1; | |
} | |
} | |
for (var value in valueCount) { | |
timesList.push(valueCount[value]); | |
} | |
return getGiniIndex(timesList); | |
}; | |
var getCGiniForLastFeature = function(matrix, subIdx) { | |
// `matrix` must be like this: | |
// [ | |
// [1, 1, 2, 3], | |
// [1, 1, 1, 3] | |
// ] | |
// `subIdx` is the feature idx | |
// Returns Conditional Gini Index on the `subIdx` feature | |
var cGini = 0, valueCount = {}, value, p, subMatrix, subList, gini; | |
for (var idx = 0; idx < matrix.length; idx++) { | |
value = matrix[idx][subIdx]; | |
if (valueCount[value]) { | |
valueCount[value] += 1; | |
} else { | |
valueCount[value] = 1; | |
} | |
} | |
for (value in valueCount) { | |
p = valueCount[value] / matrix.length; | |
subMatrix = getMatrixByConditional(matrix, subIdx, value); | |
subList = getListFromMatrix(subMatrix, subMatrix[0].length - 1); | |
gini = getListGini(subList); | |
cGini += p * gini; | |
} | |
return cGini; | |
}; | |
// 信息增益值 | |
/* | |
>信息熵就是信息量,变量x,取值有n种,每一种概率Pi,那么X的熵就定义为:H(x)=-∑Pi*log2^Pi | |
>分类系统中类别就是变量,那么分类系统的信息熵就是上面的H(x),Pi就是每种类别出现的概率 | |
>特征T有n种取值,当T取Ti的值时,分类系统的信息熵表示为H(x|T=Ti),Ti出现的概率为Pi,则有条件熵H(x|T)=∑Pi*H(x|Ti) | |
>特征T的信息增益表示为IG(T)=H(x)-H(x|T) | |
*/ | |
var getInformationEntropy = function(list) { | |
// `list` must be like this: | |
// [6, 8] or [1, 2, 3] | |
// the length means value types, exp. [1, 2] means there are 2 value types | |
// the first type appear 1 time, and the second type appear 2 times | |
// Returns information entropy | |
var timesCount = 0, ie = 0, p; | |
for (var idx = 0; idx < list.length; idx++) { | |
timesCount += list[idx]; | |
} | |
for (var idx = 0; idx < list.length; idx++) { | |
p = list[idx] / timesCount; | |
ie -= p * Math.log2(p); | |
} | |
return ie; | |
}; | |
var getListIE = function(list) { | |
// `list` must be like this: | |
// [x1, x2, x1, x1, x2] | |
// Returns list Information Entropy | |
var valueCount = {} | |
, timesList = []; | |
for (var idx = 0; idx < list.length; idx++) { | |
if (valueCount[list[idx]]) { | |
valueCount[list[idx]] += 1; | |
} else { | |
valueCount[list[idx]] = 1; | |
} | |
} | |
for (var value in valueCount) { | |
timesList.push(valueCount[value]); | |
} | |
return getInformationEntropy(timesList); | |
}; | |
var getCIEForLastFeature = function(matrix, subIdx) { | |
// `matrix` must be like this: | |
// [ | |
// [1, 1, 2, 3], | |
// [1, 1, 1, 3] | |
// ] | |
// `subIdx` is the feature idx | |
// Returns Conditional Information Entropy on the `subIdx` feature | |
var cIE = 0, valueCount = {}, value, p, subMatrix, subList, ie; | |
for (var idx = 0; idx < matrix.length; idx++) { | |
value = matrix[idx][subIdx]; | |
if (valueCount[value]) { | |
valueCount[value] += 1; | |
} else { | |
valueCount[value] = 1; | |
} | |
} | |
for (value in valueCount) { | |
p = valueCount[value] / matrix.length; | |
subMatrix = getMatrixByConditional(matrix, subIdx, value); | |
subList = getListFromMatrix(subMatrix, subMatrix[0].length - 1); | |
ie = getListIE(subList); | |
cIE += p * ie; | |
} | |
return cIE; | |
}; | |
// ===tools=== | |
var getMatrixByConditional = function(matrix, subIdx, value) { | |
// `matrix` must be like this: | |
// [[1, 1], [1, 0]] which means first feature only got 1 value type:1, | |
// and second feature got 2 value type: 1 or 0 | |
// `subIdx` must be a idx number to fingure out which feature you want to filtering | |
// `value` must be a valid type to you specifield feature | |
// Returns sub matrix filter by conditional | |
var subMatrix = []; | |
for (var idx = 0; idx < matrix.length; idx++) { | |
if (value == matrix[idx][subIdx]) { | |
subMatrix.push(matrix[idx]); | |
} | |
} | |
return subMatrix; | |
}; | |
var getListFromMatrix = function(matrix, subIdx) { | |
// `matrix` must be like this: | |
// [ | |
// [1, 1, 2, 3], | |
// [1, 1, 1, 3] | |
// ] | |
// `subIdx` is the number which you want to filtering out from each child list of matrix | |
// exp. idx = 2 | |
// Returns [2, 1] | |
var list = []; | |
for (var idx = 0; idx < matrix.length; idx++) { | |
list.push(matrix[idx][subIdx]); | |
} | |
return list; | |
}; | |
// ===test=== | |
var testMatrix = [ | |
[0, 0, 1, 0, 0, 1], | |
[1, 0, 0, 0, 1, 1], | |
[0, 1, 1, 1, -1, 0], | |
[0, 0, 0, 1, 1, 1], | |
[1, 1, 1, -1, 0, 0], | |
[0, 1, -1, -1, -1, 0], | |
[1, 0, -1, 1, -1, 0], | |
[1, 0, 1, -1, 1, 0], | |
[0, 1, 0, 0, -1, 0], | |
[0, 1, -1, -1, 0, 0], | |
[1, 1, 0, 1, 1, 1], | |
[1, 1, 1, 0, 0, 1], | |
[0, 0, 1, -1, -1, 0], | |
[1, 1, 0, 0, 0, 1] | |
]; | |
var featureLables = ['年龄', '身高', '长相', '收入', '家庭条件', '是否见面'], | |
minCIE = minGini = 1, tmpCIE, tmpGini, rootIdx, | |
ie = getListIE(getListFromMatrix(testMatrix, testMatrix[0].length - 1)); | |
console.log('ie:' + ie); | |
for (var idx = 0; idx < featureLables.length - 1; idx ++) { | |
tmpCIE = getCIEForLastFeature(testMatrix, idx); | |
console.log('feature:' + featureLables[idx] + ' cie:' + tmpCIE + ' ieOffset:' + (ie - tmpCIE)); | |
if (tmpCIE < minCIE) { | |
minCIE = tmpCIE; | |
rootIdx = idx; | |
} | |
} | |
console.log('base on ieOffset root feature:' + featureLables[rootIdx]); | |
for (var idx = 0; idx < featureLables.length - 1; idx ++) { | |
tmpGini = getCGiniForLastFeature(testMatrix, idx); | |
console.log('feature:' + featureLables[idx] + ' gini:' + tmpGini); | |
if (tmpGini < minGini) { | |
minGini = tmpGini; | |
rootIdx = idx; | |
} | |
} | |
console.log('base on gini root feature:' + featureLables[rootIdx]); | |
// 贝叶斯分类 | |
/* | |
>贝叶斯定理,P(Ci|X)= P(X|Ci)*P(Ci),X被分类为Ci的概率表示为P(Ci|X),即X中出现Ci的概率,称为后验概率(还未发生的事件概率) | |
>P(X|Ci)是以Ci过滤出的数据集中X出现的概率,称为先验概率(已经发生的事件概率),P(Ci)是Ci在所有数据集中出现的概率 | |
>贝叶斯分类,当有数据D,特征值F有n种 | |
>则考察Fi时,它的先验概率为P(Fi|Ci),即以Ci过滤出的数据中Fi出现的概率,综合考察所有特征时有先验概率P(D|Ci)=P(F1|Ci)*P(F2|Ci)***P(Fn|Ci) | |
>D分类为Ci的后验概率为P(Ci|D),根据贝叶斯定理,P(Ci|D)=P(D|Ci)*P(Ci) | |
*/ | |
var getPOfList = function (list) { | |
// `list` must be like this: | |
// [1, 1, 2, 1, 2] | |
// Returns each type p value as a object | |
var valueCount = {}, pValueCount = {}; | |
for (var idx = 0; idx < list.length; idx ++) { | |
if (valueCount[list[idx]]) { | |
valueCount[list[idx]] += 1; | |
} else { | |
valueCount[list[idx]] = 1; | |
} | |
} | |
for (var value in valueCount) { | |
pValueCount[value] = valueCount[value] / list.length; | |
} | |
return pValueCount; | |
}; | |
var getFeaturePOfMatrix = function (matrix, subIdx, value) { | |
// `matrix` must be like this: | |
// [ | |
// [1, 1], | |
// [2, 2] | |
// ] | |
// `subIdx` is the idx of feature | |
// `value` specifield the feature type of you want computed P | |
// Returns P of value appear on this matrix | |
var list = getListFromMatrix(matrix, subIdx); | |
return getPOfList(list)[value]; | |
}; | |
var typeAValue = 1, typeBValue = 0, | |
typeASubMatrix = getMatrixByConditional(testMatrix, testMatrix[0].length - 1, typeAValue), | |
typeBSubMatrix = getMatrixByConditional(testMatrix, testMatrix[0].length - 1, typeBValue), | |
data = [0, 0, 0, 0, 1, null], | |
testMatrixP = getPOfList(getListFromMatrix(testMatrix, testMatrix[0].length - 1)), | |
pDataOfTypeA = pDataOfTypeB = 1, pTypeAOfData, pTypeBOfData; | |
for (var idx = 0; idx < data.length - 1; idx ++) { | |
pDataOfTypeA *= getFeaturePOfMatrix(typeASubMatrix, idx, data[idx]); | |
pDataOfTypeB *= getFeaturePOfMatrix(typeBSubMatrix, idx, data[idx]); | |
} | |
console.log('先验概率P(D|yes)=' + pDataOfTypeA); | |
console.log('先验概率P(D|no)=' + pDataOfTypeB); | |
console.log('后验概率P(yes|D)=' + (pTypeAOfData = pDataOfTypeA * testMatrixP[typeAValue])); | |
console.log('后验概率P(no|D)=' + (pTypeBOfData = pDataOfTypeB * testMatrixP[typeBValue])); | |
if (pDataOfTypeA > pDataOfTypeB) { | |
console.log('base on Bayes Data category to typeAValue: yes'); | |
} else { | |
console.log('base on Bayes Data category to typeBValue: no'); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
第二小题答案: