lovemyliwu/数据挖掘及应用测验（二）代码版答案.js

## 数据挖掘及应用测验（二）代码版答案.js
// 基尼指数

/*
>基尼指数度量的是数据集的不纯度，变量x，取值有n种，每一种概率Pi，那么X的基尼指数Gini(x)=1-∑Pi^2
>分类系统中类别就是变量，那么分类系统的基尼指数就是上面的Gini(x)，Pi就是每种类别出现的概率
>特征T有n种取值，当T取Ti的值时，分类系统的基尼指数表示为Gini(x|T=Ti)=1-∑Pi^2，Pi是使用Ti过滤出的分类子集中类别取值的概率
>特征T在分类系统中总的基尼指数Gini(x|T)=Pi*Gini(x|T=Ti)，Pi是Ti的取值概率
*/

var getGiniIndex = function (list) {
    // `list` must be like this:
    // [6, 8] or [1, 2, 3]
    // the length means value types, exp. [1, 2] means there are 2 value types
    // the first type appear 1 time, and the second type appear 2 times
    // Returns gini index
    var timesCount = 0, gini = 1, p;
    for (var idx = 0; idx < list.length; idx++) {
        timesCount += list[idx];
    }
    for (var idx = 0; idx < list.length; idx++) {
        p = list[idx] / timesCount;
        gini -= p * p;
    }
    return gini;
};

var getListGini = function(list) {
    // `list` must be like this:
    // [x1, x2, x1, x1, x2]
    // Returns list gini index
    var valueCount = {}

    , timesList = [];

    for (var idx = 0; idx < list.length; idx++) {
        if (valueCount[list[idx]]) {
            valueCount[list[idx]] += 1;
        } else {
            valueCount[list[idx]] = 1;
        }
    }

    for (var value in valueCount) {
        timesList.push(valueCount[value]);
    }

    return getGiniIndex(timesList);
};

var getCGiniForLastFeature = function(matrix, subIdx) {
    // `matrix` must be like this:
    // [
    //  [1, 1, 2, 3],
    //  [1, 1, 1, 3]
    // ]
    // `subIdx` is the feature idx
    // Returns Conditional Gini Index on the `subIdx` feature
    var cGini = 0, valueCount = {}, value, p, subMatrix, subList, gini;
    for (var idx = 0; idx < matrix.length; idx++) {
        value = matrix[idx][subIdx];
        if (valueCount[value]) {
            valueCount[value] += 1;
        } else {
            valueCount[value] = 1;
        }
    }
    for (value in valueCount) {
        p = valueCount[value] / matrix.length;
        subMatrix = getMatrixByConditional(matrix, subIdx, value);
        subList = getListFromMatrix(subMatrix, subMatrix[0].length - 1);
        gini = getListGini(subList);
        cGini += p * gini;
    }
    return cGini;
};

// 信息增益值

/*
>信息熵就是信息量，变量x，取值有n种，每一种概率Pi，那么X的熵就定义为：H(x)=-∑Pi*log2^Pi
>分类系统中类别就是变量，那么分类系统的信息熵就是上面的H(x)，Pi就是每种类别出现的概率
>特征T有n种取值，当T取Ti的值时，分类系统的信息熵表示为H(x|T=Ti)，Ti出现的概率为Pi，则有条件熵H(x|T)=∑Pi*H(x|Ti)
>特征T的信息增益表示为IG(T)=H(x)-H(x|T)
*/

var getInformationEntropy = function(list) {
    // `list` must be like this:
    // [6, 8] or [1, 2, 3]
    // the length means value types, exp. [1, 2] means there are 2 value types
    // the first type appear 1 time, and the second type appear 2 times
    // Returns information entropy
    var timesCount = 0, ie = 0, p;
    for (var idx = 0; idx < list.length; idx++) {
        timesCount += list[idx];
    }
    for (var idx = 0; idx < list.length; idx++) {
        p = list[idx] / timesCount;
        ie -= p * Math.log2(p);
    }
    return ie;
};

var getListIE = function(list) {
    // `list` must be like this:
    // [x1, x2, x1, x1, x2]
    // Returns list Information Entropy
    var valueCount = {}

    , timesList = [];

    for (var idx = 0; idx < list.length; idx++) {
        if (valueCount[list[idx]]) {
            valueCount[list[idx]] += 1;
        } else {
            valueCount[list[idx]] = 1;
        }
    }

    for (var value in valueCount) {
        timesList.push(valueCount[value]);
    }

    return getInformationEntropy(timesList);
};

var getCIEForLastFeature = function(matrix, subIdx) {
    // `matrix` must be like this:
    // [
    //  [1, 1, 2, 3],
    //  [1, 1, 1, 3]
    // ]
    // `subIdx` is the feature idx
    // Returns Conditional Information Entropy on the `subIdx` feature
    var cIE = 0, valueCount = {}, value, p, subMatrix, subList, ie;
    for (var idx = 0; idx < matrix.length; idx++) {
        value = matrix[idx][subIdx];
        if (valueCount[value]) {
            valueCount[value] += 1;
        } else {
            valueCount[value] = 1;
        }
    }
    for (value in valueCount) {
        p = valueCount[value] / matrix.length;
        subMatrix = getMatrixByConditional(matrix, subIdx, value);
        subList = getListFromMatrix(subMatrix, subMatrix[0].length - 1);
        ie = getListIE(subList);
        cIE += p * ie;
    }
    return cIE;
};

// ===tools===

var getMatrixByConditional = function(matrix, subIdx, value) {
    // `matrix` must be like this:
    // [[1, 1], [1, 0]] which means first feature only got 1 value type:1,
    // and second feature got 2 value type: 1 or 0
    // `subIdx` must be a idx number to fingure out which feature you want to filtering
    // `value` must be a valid type to you specifield feature
    // Returns sub matrix filter by conditional
    var subMatrix = [];
    for (var idx = 0; idx < matrix.length; idx++) {
        if (value == matrix[idx][subIdx]) {
            subMatrix.push(matrix[idx]);
        }
    }
    return subMatrix;
};

var getListFromMatrix = function(matrix, subIdx) {
    // `matrix` must be like this:
    // [
    //  [1, 1, 2, 3],
    //  [1, 1, 1, 3]
    // ]
    // `subIdx` is the number which you want to filtering out from each child list of matrix
    // exp. idx = 2
    // Returns [2, 1]
    var list = [];
    for (var idx = 0; idx < matrix.length; idx++) {
        list.push(matrix[idx][subIdx]);
    }
    return list;
};

// ===test===

var testMatrix = [
[0, 0, 1, 0, 0, 1],
[1, 0, 0, 0, 1, 1],
[0, 1, 1, 1, -1, 0],
[0, 0, 0, 1, 1, 1],
[1, 1, 1, -1, 0, 0],
[0, 1, -1, -1, -1, 0],
[1, 0, -1, 1, -1, 0],
[1, 0, 1, -1, 1, 0],
[0, 1, 0, 0, -1, 0],
[0, 1, -1, -1, 0, 0],
[1, 1, 0, 1, 1, 1],
[1, 1, 1, 0, 0, 1],
[0, 0, 1, -1, -1, 0],
[1, 1, 0, 0, 0, 1]
];

var featureLables = ['年龄', '身高', '长相', '收入', '家庭条件', '是否见面'],
    minCIE = minGini = 1, tmpCIE, tmpGini, rootIdx,
    ie = getListIE(getListFromMatrix(testMatrix, testMatrix[0].length - 1));
console.log('ie:' + ie);
for (var idx = 0; idx < featureLables.length - 1; idx ++) {
    tmpCIE = getCIEForLastFeature(testMatrix, idx);
    console.log('feature:' + featureLables[idx] + ' cie:' + tmpCIE + ' ieOffset:' + (ie - tmpCIE));
    if (tmpCIE < minCIE) {
        minCIE = tmpCIE;
        rootIdx = idx;
    }
}
console.log('base on ieOffset root feature:' + featureLables[rootIdx]);
for (var idx = 0; idx < featureLables.length - 1; idx ++) {
    tmpGini = getCGiniForLastFeature(testMatrix, idx);
    console.log('feature:' + featureLables[idx] + ' gini:' + tmpGini);
    if (tmpGini < minGini) {
        minGini = tmpGini;
        rootIdx = idx;
    }
}
console.log('base on gini root feature:' + featureLables[rootIdx]);

// 贝叶斯分类

/*
>贝叶斯定理，P(Ci|X)= P(X|Ci)*P(Ci)，X被分类为Ci的概率表示为P(Ci|X)，即X中出现Ci的概率，称为后验概率（还未发生的事件概率）
>P(X|Ci)是以Ci过滤出的数据集中X出现的概率，称为先验概率（已经发生的事件概率），P(Ci)是Ci在所有数据集中出现的概率
>贝叶斯分类，当有数据D，特征值F有n种
>则考察Fi时，它的先验概率为P(Fi|Ci)，即以Ci过滤出的数据中Fi出现的概率，综合考察所有特征时有先验概率P(D|Ci)=P(F1|Ci)*P(F2|Ci)***P(Fn|Ci)
>D分类为Ci的后验概率为P(Ci|D)，根据贝叶斯定理，P(Ci|D)=P(D|Ci)*P(Ci)
*/

var getPOfList = function (list) {
    // `list` must be like this:
    // [1, 1, 2, 1, 2]
    // Returns each type p value as a object
    var valueCount = {}, pValueCount = {};
    for (var idx = 0; idx < list.length; idx ++) {
        if (valueCount[list[idx]]) {
            valueCount[list[idx]] += 1;
        } else {
            valueCount[list[idx]] = 1;
        }
    }
    for (var value in valueCount) {
        pValueCount[value] = valueCount[value] / list.length;
    }
    return pValueCount;
};

var getFeaturePOfMatrix = function (matrix, subIdx, value) {
    // `matrix` must be like this:
    // [
    //    [1, 1],
    //    [2, 2]
    //  ]
    // `subIdx` is the idx of feature
    // `value` specifield the feature type of you want computed P
    // Returns P of value appear on this matrix
    var list = getListFromMatrix(matrix, subIdx);
    return getPOfList(list)[value];
};

var typeAValue = 1, typeBValue = 0,
    typeASubMatrix = getMatrixByConditional(testMatrix, testMatrix[0].length - 1, typeAValue),
    typeBSubMatrix = getMatrixByConditional(testMatrix, testMatrix[0].length - 1, typeBValue),
    data = [0, 0, 0, 0, 1, null],
    testMatrixP = getPOfList(getListFromMatrix(testMatrix, testMatrix[0].length - 1)),
    pDataOfTypeA = pDataOfTypeB = 1, pTypeAOfData, pTypeBOfData;

for (var idx = 0; idx < data.length - 1; idx ++) {
    pDataOfTypeA *= getFeaturePOfMatrix(typeASubMatrix, idx, data[idx]);
    pDataOfTypeB *= getFeaturePOfMatrix(typeBSubMatrix, idx, data[idx]);
}

console.log('先验概率P(D|yes)=' + pDataOfTypeA);
console.log('先验概率P(D|no)=' + pDataOfTypeB);
console.log('后验概率P(yes|D)=' + (pTypeAOfData = pDataOfTypeA * testMatrixP[typeAValue]));
console.log('后验概率P(no|D)=' + (pTypeBOfData = pDataOfTypeB * testMatrixP[typeBValue]));
if (pDataOfTypeA > pDataOfTypeB) {
    console.log('base on Bayes Data category to typeAValue: yes');
} else {
    console.log('base on Bayes Data category to typeBValue: no');
}
	// 基尼指数

	/*
	>基尼指数度量的是数据集的不纯度，变量x，取值有n种，每一种概率Pi，那么X的基尼指数Gini(x)=1-∑Pi^2
	>分类系统中类别就是变量，那么分类系统的基尼指数就是上面的Gini(x)，Pi就是每种类别出现的概率
	>特征T有n种取值，当T取Ti的值时，分类系统的基尼指数表示为Gini(x\|T=Ti)=1-∑Pi^2，Pi是使用Ti过滤出的分类子集中类别取值的概率
	>特征T在分类系统中总的基尼指数Gini(x\|T)=Pi*Gini(x\|T=Ti)，Pi是Ti的取值概率
	*/

	var getGiniIndex = function (list) {
	// `list` must be like this:
	// [6, 8] or [1, 2, 3]
	// the length means value types, exp. [1, 2] means there are 2 value types
	// the first type appear 1 time, and the second type appear 2 times
	// Returns gini index
	var timesCount = 0, gini = 1, p;
	for (var idx = 0; idx < list.length; idx++) {
	timesCount += list[idx];
	}
	for (var idx = 0; idx < list.length; idx++) {
	p = list[idx] / timesCount;
	gini -= p * p;
	}
	return gini;
	};

	var getListGini = function(list) {
	// `list` must be like this:
	// [x1, x2, x1, x1, x2]
	// Returns list gini index
	var valueCount = {}

	, timesList = [];

	for (var idx = 0; idx < list.length; idx++) {
	if (valueCount[list[idx]]) {
	valueCount[list[idx]] += 1;
	} else {
	valueCount[list[idx]] = 1;
	}
	}

	for (var value in valueCount) {
	timesList.push(valueCount[value]);
	}

	return getGiniIndex(timesList);
	};

	var getCGiniForLastFeature = function(matrix, subIdx) {
	// `matrix` must be like this:
	// [
	// [1, 1, 2, 3],
	// [1, 1, 1, 3]
	// ]
	// `subIdx` is the feature idx
	// Returns Conditional Gini Index on the `subIdx` feature
	var cGini = 0, valueCount = {}, value, p, subMatrix, subList, gini;
	for (var idx = 0; idx < matrix.length; idx++) {
	value = matrix[idx][subIdx];
	if (valueCount[value]) {
	valueCount[value] += 1;
	} else {
	valueCount[value] = 1;
	}
	}
	for (value in valueCount) {
	p = valueCount[value] / matrix.length;
	subMatrix = getMatrixByConditional(matrix, subIdx, value);
	subList = getListFromMatrix(subMatrix, subMatrix[0].length - 1);
	gini = getListGini(subList);
	cGini += p * gini;
	}
	return cGini;
	};

	// 信息增益值

	/*
	>信息熵就是信息量，变量x，取值有n种，每一种概率Pi，那么X的熵就定义为：H(x)=-∑Pi*log2^Pi
	>分类系统中类别就是变量，那么分类系统的信息熵就是上面的H(x)，Pi就是每种类别出现的概率
	>特征T有n种取值，当T取Ti的值时，分类系统的信息熵表示为H(x\|T=Ti)，Ti出现的概率为Pi，则有条件熵H(x\|T)=∑Pi*H(x\|Ti)
	>特征T的信息增益表示为IG(T)=H(x)-H(x\|T)
	*/

	var getInformationEntropy = function(list) {
	// `list` must be like this:
	// [6, 8] or [1, 2, 3]
	// the length means value types, exp. [1, 2] means there are 2 value types
	// the first type appear 1 time, and the second type appear 2 times
	// Returns information entropy
	var timesCount = 0, ie = 0, p;
	for (var idx = 0; idx < list.length; idx++) {
	timesCount += list[idx];
	}
	for (var idx = 0; idx < list.length; idx++) {
	p = list[idx] / timesCount;
	ie -= p * Math.log2(p);
	}
	return ie;
	};

	var getListIE = function(list) {
	// `list` must be like this:
	// [x1, x2, x1, x1, x2]
	// Returns list Information Entropy
	var valueCount = {}

	, timesList = [];

	for (var idx = 0; idx < list.length; idx++) {
	if (valueCount[list[idx]]) {
	valueCount[list[idx]] += 1;
	} else {
	valueCount[list[idx]] = 1;
	}
	}

	for (var value in valueCount) {
	timesList.push(valueCount[value]);
	}

	return getInformationEntropy(timesList);
	};

	var getCIEForLastFeature = function(matrix, subIdx) {
	// `matrix` must be like this:
	// [
	// [1, 1, 2, 3],
	// [1, 1, 1, 3]
	// ]
	// `subIdx` is the feature idx
	// Returns Conditional Information Entropy on the `subIdx` feature
	var cIE = 0, valueCount = {}, value, p, subMatrix, subList, ie;
	for (var idx = 0; idx < matrix.length; idx++) {
	value = matrix[idx][subIdx];
	if (valueCount[value]) {
	valueCount[value] += 1;
	} else {
	valueCount[value] = 1;
	}
	}
	for (value in valueCount) {
	p = valueCount[value] / matrix.length;
	subMatrix = getMatrixByConditional(matrix, subIdx, value);
	subList = getListFromMatrix(subMatrix, subMatrix[0].length - 1);
	ie = getListIE(subList);
	cIE += p * ie;
	}
	return cIE;
	};

	// ===tools===

	var getMatrixByConditional = function(matrix, subIdx, value) {
	// `matrix` must be like this:
	// [[1, 1], [1, 0]] which means first feature only got 1 value type:1,
	// and second feature got 2 value type: 1 or 0
	// `subIdx` must be a idx number to fingure out which feature you want to filtering
	// `value` must be a valid type to you specifield feature
	// Returns sub matrix filter by conditional
	var subMatrix = [];
	for (var idx = 0; idx < matrix.length; idx++) {
	if (value == matrix[idx][subIdx]) {
	subMatrix.push(matrix[idx]);
	}
	}
	return subMatrix;
	};

	var getListFromMatrix = function(matrix, subIdx) {
	// `matrix` must be like this:
	// [
	// [1, 1, 2, 3],
	// [1, 1, 1, 3]
	// ]
	// `subIdx` is the number which you want to filtering out from each child list of matrix
	// exp. idx = 2
	// Returns [2, 1]
	var list = [];
	for (var idx = 0; idx < matrix.length; idx++) {
	list.push(matrix[idx][subIdx]);
	}
	return list;
	};

	// ===test===

	var testMatrix = [
	[0, 0, 1, 0, 0, 1],
	[1, 0, 0, 0, 1, 1],
	[0, 1, 1, 1, -1, 0],
	[0, 0, 0, 1, 1, 1],
	[1, 1, 1, -1, 0, 0],
	[0, 1, -1, -1, -1, 0],
	[1, 0, -1, 1, -1, 0],
	[1, 0, 1, -1, 1, 0],
	[0, 1, 0, 0, -1, 0],
	[0, 1, -1, -1, 0, 0],
	[1, 1, 0, 1, 1, 1],
	[1, 1, 1, 0, 0, 1],
	[0, 0, 1, -1, -1, 0],
	[1, 1, 0, 0, 0, 1]
	];

	var featureLables = ['年龄', '身高', '长相', '收入', '家庭条件', '是否见面'],
	minCIE = minGini = 1, tmpCIE, tmpGini, rootIdx,
	ie = getListIE(getListFromMatrix(testMatrix, testMatrix[0].length - 1));
	console.log('ie:' + ie);
	for (var idx = 0; idx < featureLables.length - 1; idx ++) {
	tmpCIE = getCIEForLastFeature(testMatrix, idx);
	console.log('feature:' + featureLables[idx] + ' cie:' + tmpCIE + ' ieOffset:' + (ie - tmpCIE));
	if (tmpCIE < minCIE) {
	minCIE = tmpCIE;
	rootIdx = idx;
	}
	}
	console.log('base on ieOffset root feature:' + featureLables[rootIdx]);
	for (var idx = 0; idx < featureLables.length - 1; idx ++) {
	tmpGini = getCGiniForLastFeature(testMatrix, idx);
	console.log('feature:' + featureLables[idx] + ' gini:' + tmpGini);
	if (tmpGini < minGini) {
	minGini = tmpGini;
	rootIdx = idx;
	}
	}
	console.log('base on gini root feature:' + featureLables[rootIdx]);

	// 贝叶斯分类

	/*
	>贝叶斯定理，P(Ci\|X)= P(X\|Ci)*P(Ci)，X被分类为Ci的概率表示为P(Ci\|X)，即X中出现Ci的概率，称为后验概率（还未发生的事件概率）
	>P(X\|Ci)是以Ci过滤出的数据集中X出现的概率，称为先验概率（已经发生的事件概率），P(Ci)是Ci在所有数据集中出现的概率
	>贝叶斯分类，当有数据D，特征值F有n种
	>则考察Fi时，它的先验概率为P(Fi\|Ci)，即以Ci过滤出的数据中Fi出现的概率，综合考察所有特征时有先验概率P(D\|Ci)=P(F1\|Ci)P(F2\|Ci)**P(Fn\|Ci)
	>D分类为Ci的后验概率为P(Ci\|D)，根据贝叶斯定理，P(Ci\|D)=P(D\|Ci)*P(Ci)
	*/

	var getPOfList = function (list) {
	// `list` must be like this:
	// [1, 1, 2, 1, 2]
	// Returns each type p value as a object
	var valueCount = {}, pValueCount = {};
	for (var idx = 0; idx < list.length; idx ++) {
	if (valueCount[list[idx]]) {
	valueCount[list[idx]] += 1;
	} else {
	valueCount[list[idx]] = 1;
	}
	}
	for (var value in valueCount) {
	pValueCount[value] = valueCount[value] / list.length;
	}
	return pValueCount;
	};

	var getFeaturePOfMatrix = function (matrix, subIdx, value) {
	// `matrix` must be like this:
	// [
	// [1, 1],
	// [2, 2]
	// ]
	// `subIdx` is the idx of feature
	// `value` specifield the feature type of you want computed P
	// Returns P of value appear on this matrix
	var list = getListFromMatrix(matrix, subIdx);
	return getPOfList(list)[value];
	};

	var typeAValue = 1, typeBValue = 0,
	typeASubMatrix = getMatrixByConditional(testMatrix, testMatrix[0].length - 1, typeAValue),
	typeBSubMatrix = getMatrixByConditional(testMatrix, testMatrix[0].length - 1, typeBValue),
	data = [0, 0, 0, 0, 1, null],
	testMatrixP = getPOfList(getListFromMatrix(testMatrix, testMatrix[0].length - 1)),
	pDataOfTypeA = pDataOfTypeB = 1, pTypeAOfData, pTypeBOfData;

	for (var idx = 0; idx < data.length - 1; idx ++) {
	pDataOfTypeA *= getFeaturePOfMatrix(typeASubMatrix, idx, data[idx]);
	pDataOfTypeB *= getFeaturePOfMatrix(typeBSubMatrix, idx, data[idx]);
	}

	console.log('先验概率P(D\|yes)=' + pDataOfTypeA);
	console.log('先验概率P(D\|no)=' + pDataOfTypeB);
	console.log('后验概率P(yes\|D)=' + (pTypeAOfData = pDataOfTypeA * testMatrixP[typeAValue]));
	console.log('后验概率P(no\|D)=' + (pTypeBOfData = pDataOfTypeB * testMatrixP[typeBValue]));
	if (pDataOfTypeA > pDataOfTypeB) {
	console.log('base on Bayes Data category to typeAValue: yes');
	} else {
	console.log('base on Bayes Data category to typeBValue: no');
	}