Created March 25, 2014 06:00
Prediction examination: predict review helpfulness with different classifiers to find the best one, then test each feature subset's prediction performance to identify the features that most influence review helpfulness.
#! /usr/bin/env python2.7
#coding=utf-8
"""
Use scikit-learn to test different classifiers' review helpfulness prediction performance, and to test different feature subsets' prediction performance.
This module is the last part of the review helpfulness prediction research.
"""
import numpy as np
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn import cross_validation
from sklearn.metrics import f1_score, precision_score, recall_score
# 1. Load data
def read_data(datapath):
    f = open(datapath)
    f.readline()                   # Skip the one-line header
    data = np.loadtxt(f)           # Parse the remaining whitespace-separated rows
    f.close()
    return data

data = read_data("D:/code/machine learning/feature.txt")
np.random.shuffle(data)            # Shuffle rows in place; random.shuffle from the stdlib can corrupt 2D numpy arrays
helpfulness_target = data[:, 0]    # First column of the dataset is the review helpfulness label
helpfulness_feature = data[:, 1:]  # The rest of the dataset is the review helpfulness features
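# Note (assumption, inferred from the column slices in section 2 below): feature.txt
# appears to hold 21 whitespace-separated columns per row after a one-line header --
# column 0 is the helpfulness label; columns 1-3 and 20 look like informative features,
# 4-9 linguistic, 10-11 difference, and 12-19 sentiment features.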
# 2. Feature subsets
# linguistic = data[:, 4:10]
# informative = np.hstack((data[:, 1:4], data[:, 20:21]))
# difference = data[:, 10:12]
# sentiment = data[:, 12:20]
# IDS = np.hstack((data[:, 1:4], data[:, 10:21]))
# LIS = np.hstack((data[:, 1:10], data[:, 12:21]))
# LDS = data[:, 4:20]
# LID = np.hstack((data[:, 1:12], data[:, 20:21]))
# LI = np.hstack((data[:, 1:10], data[:, 20:21]))
# LD = data[:, 4:12]
# LS = np.hstack((data[:, 4:10], data[:, 12:20]))
# ID = np.hstack((data[:, 1:4], data[:, 10:12], data[:, 20:21]))
# IS = np.hstack((data[:, 1:4], data[:, 12:21]))
# DS = data[:, 10:20]
# L1 = data[:, 4:7]
# L2 = data[:, 7:10]
# S1 = data[:, 12:14]
# S2 = data[:, 14:16]
# S3 = data[:, 16:18]
# S4 = data[:, 18:20]
# Sentiment feature subsets
# S12 = data[:, 12:16]
# S13 = np.hstack((data[:, 12:14], data[:, 16:18]))
# S14 = np.hstack((data[:, 12:14], data[:, 18:20]))
# S23 = data[:, 14:18]
# S24 = np.hstack((data[:, 14:16], data[:, 18:20]))
# S34 = data[:, 16:20]
# SP = np.hstack((data[:, 12:13], data[:, 14:15], data[:, 16:17], data[:, 18:19]))
# SN = np.hstack((data[:, 13:14], data[:, 15:16], data[:, 17:18], data[:, 19:20]))
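# To test a feature subset, uncomment its line above and substitute it for
# helpfulness_feature in sections 4-5 below (see the usage sketch at the end
# of this file).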
# 3. Load classifier
# 3.1 Classifiers for binary classification
clf = svm.SVC(gamma=0.001, C=100.)
# clf = svm.SVR()
# clf = LogisticRegression(penalty='l1', tol=0.01)
# clf = tree.DecisionTreeClassifier()
# clf = GaussianNB()
# clf = BernoulliNB()
# clf = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_split=1, random_state=0)
# 3.2 Classifiers for multi-class classification
# clf = OneVsOneClassifier(svm.SVC(gamma=0.001, C=100.))
# clf = OneVsOneClassifier(svm.SVR())
# clf = OneVsRestClassifier(LogisticRegression(penalty='l1', tol=0.01))
# 4. Cross-validate the classifier's accuracy
k_fold = cross_validation.KFold(len(helpfulness_feature), n_folds=10)
clf_accuracy = cross_validation.cross_val_score(clf, helpfulness_feature, helpfulness_target, cv=k_fold)
print clf_accuracy.mean()
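# Sketch (not in the original gist): compare several of the section-3 classifiers
# in one pass instead of uncommenting them one at a time; parameters mirror those
# used above.
# for name, candidate in [('SVM', svm.SVC(gamma=0.001, C=100.)),
#                         ('LogReg', LogisticRegression(penalty='l1', tol=0.01)),
#                         ('Tree', tree.DecisionTreeClassifier()),
#                         ('GaussianNB', GaussianNB()),
#                         ('RandomForest', RandomForestClassifier(n_estimators=20, random_state=0))]:
#     scores = cross_validation.cross_val_score(candidate, helpfulness_feature, helpfulness_target, cv=k_fold)
#     print name, scores.mean()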
# 5. Cross-validate all metrics, including precision, recall and F1 measure (macro, micro)
def metric_evaluation(feature, target):
    k_fold = cross_validation.KFold(len(feature), n_folds=10)  # 10-fold cross validation
    metric = []
    for train, test in k_fold:
        target_pred = clf.fit(feature[train], target[train]).predict(feature[test])
        p = precision_score(target[test], target_pred)
        r = recall_score(target[test], target_pred)
        f1_macro = f1_score(target[test], target_pred, average='macro')
        f1_micro = f1_score(target[test], target_pred, average='micro')
        metric.append([p, r, f1_macro, f1_micro])
    metric_array = np.array(metric)
    print np.mean(metric_array[:, 0])  # Precision score
    print np.mean(metric_array[:, 1])  # Recall score
    print np.mean(metric_array[:, 2])  # F1-macro score
    print np.mean(metric_array[:, 3])  # F1-micro score
# Testing
metric_evaluation(helpfulness_feature, helpfulness_target)
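# Usage sketch (assumes the `sentiment` subset in section 2 has been uncommented):
# comparing these scores against the full-feature run above shows how much the
# sentiment features alone contribute to helpfulness prediction.
# metric_evaluation(sentiment, helpfulness_target)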