-
-
Save Tedko/1bdb1dc7f8654c98453c42692862d200 to your computer and use it in GitHub Desktop.
Chinese sentence classification w/ Doc2vec
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Title: Sentence classification w/ Doc2vec
# Author: FPSLuozi @Github
# Last updated: Aug 26, 2016
# License: MIT
import jieba | |
import numpy as np | |
import gensim | |
from gensim.models.doc2vec import * | |
import keras | |
from keras.models import Sequential | |
from keras.layers import Dense, Dropout | |
from sklearn.svm import SVC, LinearSVC | |
from sklearn import neighbors | |
# Number of target classes; sizes the one-hot labels and the softmax layer.
num_class = 18

# Load the labelled corpus and segment it with Jieba.
# Every line of "source" is split on whitespace: field 0 is parsed as an
# integer label and field 1 is the text handed to the segmenter.
c = []
with open("source") as f:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in f:
        fields = line.split()
        # A blank line splits to [] and would otherwise crash further down
        # with an IndexError on pair[1], so it is skipped here.
        if fields:
            c.append(fields)

# dictionary holds one [zero_based_label, token_list] entry per sentence.
dictionary = []
for pair in c:
    seg = list(jieba.cut(pair[1]))
    # Labels are shifted down by one so that they start at 0.
    dictionary.append([int(pair[0]) - 1, seg])
# Write the segmented PICC text: one sentence per line, tokens separated by
# single spaces, labels omitted.
with open("source_seg_nolabel.txt", "w") as out:
    for _label, tokens in dictionary:
        # Tokens are joined first and encoded to UTF-8 bytes before writing
        # (Python 2 file semantics).
        out.write(" ".join(tokens).encode('utf8') + "\n")
# Train the Doc2Vec sentence-vector model on the segmented corpus file.
documents = TaggedLineDocument("source_seg_nolabel.txt")
doc2vec_options = dict(size=100, window=8, min_count=1, workers=4)
model = Doc2Vec(documents, **doc2vec_options)
# model.docvecs holds the vector that corresponds to each sentence.
model.save("picc_doc2vec.vec")
# Build the one-hot labels: one row of length num_class per document vector.
num_line = len(model.docvecs)
y_label = []
for i in range(num_line):
    # The ids stored in dictionary were already shifted to start at 0 when
    # the corpus was loaded (int(label) - 1), so they index the row directly.
    # Subtracting one a second time mapped label 1 to index -1, i.e. the
    # last column, leaving the one-hot targets out of step with the integer
    # ids used by the other classifiers.
    index = dictionary[i][0]
    # A negative index would silently wrap around, so reject it explicitly.
    if not 0 <= index < num_class:
        raise ValueError(
            "label %d is outside the %d configured classes" % (index + 1, num_class)
        )
    temp = [0] * num_class
    temp[index] = 1
    y_label.append(temp)
# Build the training / held-out split (e.g. the first 3000 sentences train).
train_num = 3000

# Materialise the document vectors once and slice the result twice.
doc_vectors = np.array(model.docvecs)
X_train = doc_vectors[:train_num]
# The held-out part starts at train_num itself; slicing from train_num + 1
# dropped the sample at index train_num from both splits.
X_test = doc_vectors[train_num:]

# One-hot targets are handed over as 2-D arrays rather than nested Python
# lists, so the network receives a single (samples, classes) target.
Y_train = np.array(y_label[:train_num])
Y_test = np.array(y_label[train_num:])

# Integer class ids (and the token lists) for the scikit-learn classifiers.
Y_actual, Y_text = zip(*dictionary)
Y_train_real = Y_actual[:train_num]
# Train a feed-forward network on the one-hot labels.
model_nn = Sequential()
for layer in (
    Dense(200, input_dim=100),
    Dense(50, input_dim=200),
    Dense(num_class, activation='softmax'),
):
    model_nn.add(layer)

model_nn.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# 30% of the training rows are held back by Keras as a validation split.
fit_options = dict(nb_epoch=40, batch_size=30, verbose=0, validation_split=0.3)
hist = model_nn.fit(X_train, Y_train, **fit_options)
model_nn.save_weights("model_nn.h5", overwrite=True)

# Report the last epoch's "acc" entry from the fit history.
print("NN Accuracy %s" % hist.history["acc"][-1])
# Train a linear SVM on the integer class ids.
clf = LinearSVC()
clf.fit(X_train, Y_train_real)
# NOTE: the score is computed on the same rows the model was fitted on.
svm_train_score = clf.score(X_train, Y_train_real)
print("SVM Accuracy %s" % svm_train_score)
# Train a k-nearest-neighbours classifier; the neighbour count is set to
# num_class and votes are weighted by distance (weights='uniform' is the
# unweighted alternative).
clf2 = neighbors.KNeighborsClassifier(num_class, weights='distance')
clf2.fit(X_train, Y_train_real)
# NOTE: the score is computed on the same rows the model was fitted on.
knn_train_score = clf2.score(X_train, Y_train_real)
print("KNN distance weight Accuracy %s" % knn_train_score)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment