@Tedko
Forked from fpsluozi/clf.py
Created August 30, 2016 05:07
Chinese sentence classification w/ Doc2vec
# Title: Sentence classification w/ Doc2vec
# Author: FPSLuozi @Github
# Last updated: Aug 26, 2016
# License: MIT
import jieba
import numpy as np
import gensim
from gensim.models.doc2vec import *
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.svm import SVC, LinearSVC
from sklearn import neighbors
num_class = 18
# Segment the source text with Jieba.
# Each line of "source" is expected to be "<label> <sentence>".
c = []
with open("source") as f:
    for line in f.readlines():
        c.append(line.split())

dictionary = list()
for pair in c:
    seg = list(jieba.cut(pair[1]))
    dictionary.append([int(pair[0]) - 1, seg])  # store zero-based label with the token list
# Write the segmented PICC text to disk, one sentence per line (no labels)
with open("source_seg_nolabel.txt", "w") as f:
    for pair in dictionary:
        f.write(" ".join(pair[1]).encode('utf8'))
        f.write("\n")
# Build the Doc2Vec sentence-vector model.
# TaggedLineDocument tags each line with its line number, so model.docvecs[i]
# is the vector of the i-th sentence in dictionary.
documents = TaggedLineDocument("source_seg_nolabel.txt")
model = Doc2Vec(documents, size=100, window=8, min_count=1, workers=4)
model.save("picc_doc2vec.vec")  # model.docvecs holds one vector per sentence
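# Minimal sketch: a trained Doc2Vec model can also embed unseen text via
# gensim's infer_vector(), which takes a token list such as the output of
# jieba.cut. Shown here on the first training sentence only.
example_vec = model.infer_vector(dictionary[0][1])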
# Build one-hot labels (dictionary already stores zero-based labels)
num_line = len(model.docvecs)
y_label = []
for i in range(num_line):
    temp = [0] * num_class
    index = dictionary[i][0]
    temp[index] = 1
    y_label.append(temp)
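# Equivalent sketch using Keras' own helper (assuming the Keras 1.x np_utils
# API); kept only as a cross-check, the loop above is what the pipeline uses.
from keras.utils import np_utils
y_label_alt = np_utils.to_categorical([pair[0] for pair in dictionary], num_class)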
# Build the training/test split (e.g. the first 3000 sentences for training)
train_num = 3000
X_train = np.array(model.docvecs)[:train_num]
X_test = np.array(model.docvecs)[train_num:]
Y_train = np.array(y_label[:train_num])
Y_test = np.array(y_label[train_num:])
Y_actual, Y_text = zip(*dictionary)  # integer labels and token lists, in sentence order
Y_train_real = Y_actual[:train_num]
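# Sketch: also keep the integer labels of the held-out sentences, so the
# sklearn classifiers below can be scored on unseen data as well.
Y_test_real = Y_actual[train_num:]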
# Train a small feed-forward NN on the sentence vectors
model_nn = Sequential()
model_nn.add(Dense(200, input_dim=100))
model_nn.add(Dense(50))
model_nn.add(Dense(num_class, activation='softmax'))
model_nn.compile(optimizer='rmsprop',
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
hist = model_nn.fit(X_train, Y_train, nb_epoch=40, batch_size=30, verbose=0, validation_split=0.3)
model_nn.save_weights("model_nn.h5", overwrite=True)
print "NN training accuracy (last epoch)", hist.history["acc"][-1]
# Train a linear SVM on the same vectors
clf = LinearSVC()
clf.fit(X_train, Y_train_real)
print "SVM training accuracy", clf.score(X_train, Y_train_real)
# Train a KNN classifier (here k = num_class neighbors, distance-weighted votes)
clf2 = neighbors.KNeighborsClassifier(num_class, weights='distance')
# clf2 = neighbors.KNeighborsClassifier(num_class, weights='uniform')
clf2.fit(X_train, Y_train_real)
print "KNN distance-weighted training accuracy", clf2.score(X_train, Y_train_real)