@ratsgo
Last active May 28, 2017 05:59
import os
import time
import datetime
from tensorflow import flags
import tensorflow as tf
import numpy as np
class TextCNN(object):
    """
    A CNN for text classification.
    Takes pre-embedded (word2vec) inputs, followed by convolutional, average-pooling
    and softmax layers.
    <Parameters>
    - sequence_length: maximum sentence length
    - num_classes: number of classes
    - vocab_size: number of unique words (unused in this version, which takes pre-embedded inputs)
    - embedding_size: dimensionality of the embedding vector for each word
    - filter_sizes: sizes of the convolutional filters (= how many words each filter spans), e.g. "3, 4, 5"
    - num_filters: number of filters per filter size
    - l2_reg_lambda: strength of L2 regularization (applied to the output-layer weights here)
    """

    def __init__(
            self, sequence_length, num_classes, embedding_size,
            filter_sizes, num_filters, l2_reg_lambda=0.0):
        # Placeholders for the pre-embedded input documents, labels and dropout
        self.input_x = tf.placeholder(tf.float32, [None, sequence_length, embedding_size], name="input_x")
        input_x_expanded = tf.expand_dims(self.input_x, -1)  # add a channel dimension for conv2d
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keep track of the L2 regularization loss
        l2_loss = tf.constant(0.0)
        # Create a convolution + average-pooling layer for each filter size
        self.h_outputs = []
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                # Pad (filter_size - 1) extra rows on the time axis so that the VALID convolution
                # below yields exactly one output per token position
                # (see the shape sanity check right after the class definition)
                pad_input = tf.pad(input_x_expanded, [[0, 0], [1, filter_size - 2], [0, 0], [0, 0]],
                                   mode='CONSTANT')
                Wc = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                bc = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    pad_input,
                    Wc,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, bc), name="relu")
                # Average pooling over the outputs
                pooled = tf.nn.avg_pool(
                    h,
                    ksize=[1, sequence_length, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                self.h_outputs.append(h)
                pooled_outputs.append(pooled)
        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            h_drop = tf.nn.dropout(h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            self.finW = tf.get_variable(
                "finW",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            # The output bias is a zero constant, so the class scores are a pure
            # linear combination of the pooled features and finW
            b = tf.constant(0.0, shape=[num_classes], name="b")
            l2_loss += tf.nn.l2_loss(self.finW)
            scores = tf.nn.xw_plus_b(h_drop, self.finW, b, name="scores")
            self.predictions = tf.argmax(scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
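
# Quick sanity check of the padding arithmetic used inside TextCNN (an illustrative
# addition, not part of the original gist): padding the time axis with 1 + (filter_size - 2)
# extra zero rows keeps the VALID convolution output at exactly sequence_length positions,
# so every filter size produces one activation per token.
for _f in (3, 4, 5):
    _padded_len = 30 + 1 + (_f - 2)     # sequence_length 30 plus asymmetric zero padding
    assert _padded_len - _f + 1 == 30   # VALID convolution output length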
# Data loading
import cnn_movie_advanced_padding_tool_word2vec as tool

data_path = 'C:/Users/ratsgo/GoogleDrive/내폴더/textmining/data/eng_review_corpus.csv'  # English
corpus, points = tool.loading_rdata(data_path, minlength=10, eng=True, num=False, punc=False)
# data_path = 'C:/Users/ratsgo/GoogleDrive/내폴더/textmining/data/watcha_movie_review_spacecorrected_noisedeleted.csv'  # Korean
# corpus, points = tool.loading_rdata(data_path, minlength=10, eng=False, num=False, punc=False)
train_idx, _ = tool.get_train_idx(len(corpus), train_prop=1)
word2vec_path = 'C:/textmining/eng_moviereview_word2vec.pickle'  # English
# word2vec_path = 'C:/textmining/moviereview_word2vec.pickle'  # Korean
embedding_model = tool.load_word2vec(word2vec_path)
max_document_length = 30
num_classes = 2
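
# Illustrative sketch of the expected input format (an assumption about the external
# tool module, not taken from it): each document fed to TextCNN.input_x is a
# (max_document_length, embedding_dim) matrix of word2vec vectors, zero-padded to
# max_document_length, paired with a one-hot label over num_classes.
_example_doc = np.zeros((max_document_length, 100), dtype=np.float32)  # 100 = embedding_dim used below
_example_label = np.array([1.0, 0.0], dtype=np.float32)                # one-hot label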
'''
# This block builds the original vocabulary-index inputs (kept for reference)
rawcontents = tool.cut(corpus, cut=3)
raw_x, vocabulary, vocab_size, vocab_processor = tool.make_raw_input(rawcontents, max_document_length)
print('Vocabulary size: %s' % (vocab_size))
y = tool.make_output(points, threshold=2.5)
raw_x_train, raw_x_test, y_train, y_test, train_idx = tool.divide(raw_x, y, train_prop=0.9)
# word2vec_x_train, word2vec_x_test, y_train, y_test, train_idx = tool.divide(word2vec_x, y, train_prop=0.9)
'''
# Model hyperparameters
flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of embedded vector (default: 100)")
flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
flags.DEFINE_float("l2_reg_lambda", 0.1, "L2 regularization lambda (default: 0.1)")

# Training parameters
flags.DEFINE_integer("batch_size", 64, "Batch size (default: 64)")
flags.DEFINE_integer("num_epochs", 3, "Number of training epochs (default: 3)")
flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")

# Misc parameters
flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")
# 3. Train and test the model
with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        cnn = TextCNN(sequence_length=max_document_length,
                      num_classes=num_classes,
                      embedding_size=FLAGS.embedding_dim,
                      filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                      num_filters=FLAGS.num_filters,
                      l2_reg_lambda=FLAGS.l2_reg_lambda)
        '''
        # Alternative construction for the vocabulary-index inputs (kept for reference)
        cnn = TextCNN(sequence_length=raw_x_train.shape[1],
                      num_classes=y_train.shape[1],
                      vocab_size=vocab_size,
                      embedding_size=FLAGS.embedding_dim,
                      filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                      num_filters=FLAGS.num_filters,
                      l2_reg_lambda=FLAGS.l2_reg_lambda)
        '''

        # Define the training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Initialize all variables
        sess.run(tf.global_variables_initializer())
        def train_step(x_batch, y_batch):
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, loss, accuracy = sess.run(
                [train_op, global_step, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

        # Generate batches
        batches = tool.get_batch(list(zip(corpus, points)), FLAGS.batch_size, FLAGS.num_epochs, train_idx,
                                 word2vec=True, max_document_length=max_document_length, word2vec_model=embedding_model)

        # Training loop
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
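
        # The next section scores each token by multiplying its activation map with the
        # final-layer weights (a class-activation-map style interpretation). Shape sketch
        # (an illustrative addition using the defaults above, not part of the original gist):
        _num_sizes = len(FLAGS.filter_sizes.split(","))  # 3 with the default "3,4,5"
        _demo_actmap = np.random.rand(max_document_length, _num_sizes * FLAGS.num_filters)
        _demo_weights = np.random.rand(_num_sizes * FLAGS.num_filters, num_classes)
        assert np.dot(_demo_actmap, _demo_weights).shape == (max_document_length, num_classes)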
        # Store activation map * final-layer weights for each document
        import collections
        filter_sizes = list(map(int, FLAGS.filter_sizes.split(",")))
        fin_weights = sess.run(cnn.finW)
        batches = tool.get_batch(list(zip(corpus, points)), FLAGS.batch_size, 1, train_idx,
                                 word2vec=True, max_document_length=max_document_length, word2vec_model=embedding_model)
        results = []
        doc_idx = 0
        for num, batch in enumerate(batches):
            if num % 100 == 0:
                print("batch {}".format(num))
            x_batch, y_batch = zip(*batch)
            if len(x_batch) == FLAGS.batch_size:
                actmaps, predictions = sess.run([cnn.h_outputs, cnn.predictions],
                                                feed_dict={cnn.input_x: x_batch,
                                                           cnn.input_y: y_batch,
                                                           cnn.dropout_keep_prob: 1.0})
                for batch_idx in range(FLAGS.batch_size):
                    # Concatenate the activation maps of all filter sizes for this document
                    combined_actmap = \
                        np.zeros((max_document_length, len(filter_sizes) * FLAGS.num_filters))
                    start = 0
                    end = FLAGS.num_filters
                    for actmap_idx in range(len(actmaps)):
                        combined_actmap[:, start:end] = \
                            actmaps[actmap_idx][batch_idx].reshape(
                                (max_document_length, FLAGS.num_filters))
                        start += FLAGS.num_filters
                        end += FLAGS.num_filters
                    # Per-token scores for the predicted class
                    batch_result = np.dot(combined_actmap, fin_weights)
                    batch_result = batch_result[:, predictions[batch_idx]]
                    fin_result = collections.OrderedDict()
                    text = corpus[train_idx[doc_idx]].split()
                    for word_idx, score in enumerate(batch_result):
                        if word_idx < len(text):
                            fin_result[text[word_idx]] = score
                        else:
                            fin_result[word_idx] = score  # padding position, keyed by its index
                    preinfo = {'실제값': y_batch[batch_idx], '예측값': predictions[batch_idx]}  # actual / predicted
                    results.append([preinfo, fin_result])
                    doc_idx += 1
        # Mark the top-5 scoring words in each document with stars
        star_results = []
        for result_idx, data in enumerate(results):
            if result_idx % 10000 == 0:
                print("data {}".format(result_idx))
            tmp = sorted(data[1].items(), key=lambda x: x[1], reverse=True)[0:5]  # top-5 (word, score) pairs
            tmp = [score_tuple[0] for score_tuple in tmp]
            star_result = collections.OrderedDict()
            for score_tuple in data[1].items():
                star_result[score_tuple[0]] = ''
            for num in range(len(tmp)):
                star_result[tmp[num]] = '*' * (num + 1)
            star_results.append([data[0], star_result])
        # Build dictionaries of the words that appear in each document's top 5
        positive_dict = collections.defaultdict(int)
        negative_dict = collections.defaultdict(int)
        for result_idx, data in enumerate(results):
            if result_idx % 10000 == 0:
                print("data {}".format(result_idx))
            tmp = sorted(data[1].items(), key=lambda x: x[1], reverse=True)[0:5]  # top-5 (word, score) pairs
            tmp = [score_tuple[0] for score_tuple in tmp]
            if data[0]['예측값'] == 0:  # documents predicted as class 0 (treated as positive here)
                for num in range(len(tmp)):
                    if type(tmp[num]) == int:  # skip padding positions keyed by index
                        continue
                    positive_dict[tmp[num]] += 1
            else:
                for num in range(len(tmp)):
                    if type(tmp[num]) == int:
                        continue
                    negative_dict[tmp[num]] += 1
        positive_dict = sorted(positive_dict.items(), key=lambda x: x[1], reverse=True)
        negative_dict = sorted(negative_dict.items(), key=lambda x: x[1], reverse=True)
        # Save the results
        import pickle
        with open('result_epoch3_minleng10_maxleng30_cut3_word2vec_english.pickle', 'wb') as f:
            pickle.dump(results, f)
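
        # To inspect the saved results later (illustrative usage, not part of the original gist):
        #   with open('result_epoch3_minleng10_maxleng30_cut3_word2vec_english.pickle', 'rb') as f:
        #       results = pickle.load(f)
        #   preinfo, word_scores = results[0]  # label/prediction info and per-word saliency scores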